324 files changed, 10216 insertions, 6881 deletions
diff --git a/configure.ac b/configure.ac
index 6ff50abda3d..f236dad6441 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1579,6 +1579,8 @@ fi
 AM_CONDITIONAL(HAVE_EGL, test "x$enable_egl" = xyes)
 AC_SUBST([EGL_LIB_DEPS])
 
+gallium_st="mesa"
+
 dnl
 dnl XA configuration
 dnl
@@ -1592,6 +1594,7 @@ if test "x$enable_xa" = xyes; then
           Example: ./configure --enable-xa --with-gallium-drivers=svga...])
     fi
     enable_gallium_loader=$enable_shared_pipe_drivers
+    gallium_st="$gallium_st xa"
 fi
 AM_CONDITIONAL(HAVE_ST_XA, test "x$enable_xa" = xyes)
 
@@ -1637,24 +1640,28 @@ AM_CONDITIONAL(NEED_GALLIUM_VL_WINSYS, test "x$need_gallium_vl_winsys" = xyes)
 if test "x$enable_xvmc" = xyes; then
     PKG_CHECK_MODULES([XVMC], [xvmc >= $XVMC_REQUIRED])
     enable_gallium_loader=$enable_shared_pipe_drivers
+    gallium_st="$gallium_st xvmc"
 fi
 AM_CONDITIONAL(HAVE_ST_XVMC, test "x$enable_xvmc" = xyes)
 
 if test "x$enable_vdpau" = xyes; then
     PKG_CHECK_MODULES([VDPAU], [vdpau >= $VDPAU_REQUIRED])
     enable_gallium_loader=$enable_shared_pipe_drivers
+    gallium_st="$gallium_st vdpau"
 fi
 AM_CONDITIONAL(HAVE_ST_VDPAU, test "x$enable_vdpau" = xyes)
 
 if test "x$enable_omx" = xyes; then
     PKG_CHECK_MODULES([OMX], [libomxil-bellagio >= $LIBOMXIL_BELLAGIO_REQUIRED])
     enable_gallium_loader=$enable_shared_pipe_drivers
+    gallium_st="$gallium_st omx"
 fi
 AM_CONDITIONAL(HAVE_ST_OMX, test "x$enable_omx" = xyes)
 
 if test "x$enable_va" = xyes; then
     PKG_CHECK_MODULES([VA], [libva >= $LIBVA_REQUIRED])
     enable_gallium_loader=$enable_shared_pipe_drivers
+    gallium_st="$gallium_st va"
 fi
 AM_CONDITIONAL(HAVE_ST_VA, test "x$enable_va" = xyes)
 
@@ -1677,6 +1684,7 @@ if test "x$enable_nine" = xyes; then
     fi
 
     enable_gallium_loader=$enable_shared_pipe_drivers
+    gallium_st="$gallium_st nine"
 fi
 AM_CONDITIONAL(HAVE_ST_NINE, test "x$enable_nine" = xyes)
 
@@ -1716,6 +1724,7 @@ if test "x$enable_opencl" = xyes; then
 
     # XXX: Use $enable_shared_pipe_drivers once converted to use static/shared pipe-drivers
     enable_gallium_loader=yes
+    gallium_st="$gallium_st clover"
 
     if test "x$enable_opencl_icd" = xyes; then
         OPENCL_LIBNAME="MesaOpenCL"
@@ -2526,7 +2535,8 @@ fi
 
 echo ""
 if test -n "$with_gallium_drivers"; then
-    echo "        Gallium:         yes"
+    echo "        Gallium drivers: $gallium_drivers"
+    echo "        Gallium st:      $gallium_st"
 else
     echo "        Gallium:         no"
 fi
diff --git a/docs/GL3.txt b/docs/GL3.txt
index e17e783d331..6503e2ab1da 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -96,18 +96,18 @@ GL 4.0, GLSL 4.00 --- all DONE: nvc0, radeonsi
 
   GL_ARB_draw_buffers_blend                            DONE (i965, nv50, r600, llvmpipe, softpipe)
   GL_ARB_draw_indirect                                 DONE (i965, r600, llvmpipe, softpipe)
-  GL_ARB_gpu_shader5                                   DONE (i965)
+  GL_ARB_gpu_shader5                                   DONE (i965, r600)
   - 'precise' qualifier                                DONE
-  - Dynamically uniform sampler array indices          DONE (r600, softpipe)
-  - Dynamically uniform UBO array indices              DONE (r600)
+  - Dynamically uniform sampler array indices          DONE (softpipe)
+  - Dynamically uniform UBO array indices              DONE ()
   - Implicit signed -> unsigned conversions            DONE
   - Fused multiply-add                                 DONE ()
-  - Packing/bitfield/conversion functions              DONE (r600, softpipe)
-  - Enhanced textureGather                             DONE (r600, softpipe)
-  - Geometry shader instancing                         DONE (r600, llvmpipe, softpipe)
+  - Packing/bitfield/conversion functions              DONE (softpipe)
+  - Enhanced textureGather                             DONE (softpipe)
+  - Geometry shader instancing                         DONE (llvmpipe, softpipe)
   - Geometry shader multiple streams                   DONE ()
-  - Enhanced per-sample shading                        DONE (r600)
-  - Interpolation functions                            DONE (r600)
+  - Enhanced per-sample shading                        DONE ()
+  - Interpolation functions                            DONE ()
   - New overload resolution rules                      DONE
   GL_ARB_gpu_shader_fp64                               DONE (r600, llvmpipe, softpipe)
   GL_ARB_sample_shading                                DONE (i965, nv50, r600)
diff --git a/docs/index.html b/docs/index.html
index 9aa2821dcfe..138447fc500 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -16,6 +16,12 @@
 
 <h1>News</h1>
 
+<h2>October 10, 2015</h2>
+<p>
+<a href="relnotes/11.0.3.html">Mesa 11.0.3</a> is released.
+This is a bug-fix release.
+</p>
+
 <h2>October 3, 2015</h2>
 <p>
 <a href="relnotes/10.6.9.html">Mesa 10.6.9</a> is released.
diff --git a/docs/relnotes.html b/docs/relnotes.html
index 1c47853d81e..074c3b6a612 100644
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -21,6 +21,7 @@ The release notes summarize what's new or changed in each Mesa release.
 </p>
 
 <ul>
+<li><a href="relnotes/11.0.3.html">11.0.3 release notes</a>
 <li><a href="relnotes/10.6.9.html">10.6.9 release notes</a>
 <li><a href="relnotes/11.0.2.html">11.0.2 release notes</a>
 <li><a href="relnotes/11.0.1.html">11.0.1 release notes</a>
diff --git a/docs/relnotes/11.0.3.html b/docs/relnotes/11.0.3.html
new file mode 100644
index 00000000000..e839c2121e5
--- /dev/null
+++ b/docs/relnotes/11.0.3.html
@@ -0,0 +1,185 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.0.3 Release Notes / October 10, 2015</h1>
+
+<p>
+Mesa 11.0.3 is a bug fix release which fixes bugs found since the 11.0.2 release.
+</p>
+<p>
+Mesa 11.0.3 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+c2210e3daecc10ed9fdcea500327652ed6effc2f47c4b9cee63fb08f560d7117  mesa-11.0.3.tar.gz
+ab2992eece21adc23c398720ef8c6933cb69ea42e1b2611dc09d031e17e033d6  mesa-11.0.3.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=55552">Bug 55552</a> - Compile errors with --enable-mangling</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=71789">Bug 71789</a> - [r300g] Visuals not found in (default) depth = 24</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91044">Bug 91044</a> - piglit spec/egl_khr_create_context/valid debug flag gles* fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91342">Bug 91342</a> - Very dark textures on some objects in indoors environments in Postal 2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91596">Bug 91596</a> - EGL_KHR_gl_colorspace (v2) causes problem with Android-x86 GUI</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91718">Bug 91718</a> - piglit.spec.arb_shader_image_load_store.invalid causes intermittent GPU HANG</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92072">Bug 92072</a> - Wine breakage since d082c5324 (st/mesa: don't call st_validate_state in BlitFramebuffer)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92265">Bug 92265</a> - Black windows in weston after update mesa to 11.0.2-1</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Brian Paul (1):</p>
+<ul>
+  <li>st/mesa: try PIPE_BIND_RENDER_TARGET when choosing float texture formats</li>
+</ul>
+
+<p>Daniel Scharrer (1):</p>
+<ul>
+  <li>mesa: Add abs input modifier to base for POW in ffvertex_prog</li>
+</ul>
+
+<p>Emil Velikov (3):</p>
+<ul>
+  <li>docs: add sha256 checksums for 11.0.2</li>
+  <li>Revert "nouveau: make sure there's always room to emit a fence"</li>
+  <li>Update version to 11.0.3</li>
+</ul>
+
+<p>Francisco Jerez (1):</p>
+<ul>
+  <li>i965/fs: Fix hang on IVB and VLV with image format mismatch.</li>
+</ul>
+
+<p>Ian Romanick (1):</p>
+<ul>
+  <li>meta: Handle array textures in scaled MSAA blits</li>
+</ul>
+
+<p>Ilia Mirkin (6):</p>
+<ul>
+  <li>nouveau: be more careful about freeing temporary transfer buffers</li>
+  <li>nouveau: delay deleting buffer with unflushed fence</li>
+  <li>nouveau: wait to unref the transfer's bo until it's no longer used</li>
+  <li>nv30: pretend to have packed texture/surface formats</li>
+  <li>nv30: always go through translate module on big-endian</li>
+  <li>nouveau: make sure there's always room to emit a fence</li>
+</ul>
+
+<p>Jason Ekstrand (1):</p>
+<ul>
+  <li>mesa: Correctly handle GL_BGRA_EXT in ES3 format_and_type checks</li>
+</ul>
+
+<p>Kyle Brenneman (3):</p>
+<ul>
+  <li>glx: Fix build errors with --enable-mangling (v2)</li>
+  <li>mapi: Make _glapi_get_stub work with "gl" or "mgl" prefix.</li>
+  <li>glx: Don't hard-code the name "libGL.so.1" in driOpenDriver (v3)</li>
+</ul>
+
+<p>Leo Liu (1):</p>
+<ul>
+  <li>radeon/vce: fix vui time_scale zero error</li>
+</ul>
+
+<p>Marek Olšák (21):</p>
+<ul>
+  <li>st/mesa: fix front buffer regression after dropping st_validate_state in Blit</li>
+  <li>radeonsi: handle index buffer alloc failures</li>
+  <li>radeonsi: handle constant buffer alloc failures</li>
+  <li>gallium/radeon: handle buffer_map staging buffer failures better</li>
+  <li>gallium/radeon: handle buffer alloc failures in r600_draw_rectangle</li>
+  <li>gallium/radeon: add a fail path for depth MSAA texture readback</li>
+  <li>radeonsi: report alloc failure from si_shader_binary_read</li>
+  <li>radeonsi: add malloc fail paths to si_create_shader_state</li>
+  <li>radeonsi: skip drawing if the tess factor ring allocation fails</li>
+  <li>radeonsi: skip drawing if GS ring allocations fail</li>
+  <li>radeonsi: handle shader precompile failures</li>
+  <li>radeonsi: handle fixed-func TCS shader create failure</li>
+  <li>radeonsi: skip drawing if VS, TCS, TES, GS fail to compile or upload</li>
+  <li>radeonsi: skip drawing if PS fails to compile or upload</li>
+  <li>radeonsi: skip drawing if updating the scratch buffer fails</li>
+  <li>radeonsi: don't forget to update scratch relocations for LS, HS, ES shaders</li>
+  <li>radeonsi: handle dummy constant buffer allocation failure</li>
+  <li>gallium/u_blitter: handle allocation failures</li>
+  <li>radeonsi: add scratch buffer to the buffer list when it's re-allocated</li>
+  <li>st/dri: don't use _ctx in client_wait_sync</li>
+  <li>egl/dri2: don't require a context for ClientWaitSync (v2)</li>
+</ul>
+
+<p>Matthew Waters (1):</p>
+<ul>
+  <li>egl: rework handling EGL_CONTEXT_FLAGS</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>st/dri: Use packed RGB formats</li>
+</ul>
+
+<p>Roland Scheidegger (1):</p>
+<ul>
+  <li>mesa: fix mipmap generation for immutable, compressed textures</li>
+</ul>
+
+<p>Tom Stellard (3):</p>
+<ul>
+  <li>gallium/radeon: Use call_once() when initailizing LLVM targets</li>
+  <li>gallivm: Allow drivers and state trackers to initialize gallivm LLVM targets v2</li>
+  <li>radeon/llvm: Initialize gallivm targets when initializing the AMDGPU target v2</li>
+</ul>
+
+<p>Varad Gautam (1):</p>
+<ul>
+  <li>egl: restore surface type before linking config to its display</li>
+</ul>
+
+<p>Ville Syrjälä (3):</p>
+<ul>
+  <li>i830: Fix collision between I830_UPLOAD_RASTER_RULES and I830_UPLOAD_TEX(0)</li>
+  <li>i915: Fix texcoord vs. varying collision in fragment programs</li>
+  <li>i915: Remember to call intel_prepare_render() before blitting</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/11.1.0.html b/docs/relnotes/11.1.0.html
index d4f30d0da62..dcf425e4c68 100644
--- a/docs/relnotes/11.1.0.html
+++ b/docs/relnotes/11.1.0.html
@@ -46,6 +46,7 @@ Note: some of the new features are only available with certain drivers.
 <ul>
 <li>GL_ARB_blend_func_extended on freedreno (a3xx)</li>
 <li>GL_ARB_gpu_shader_fp64 on r600 for Cypress/Cayman/Aruba chips</li>
+<li>GL_ARB_gpu_shader5 on r600 for Evergreen and later chips</li>
 <li>GL_ARB_shader_storage_buffer_object on i965</li>
 <li>GL_ARB_shader_texture_image_samples on i965, nv50, nvc0, r600, radeonsi</li>
 <li>GL_ARB_texture_barrier / GL_NV_texture_barrier on i965</li>
diff --git a/src/Makefile.am b/src/Makefile.am
index 13cfaa5b367..da638a811fb 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -66,7 +66,6 @@ AM_CPPFLAGS = \
 noinst_LTLIBRARIES = libglsl_util.la
 
 libglsl_util_la_SOURCES = \
-	glsl/shader_enums.c \
 	mesa/main/imports.c \
 	mesa/program/prog_hash_table.c \
 	mesa/program/symbol_table.c \
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 1fa36416b8e..9df4e265b5b 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -137,6 +137,8 @@ C_SOURCES := \
 	tgsi/tgsi_dump.h \
 	tgsi/tgsi_exec.c \
 	tgsi/tgsi_exec.h \
+	tgsi/tgsi_emulate.c \
+	tgsi/tgsi_emulate.h \
 	tgsi/tgsi_info.c \
 	tgsi/tgsi_info.h \
 	tgsi/tgsi_iterate.c \
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
index c4ae30461cb..c88dfbf974a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@@ -129,7 +129,8 @@ lp_build_emit_llvm_unary(
    unsigned tgsi_opcode,
    LLVMValueRef arg0)
 {
-   struct lp_build_emit_data emit_data;
+   struct lp_build_emit_data emit_data = {{0}};
+   emit_data.info = tgsi_get_opcode_info(tgsi_opcode);
    emit_data.arg_count = 1;
    emit_data.args[0] = arg0;
    return lp_build_emit_llvm(bld_base, tgsi_opcode, &emit_data);
@@ -142,7 +143,8 @@ lp_build_emit_llvm_binary(
    LLVMValueRef arg0,
    LLVMValueRef arg1)
 {
-   struct lp_build_emit_data emit_data;
+   struct lp_build_emit_data emit_data = {{0}};
+   emit_data.info = tgsi_get_opcode_info(tgsi_opcode);
    emit_data.arg_count = 2;
    emit_data.args[0] = arg0;
    emit_data.args[1] = arg1;
@@ -157,7 +159,8 @@ lp_build_emit_llvm_ternary(
    LLVMValueRef arg1,
    LLVMValueRef arg2)
 {
-   struct lp_build_emit_data emit_data;
+   struct lp_build_emit_data emit_data = {{0}};
+   emit_data.info = tgsi_get_opcode_info(tgsi_opcode);
    emit_data.arg_count = 3;
    emit_data.args[0] = arg0;
    emit_data.args[1] = arg1;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
index 0ad78b0ace2..3d5e2cb316b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -538,12 +538,19 @@ lrp_emit(
    struct lp_build_tgsi_context * bld_base,
    struct lp_build_emit_data * emit_data)
 {
-   LLVMValueRef tmp;
-   tmp = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_SUB,
-                                   emit_data->args[1],
-                                   emit_data->args[2]);
-   emit_data->output[emit_data->chan] = lp_build_emit_llvm_ternary(bld_base,
-                    TGSI_OPCODE_MAD, emit_data->args[0], tmp, emit_data->args[2]);
+   struct lp_build_context *bld = &bld_base->base;
+   LLVMValueRef inv, a, b;
+
+   /* This uses the correct version: (1 - t)*a + t*b
+    *
+    * An alternative version is "a + t*(b-a)". The problem is this version
+    * doesn't return "b" for t = 1, because "a + (b-a)" isn't equal to "b"
+    * because of the floating-point rounding.
+    */
+   inv = lp_build_sub(bld, bld_base->base.one, emit_data->args[0]);
+   a = lp_build_mul(bld, emit_data->args[1], emit_data->args[0]);
+   b = lp_build_mul(bld, emit_data->args[2], inv);
+   emit_data->output[emit_data->chan] = lp_build_add(bld, a, b);
 }
 
 /* TGSI_OPCODE_MAD */
diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index 95eed2698bc..ffe30b8fa79 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -987,6 +987,9 @@ hud_parse_env_var(struct hud_context *hud, const char *env)
 
       case ',':
          env++;
+         if (!pane)
+            break;
+
          y += height + hud->font.glyph_height * (pane->num_graphs + 2);
          height = 100;
 
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index cf43ef2506f..0539cfc16a1 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -27,7 +27,7 @@
 #include "glsl/nir/nir_control_flow.h"
 #include "glsl/nir/nir_builder.h"
 #include "glsl/list.h"
-#include "glsl/shader_enums.h"
+#include "glsl/nir/shader_enums.h"
 
 #include "nir/tgsi_to_nir.h"
 #include "tgsi/tgsi_parse.h"
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index 8ceb5b47584..5d80cca5b0e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -648,6 +648,7 @@ tgsi_dump_instruction(
    ctx.indent = 0;
    ctx.dump_printf = dump_ctx_printf;
    ctx.indentation = 0;
+   ctx.file = NULL;
 
    iter_instruction( &ctx.iter, (struct tgsi_full_instruction *)inst );
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_emulate.c b/src/gallium/auxiliary/tgsi/tgsi_emulate.c
new file mode 100644
index 00000000000..59d2e4c95b1
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_emulate.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (C) 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "tgsi/tgsi_transform.h"
+#include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_dump.h"
+#include "util/u_debug.h"
+
+#include "tgsi_emulate.h"
+
+struct tgsi_emulation_context { /* per-call state shared by the tgsi_emulate() callbacks */
+   struct tgsi_transform_context base; /* first member: the callbacks cast a base pointer back to this struct */
+   struct tgsi_shader_info info; /* scan of the input shader, filled by tgsi_scan_shader() */
+   unsigned flags; /* TGSI_EMU_* bits requested by the caller */
+   bool first_instruction_emitted; /* set by transform_instr() when it handles its first instruction */
+};
+
+static inline struct tgsi_emulation_context *
+tgsi_emulation_context(struct tgsi_transform_context *tctx)
+{  /* Downcast helper: turns the transform context handed to a callback into the emulation context. */
+   return (struct tgsi_emulation_context *)tctx; /* plain pointer cast, no checking */
+}
+
+static void
+transform_decl(struct tgsi_transform_context *tctx,
+               struct tgsi_full_declaration *decl)
+{  /* Declaration callback; tgsi_emulate() installs it only for TGSI_EMU_FORCE_PERSAMPLE_INTERP. */
+   struct tgsi_emulation_context *ctx = tgsi_emulation_context(tctx);
+   /* Input declarations get their interpolation location rewritten; every other file is untouched. */
+   if (ctx->flags & TGSI_EMU_FORCE_PERSAMPLE_INTERP &&
+       decl->Declaration.File == TGSI_FILE_INPUT) {
+      assert(decl->Declaration.Interpolate); /* the Interpolate flag must already be set on the declaration */
+      decl->Interp.Location = TGSI_INTERPOLATE_LOC_SAMPLE;
+   }
+   /* The declaration, modified in place or not, is always passed on to emit_declaration. */
+   tctx->emit_declaration(tctx, decl);
+}
+
+static void
+passthrough_edgeflag(struct tgsi_transform_context *tctx)
+{  /* Emits one input declaration, one EDGEFLAG output declaration and a MOV between them. */
+   struct tgsi_emulation_context *ctx = tgsi_emulation_context(tctx);
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction new_inst;
+
+   /* Input: new register at index num_inputs (assumes existing inputs use 0..num_inputs-1 - TODO confirm); only File and Range are set. */
+   decl = tgsi_default_full_declaration();
+   decl.Declaration.File = TGSI_FILE_INPUT;
+   decl.Range.First = decl.Range.Last = ctx->info.num_inputs;
+   tctx->emit_declaration(tctx, &decl);
+
+   /* Output: new register at index num_outputs (same density assumption), tagged with semantic EDGEFLAG[0]. */
+   decl = tgsi_default_full_declaration();
+   decl.Declaration.File = TGSI_FILE_OUTPUT;
+   decl.Declaration.Semantic = true;
+   decl.Range.First = decl.Range.Last = ctx->info.num_outputs;
+   decl.Semantic.Name = TGSI_SEMANTIC_EDGEFLAG;
+   decl.Semantic.Index = 0;
+   tctx->emit_declaration(tctx, &decl);
+
+   /* MOV: OUT[num_outputs].xyzw = IN[num_inputs].xxxx */
+   new_inst = tgsi_default_full_instruction();
+   new_inst.Instruction.Opcode = TGSI_OPCODE_MOV;
+   /* Destination: the output declared above, all four channels written. */
+   new_inst.Instruction.NumDstRegs = 1;
+   new_inst.Dst[0].Register.File  = TGSI_FILE_OUTPUT;
+   new_inst.Dst[0].Register.Index = ctx->info.num_outputs;
+   new_inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+   /* Source: the input declared above, with its X component replicated to every channel. */
+   new_inst.Instruction.NumSrcRegs = 1;
+   new_inst.Src[0].Register.File  = TGSI_FILE_INPUT;
+   new_inst.Src[0].Register.Index = ctx->info.num_inputs;
+   new_inst.Src[0].Register.SwizzleX = TGSI_SWIZZLE_X;
+   new_inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_X;
+   new_inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_X;
+   new_inst.Src[0].Register.SwizzleW = TGSI_SWIZZLE_X;
+
+   tctx->emit_instruction(tctx, &new_inst);
+}
+
+static void
+transform_instr(struct tgsi_transform_context *tctx,
+                struct tgsi_full_instruction *inst)
+{  /* Instruction callback; tgsi_emulate() installs it for the clamp-color and edge-flag flags. */
+   struct tgsi_emulation_context *ctx = tgsi_emulation_context(tctx);
+
+   /* Pass through edgeflags: done once, on the first call, before that instruction itself is emitted. */
+   if (!ctx->first_instruction_emitted) {
+      ctx->first_instruction_emitted = true;
+
+      if (ctx->flags & TGSI_EMU_PASSTHROUGH_EDGEFLAG)
+         passthrough_edgeflag(tctx);
+   }
+
+   /* Clamp color outputs: set Saturate on the instruction if any destination is a COLOR or BCOLOR output. */
+   if (ctx->flags & TGSI_EMU_CLAMP_COLOR_OUTPUTS) {
+      int i;
+      for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
+         unsigned semantic;
+
+         if (inst->Dst[i].Register.File != TGSI_FILE_OUTPUT ||
+             inst->Dst[i].Register.Indirect)
+            continue; /* non-output and indirectly addressed destinations are skipped */
+
+         semantic = /* semantic name recorded by the shader scan for this output register index */
+            ctx->info.output_semantic_name[inst->Dst[i].Register.Index];
+
+         if (semantic == TGSI_SEMANTIC_COLOR ||
+             semantic == TGSI_SEMANTIC_BCOLOR)
+            inst->Instruction.Saturate = true; /* the flag is per instruction, so it covers all its destinations */
+      }
+   }
+
+   tctx->emit_instruction(tctx, inst);
+}
+
+const struct tgsi_token *
+tgsi_emulate(const struct tgsi_token *tokens, unsigned flags)
+{  /* Returns a newly allocated, transformed copy of "tokens", or NULL (two distinct cases, see below). */
+   struct tgsi_emulation_context ctx;
+   struct tgsi_token *newtoks;
+   int newlen;
+   /* NULL case 1: none of the known TGSI_EMU_* bits is set, so no copy is made. */
+   if (!(flags & (TGSI_EMU_CLAMP_COLOR_OUTPUTS |
+                  TGSI_EMU_PASSTHROUGH_EDGEFLAG |
+                  TGSI_EMU_FORCE_PERSAMPLE_INTERP)))
+      return NULL;
+   /* Zeroing the context leaves both callbacks and first_instruction_emitted cleared until set below. */
+   memset(&ctx, 0, sizeof(ctx));
+   ctx.flags = flags;
+   tgsi_scan_shader(tokens, &ctx.info);
+   /* Install only the callbacks that the requested flags need. */
+   if (flags & TGSI_EMU_FORCE_PERSAMPLE_INTERP)
+      ctx.base.transform_declaration = transform_decl;
+
+   if (flags & (TGSI_EMU_CLAMP_COLOR_OUTPUTS |
+                TGSI_EMU_PASSTHROUGH_EDGEFLAG))
+      ctx.base.transform_instruction = transform_instr;
+   /* 20 extra tokens: presumably headroom for what passthrough_edgeflag() emits - TODO confirm the bound. */
+   newlen = tgsi_num_tokens(tokens) + 20;
+   newtoks = tgsi_alloc_tokens(newlen);
+   if (!newtoks)
+      return NULL; /* NULL case 2: allocation failure; callers cannot tell it apart from case 1 */
+   /* NOTE(review): the return value of tgsi_transform_shader() is ignored - confirm it cannot fail here. */
+   tgsi_transform_shader(tokens, newtoks, newlen, &ctx.base);
+   return newtoks;
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_emulate.h b/src/gallium/auxiliary/tgsi/tgsi_emulate.h
new file mode 100644
index 00000000000..425cec72ee1
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_emulate.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef TGSI_GL_EMULATION_H_
+#define TGSI_GL_EMULATION_H_
+
+#include "pipe/p_shader_tokens.h"
+
+#define TGSI_EMU_CLAMP_COLOR_OUTPUTS      (1 << 0) /* set Saturate on instructions writing COLOR/BCOLOR outputs */
+#define TGSI_EMU_PASSTHROUGH_EDGEFLAG     (1 << 1) /* add an input, an EDGEFLAG output and a MOV between them */
+#define TGSI_EMU_FORCE_PERSAMPLE_INTERP   (1 << 2) /* set input declarations to TGSI_INTERPOLATE_LOC_SAMPLE */
+/* Returns a new token array with the flagged emulations applied; NULL if no flag is set or allocation fails. */
+const struct tgsi_token *
+tgsi_emulate(const struct tgsi_token *tokens, unsigned flags);
+
+#endif /* TGSI_GL_EMULATION_H_ */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index d76dddbf7d9..b84a1753eeb 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -409,6 +409,8 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                         info->writes_edgeflag = TRUE;
                      }
                   }
+               } else if (file == TGSI_FILE_SAMPLER) {
+                  info->samplers_declared |= 1 << reg;
                }
             }
          }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 3ceb55717ee..d60ccabda6d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -64,6 +64,7 @@ struct tgsi_shader_info
    uint file_count[TGSI_FILE_COUNT];  /**< number of declared registers */
    int file_max[TGSI_FILE_COUNT];  /**< highest index of declared registers */
    int const_file_max[PIPE_MAX_CONSTANT_BUFFERS];
+   unsigned samplers_declared; /**< bitmask of declared samplers */
 
    ubyte input_array_first[PIPE_MAX_SHADER_INPUTS];
    ubyte input_array_last[PIPE_MAX_SHADER_INPUTS];
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 3d213195090..f2f518130fb 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -35,6 +35,7 @@
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_sanity.h"
 #include "util/u_debug.h"
+#include "util/u_inlines.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
 #include "util/u_bitmask.h"
@@ -1830,29 +1831,6 @@ void ureg_free_tokens( const struct tgsi_token *tokens )
 }
 
 
-static inline unsigned
-pipe_shader_from_tgsi_processor(unsigned processor)
-{
-   switch (processor) {
-   case TGSI_PROCESSOR_VERTEX:
-      return PIPE_SHADER_VERTEX;
-   case TGSI_PROCESSOR_TESS_CTRL:
-      return PIPE_SHADER_TESS_CTRL;
-   case TGSI_PROCESSOR_TESS_EVAL:
-      return PIPE_SHADER_TESS_EVAL;
-   case TGSI_PROCESSOR_GEOMETRY:
-      return PIPE_SHADER_GEOMETRY;
-   case TGSI_PROCESSOR_FRAGMENT:
-      return PIPE_SHADER_FRAGMENT;
-   case TGSI_PROCESSOR_COMPUTE:
-      return PIPE_SHADER_COMPUTE;
-   default:
-      assert(0);
-      return PIPE_SHADER_VERTEX;
-   }
-}
-
-
 struct ureg_program *
 ureg_create(unsigned processor)
 {
@@ -1872,7 +1850,7 @@ ureg_create_with_screen(unsigned processor, struct pipe_screen *screen)
    ureg->supports_any_inout_decl_range =
       screen &&
       screen->get_shader_param(screen,
-                               pipe_shader_from_tgsi_processor(processor),
+                               util_pipe_shader_from_tgsi_processor(processor),
                                PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE) != 0;
 
    for (i = 0; i < Elements(ureg->properties); i++)
diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index 5fe9e33e208..7388a499c74 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -276,7 +276,7 @@ debug_get_flags_option(const char *name,
       for (; flags->name; ++flags)
          namealign = MAX2(namealign, strlen(flags->name));
       for (flags = orig; flags->name; ++flags)
-         _debug_printf("| %*s [0x%0*"PRIu64"]%s%s\n", namealign, flags->name,
+         _debug_printf("| %*s [0x%0*"PRIx64"]%s%s\n", namealign, flags->name,
                       (int)sizeof(uint64_t)*CHAR_BIT/4, flags->value,
                       flags->desc ? " " : "", flags->desc ? flags->desc : "");
    }
@@ -291,9 +291,9 @@ debug_get_flags_option(const char *name,
 
    if (debug_get_option_should_print()) {
       if (str) {
-         debug_printf("%s: %s = 0x%"PRIu64" (%s)\n", __FUNCTION__, name, result, str);
+         debug_printf("%s: %s = 0x%"PRIx64" (%s)\n", __FUNCTION__, name, result, str);
       } else {
-         debug_printf("%s: %s = 0x%"PRIu64"\n", __FUNCTION__, name, result);
+         debug_printf("%s: %s = 0x%"PRIx64"\n", __FUNCTION__, name, result);
       }
    }
 
diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h
index bb99a02ce49..384e267b593 100644
--- a/src/gallium/auxiliary/util/u_inlines.h
+++ b/src/gallium/auxiliary/util/u_inlines.h
@@ -651,6 +651,28 @@ util_max_layer(const struct pipe_resource *r, unsigned level)
    }
 }
 
+static inline unsigned
+util_pipe_shader_from_tgsi_processor(unsigned processor)
+{  /* Maps a TGSI_PROCESSOR_* value to the PIPE_SHADER_* value of the same stage. */
+   switch (processor) {
+   case TGSI_PROCESSOR_VERTEX:
+      return PIPE_SHADER_VERTEX;
+   case TGSI_PROCESSOR_TESS_CTRL:
+      return PIPE_SHADER_TESS_CTRL;
+   case TGSI_PROCESSOR_TESS_EVAL:
+      return PIPE_SHADER_TESS_EVAL;
+   case TGSI_PROCESSOR_GEOMETRY:
+      return PIPE_SHADER_GEOMETRY;
+   case TGSI_PROCESSOR_FRAGMENT:
+      return PIPE_SHADER_FRAGMENT;
+   case TGSI_PROCESSOR_COMPUTE:
+      return PIPE_SHADER_COMPUTE;
+   default:
+      assert(0); /* any value not listed above is treated as a caller error */
+      return PIPE_SHADER_VERTEX; /* fallback result if execution continues past the assert */
+   }
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c
index 3d2193c3bf5..b31ada138b8 100644
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -544,6 +544,7 @@ u_vbuf_translate_find_free_vb_slots(struct u_vbuf *mgr,
 
          index = ffs(unused_vb_mask) - 1;
          fallback_vbs[type] = index;
+         unused_vb_mask &= ~(1 << index);
          /*printf("found slot=%i for type=%i\n", index, type);*/
       }
    }
diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am
index dff95ba5270..3de8e0fd5ad 100644
--- a/src/gallium/drivers/freedreno/Makefile.am
+++ b/src/gallium/drivers/freedreno/Makefile.am
@@ -19,7 +19,7 @@ libfreedreno_la_SOURCES = \
 
 noinst_PROGRAMS = ir3_compiler
 
-# XXX: Required due to the C++ sources in libnir/libglsl_util
+# XXX: Required due to the C++ sources in libnir
 nodist_EXTRA_ir3_compiler_SOURCES = dummy.cpp
 ir3_compiler_SOURCES = \
 	ir3/ir3_cmdline.c
@@ -28,7 +28,6 @@ ir3_compiler_LDADD = \
 	libfreedreno.la \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
 	$(top_builddir)/src/glsl/libnir.la \
-	$(top_builddir)/src/libglsl_util.la \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(GALLIUM_COMMON_LIB_DEPS) \
 	$(FREEDRENO_LIBS)
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 6153d92dc21..411f5b76329 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -798,11 +798,7 @@ fd3_emit_restore(struct fd_context *ctx)
 	OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0) |
 			A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0));
 
-	OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2);
-	OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0));
-	OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) |
-			A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) |
-			A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE);
+	fd3_emit_cache_flush(ctx, ring);
 
 	OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1);
 	OUT_RING(ring, 0x00000000);                  /* GRAS_CL_CLIP_CNTL */
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
index 795654706a7..42483f6c39b 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
@@ -90,4 +90,15 @@ void fd3_emit_restore(struct fd_context *ctx);
 
 void fd3_emit_init(struct pipe_context *pctx);
 
+static inline void  /* emits a UCHE cache-invalidate register write into 'ring' */
+fd3_emit_cache_flush(struct fd_context *ctx, struct fd_ringbuffer *ring)
+{
+	fd_wfi(ctx, ring);  /* NOTE(review): presumably a wait-for-idle ahead of the invalidate -- confirm */
+	OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2);  /* 2: matches the two OUT_RING words below */
+	OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0));
+	OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) |
+			A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) |
+			A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE);
+}
+
 #endif /* FD3_EMIT_H */
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
index 9a5b45e2fcb..21fb59e450d 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@@ -558,6 +558,8 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 	OUT_RING(ring, fui(x1));
 	OUT_RING(ring, fui(y1));
 
+	fd3_emit_cache_flush(ctx, ring);
+
 	for (i = 0; i < 4; i++) {
 		OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1);
 		OUT_RING(ring, A3XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY) |
diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c
index 6831a58749c..7bf3343f43a 100644
--- a/src/gallium/drivers/freedreno/freedreno_draw.c
+++ b/src/gallium/drivers/freedreno/freedreno_draw.c
@@ -187,6 +187,9 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 	for (i = 0; i < ctx->streamout.num_targets; i++)
 		ctx->streamout.offsets[i] += prims;
 
+	if (fd_mesa_debug & FD_DBG_DDRAW)
+		ctx->dirty = 0xffffffff;
+
 	/* if an app (or, well, piglit test) does many thousands of draws
 	 * without flush (or anything which implicitly flushes, like
 	 * changing render targets), we can exceed the ringbuffer size.
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 0d0100590d6..b64f78ca32b 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -61,7 +61,7 @@ static const struct debug_named_value debug_options[] = {
 		{"msgs",      FD_DBG_MSGS,   "Print debug messages"},
 		{"disasm",    FD_DBG_DISASM, "Dump TGSI and adreno shader disassembly"},
 		{"dclear",    FD_DBG_DCLEAR, "Mark all state dirty after clear"},
-		{"flush",     FD_DBG_FLUSH,  "Force flush after every draw"},
+		{"ddraw",     FD_DBG_DDRAW,  "Mark all state dirty after draw"},
 		{"noscis",    FD_DBG_NOSCIS, "Disable scissor optimization"},
 		{"direct",    FD_DBG_DIRECT, "Force inline (SS_DIRECT) state loads"},
 		{"nobypass",  FD_DBG_NOBYPASS, "Disable GMEM bypass"},
@@ -70,6 +70,7 @@ static const struct debug_named_value debug_options[] = {
 		{"optmsgs",   FD_DBG_OPTMSGS,"Enable optimizer debug messages"},
 		{"glsl120",   FD_DBG_GLSL120,"Temporary flag to force GLSL 1.20 (rather than 1.30) on a3xx+"},
 		{"shaderdb",  FD_DBG_SHADERDB, "Enable shaderdb output"},
+		{"flush",     FD_DBG_FLUSH,  "Force flush after every draw"},
 		DEBUG_NAMED_VALUE_END
 };
 
diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h
index 7129a1bddd1..0d2418e1e00 100644
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -63,7 +63,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
 #define FD_DBG_MSGS     0x0001
 #define FD_DBG_DISASM   0x0002
 #define FD_DBG_DCLEAR   0x0004
-#define FD_DBG_FLUSH    0x0008
+#define FD_DBG_DDRAW    0x0008
 #define FD_DBG_NOSCIS   0x0010
 #define FD_DBG_DIRECT   0x0020
 #define FD_DBG_NOBYPASS 0x0040
@@ -72,6 +72,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
 #define FD_DBG_OPTMSGS  0x0200
 #define FD_DBG_GLSL120  0x0400
 #define FD_DBG_SHADERDB 0x0800
+#define FD_DBG_FLUSH    0x1000
 
 extern int fd_mesa_debug;
 extern bool fd_binning_enabled;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 7eddbdd3825..8c9234b3847 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -36,7 +36,6 @@
 #include "tgsi/tgsi_strings.h"
 
 #include "nir/tgsi_to_nir.h"
-#include "glsl/shader_enums.h"
 
 #include "freedreno_util.h"
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.h b/src/gallium/drivers/freedreno/ir3/ir3_nir.h
index f3d3075e6a6..9950782dc38 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.h
@@ -30,6 +30,7 @@
 #define IR3_NIR_H_
 
 #include "glsl/nir/nir.h"
+#include "glsl/nir/shader_enums.h"
 
 bool ir3_nir_lower_if_else(nir_shader *shader);
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index 6dc0ce1133f..7e2c27d9765 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -30,7 +30,7 @@
 #define IR3_SHADER_H_
 
 #include "pipe/p_state.h"
-#include "glsl/shader_enums.h"
+#include "glsl/nir/shader_enums.h"
 
 #include "ir3.h"
 #include "disasm.h"
diff --git a/src/gallium/drivers/ilo/core/ilo_builder.c b/src/gallium/drivers/ilo/core/ilo_builder.c
index 4e05a3aca1e..9d5195129b7 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder.c
+++ b/src/gallium/drivers/ilo/core/ilo_builder.c
@@ -25,6 +25,8 @@
  *    Chia-I Wu <[email protected]>
  */
 
+#include "util/u_memory.h"
+
 #include "ilo_builder.h"
 #include "ilo_builder_render.h" /* for ilo_builder_batch_patch_sba() */
 
diff --git a/src/gallium/drivers/ilo/core/ilo_core.h b/src/gallium/drivers/ilo/core/ilo_core.h
index da7db90a54b..cbc568c4cd0 100644
--- a/src/gallium/drivers/ilo/core/ilo_core.h
+++ b/src/gallium/drivers/ilo/core/ilo_core.h
@@ -30,8 +30,6 @@
 
 #include "pipe/p_compiler.h"
 
-#include "util/u_debug.h"
 #include "util/u_math.h"
-#include "util/u_memory.h"
 
 #endif /* ILO_CORE_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_debug.h b/src/gallium/drivers/ilo/core/ilo_debug.h
index 9833233d796..532a2aa7ed6 100644
--- a/src/gallium/drivers/ilo/core/ilo_debug.h
+++ b/src/gallium/drivers/ilo/core/ilo_debug.h
@@ -28,6 +28,8 @@
 #ifndef ILO_DEBUG_H
 #define ILO_DEBUG_H
 
+#include "util/u_debug.h"
+
 #include "ilo_core.h"
 
 /* enable debug flags affecting hot pathes only with debug builds */
diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index fa547ac5c36..6eefc8f46d2 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -286,8 +286,8 @@ image_get_gen6_tiling(const struct ilo_dev *dev,
                                  info->bind_surface_dp_typed))
          return GEN6_TILING_NONE;
 
-      if (estimated_size <= 64 ||
-          estimated_size > info->prefer_linear_threshold)
+      if (estimated_size <= 64 || (info->prefer_linear_threshold &&
+               estimated_size > info->prefer_linear_threshold))
          return GEN6_TILING_NONE;
 
       if (estimated_size <= 2048)
diff --git a/src/gallium/drivers/ilo/core/ilo_image.h b/src/gallium/drivers/ilo/core/ilo_image.h
index 646ed6f5727..546e0ff7739 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.h
+++ b/src/gallium/drivers/ilo/core/ilo_image.h
@@ -102,7 +102,7 @@ struct ilo_image_info {
 
    /*
     * prefer GEN6_TILING_NONE when the (estimated) image size exceeds the
-    * threshold
+    * threshold; ignored when zero
     */
    uint32_t prefer_linear_threshold;
 
diff --git a/src/gallium/drivers/ilo/core/ilo_state_cc.c b/src/gallium/drivers/ilo/core/ilo_state_cc.c
index 83ee8de979c..1f2456e19ea 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_cc.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_cc.c
@@ -694,10 +694,10 @@ cc_set_gen8_3DSTATE_PS_BLEND(struct ilo_state_cc *cc,
       cc_get_gen6_effective_rt(dev, info, 0, &rt0);
 
       /* 0x0 is reserved for blend factors and we have to set them all */
-      dw1 |= rt0.a_src << GEN8_PS_BLEND_DW1_SRC_ALPHA_FACTOR__SHIFT |
-             rt0.a_dst << GEN8_PS_BLEND_DW1_DST_ALPHA_FACTOR__SHIFT |
-             rt0.rgb_src << GEN8_PS_BLEND_DW1_SRC_COLOR_FACTOR__SHIFT |
-             rt0.rgb_dst << GEN8_PS_BLEND_DW1_DST_COLOR_FACTOR__SHIFT;
+      dw1 |= rt0.a_src << GEN8_PS_BLEND_DW1_RT0_SRC_ALPHA_FACTOR__SHIFT |
+             rt0.a_dst << GEN8_PS_BLEND_DW1_RT0_DST_ALPHA_FACTOR__SHIFT |
+             rt0.rgb_src << GEN8_PS_BLEND_DW1_RT0_SRC_COLOR_FACTOR__SHIFT |
+             rt0.rgb_dst << GEN8_PS_BLEND_DW1_RT0_DST_COLOR_FACTOR__SHIFT;
 
       for (i = 0; i < blend->rt_count; i++) {
          if (blend->rt[i].argb_write_disables != 0xf) {
@@ -707,10 +707,10 @@ cc_set_gen8_3DSTATE_PS_BLEND(struct ilo_state_cc *cc,
       }
 
       if (rt0.blend_enable) {
-         dw1 |= GEN8_PS_BLEND_DW1_BLEND_ENABLE;
+         dw1 |= GEN8_PS_BLEND_DW1_RT0_BLEND_ENABLE;
 
          if (rt0.a_src != rt0.rgb_src || rt0.a_dst != rt0.rgb_dst)
-            dw1 |= GEN8_PS_BLEND_DW1_INDEPENDENT_ALPHA_ENABLE;
+            dw1 |= GEN8_PS_BLEND_DW1_RT0_INDEPENDENT_ALPHA_ENABLE;
       }
    }
 
diff --git a/src/gallium/drivers/ilo/core/ilo_state_raster.c b/src/gallium/drivers/ilo/core/ilo_state_raster.c
index ed64a1f0d3c..a694f71bbbf 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_raster.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_raster.c
@@ -512,7 +512,7 @@ raster_set_gen8_3DSTATE_RASTER(struct ilo_state_raster *rs,
 
    /* where should line_msaa_enable be set? */
    if (setup->msaa_enable)
-      dw1 |= GEN8_RASTER_DW1_API_MULTISAMPLE_ENABLE;
+      dw1 |= GEN8_RASTER_DW1_DX_MULTISAMPLE_ENABLE;
 
    if (tri->depth_offset_solid)
       dw1 |= GEN8_RASTER_DW1_DEPTH_OFFSET_SOLID;
@@ -574,10 +574,6 @@ get_gen6_sample_count(const struct ilo_dev *dev, uint8_t sample_count)
       c = GEN7_NUMSAMPLES_8;
       min_gen = ILO_GEN(7);
       break;
-   case 16:
-      c = GEN8_NUMSAMPLES_16;
-      min_gen = ILO_GEN(8);
-      break;
    default:
       assert(!"unexpected sample count");
       c = GEN6_NUMSAMPLES_1;
@@ -792,17 +788,17 @@ raster_set_gen8_3DSTATE_WM(struct ilo_state_raster *rs,
    if (ilo_dev_gen(dev) < ILO_GEN(8)) {
       switch (scan->earlyz_op) {
       case ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR:
-         dw1 |= GEN7_WM_DW1_DEPTH_CLEAR;
+         dw1 |= GEN7_WM_DW1_LEGACY_DEPTH_CLEAR;
          break;
       case ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE:
-         dw1 |= GEN7_WM_DW1_DEPTH_RESOLVE;
+         dw1 |= GEN7_WM_DW1_LEGACY_DEPTH_RESOLVE;
          break;
       case ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE:
-         dw1 |= GEN7_WM_DW1_HIZ_RESOLVE;
+         dw1 |= GEN7_WM_DW1_LEGACY_HIZ_RESOLVE;
          break;
       default:
          if (scan->earlyz_stencil_clear)
-            dw1 |= GEN7_WM_DW1_DEPTH_CLEAR;
+            dw1 |= GEN7_WM_DW1_LEGACY_DEPTH_CLEAR;
          break;
       }
    }
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sbe.c b/src/gallium/drivers/ilo/core/ilo_state_sbe.c
index 5d1d400acdd..1b4ca0683c9 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_sbe.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_sbe.c
@@ -239,8 +239,8 @@ sbe_set_gen8_3DSTATE_SBE(struct ilo_state_sbe *sbe,
          vue_read_len << GEN7_SBE_DW1_URB_READ_LEN__SHIFT;
 
    if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
-      dw1 |= GEN8_SBE_DW1_USE_URB_READ_LEN |
-             GEN8_SBE_DW1_USE_URB_READ_OFFSET |
+      dw1 |= GEN8_SBE_DW1_FORCE_URB_READ_LEN |
+             GEN8_SBE_DW1_FORCE_URB_READ_OFFSET |
              vue_read_offset << GEN8_SBE_DW1_URB_READ_OFFSET__SHIFT;
    } else {
       dw1 |= vue_read_offset << GEN7_SBE_DW1_URB_READ_OFFSET__SHIFT;
@@ -286,10 +286,10 @@ sbe_set_gen8_3DSTATE_SBE_SWIZ(struct ilo_state_sbe *sbe,
                 swizzle->attr << GEN8_SBE_SWIZ_SRC_ATTR__SHIFT;
 
       if (swizzle->force_zeros) {
-         swiz[i] |= GEN8_SBE_SWIZ_OVERRIDE_W |
-                    GEN8_SBE_SWIZ_OVERRIDE_Z |
-                    GEN8_SBE_SWIZ_OVERRIDE_Y |
-                    GEN8_SBE_SWIZ_OVERRIDE_X |
+         swiz[i] |= GEN8_SBE_SWIZ_CONST_OVERRIDE_W |
+                    GEN8_SBE_SWIZ_CONST_OVERRIDE_Z |
+                    GEN8_SBE_SWIZ_CONST_OVERRIDE_Y |
+                    GEN8_SBE_SWIZ_CONST_OVERRIDE_X |
                     GEN8_SBE_SWIZ_CONST_0000;
       }
    }
diff --git a/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c b/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c
index f4d801e9b56..ceeb68a460e 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c
@@ -592,7 +592,12 @@ ps_set_gen8_3DSTATE_PS(struct ilo_state_ps *ps,
 
    ILO_DEV_ASSERT(dev, 8, 8);
 
-   dw3 = ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+   /*
+    * Set VME here for correct computation of LODs and others.  Not sure why
+    * it is needed now.
+    */
+   dw3 = GEN6_THREADDISP_VME |
+         ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
          ff->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
 
    if (false)
diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.c b/src/gallium/drivers/ilo/core/ilo_state_surface.c
index 40fe15f316f..27c37535fc8 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_surface.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_surface.c
@@ -814,10 +814,6 @@ surface_get_gen6_image_sample_count(const struct ilo_dev *dev,
       *sample_count = GEN7_NUMSAMPLES_8;
       min_gen = ILO_GEN(7);
       break;
-   case 16:
-      *sample_count = GEN8_NUMSAMPLES_16;
-      min_gen = ILO_GEN(8);
-      break;
    default:
       assert(!"invalid sample count");
       *sample_count = GEN6_NUMSAMPLES_1;
diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.c b/src/gallium/drivers/ilo/core/ilo_state_vf.c
index 9faf835fef2..8f091e21a27 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_vf.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_vf.c
@@ -369,14 +369,14 @@ vf_params_set_gen8_3DSTATE_VF_SGVS(struct ilo_state_vf *vf,
 
    if (params->prepend_instanceid) {
       dw1 |= GEN8_SGVS_DW1_IID_ENABLE |
-             1 << GEN8_SGVS_DW1_IID_VE_COMP__SHIFT |
-             attr << GEN8_SGVS_DW1_IID_VE_INDEX__SHIFT;
+             1 << GEN8_SGVS_DW1_IID_COMP__SHIFT |
+             attr << GEN8_SGVS_DW1_IID_OFFSET__SHIFT;
    }
 
    if (params->prepend_vertexid) {
       dw1 |= GEN8_SGVS_DW1_VID_ENABLE |
-             0 << GEN8_SGVS_DW1_VID_VE_COMP__SHIFT |
-             attr << GEN8_SGVS_DW1_VID_VE_INDEX__SHIFT;
+             0 << GEN8_SGVS_DW1_VID_COMP__SHIFT |
+             attr << GEN8_SGVS_DW1_VID_OFFSET__SHIFT;
    }
 
    STATIC_ASSERT(ARRAY_SIZE(vf->sgvs) >= 1);
diff --git a/src/gallium/drivers/ilo/genhw/gen_eu_message.xml.h b/src/gallium/drivers/ilo/genhw/gen_eu_message.xml.h
index fe8b26908c0..96cf543d27e 100644
--- a/src/gallium/drivers/ilo/genhw/gen_eu_message.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_eu_message.xml.h
@@ -41,7 +41,9 @@ enum gen_eu_urb_op {
     GEN7_MSG_URB_READ_OWORD				      = 0x3,
     GEN7_MSG_URB_ATOMIC_MOV				      = 0x4,
     GEN7_MSG_URB_ATOMIC_INC				      = 0x5,
+    GEN75_MSG_URB_ATOMIC_ADD				      = 0x6,
     GEN8_MSG_URB_SIMD8_WRITE				      = 0x7,
+    GEN8_MSG_URB_SIMD8_READ				      = 0x8,
 };
 
 enum gen_eu_pi_simd {
@@ -137,6 +139,7 @@ enum gen_eu_dp_op {
     GEN75_MSG_DP_RC_MEMORY_FENCE			      = 0x7,
     GEN75_MSG_DP_RC_MEDIA_BLOCK_WRITE			      = 0xa,
     GEN75_MSG_DP_RC_RT_WRITE				      = 0xc,
+    GEN8_MSG_DP_RC_RT_READ				      = 0xd,
     GEN75_MSG_DP_CC_OWORD_BLOCK_READ			      = 0x0,
     GEN75_MSG_DP_CC_UNALIGNED_OWORD_BLOCK_READ		      = 0x1,
     GEN75_MSG_DP_CC_OWORD_DUAL_BLOCK_READ		      = 0x2,
diff --git a/src/gallium/drivers/ilo/genhw/gen_mi.xml.h b/src/gallium/drivers/ilo/genhw/gen_mi.xml.h
index 5a0bb4f8d77..36f9618eb2d 100644
--- a/src/gallium/drivers/ilo/genhw/gen_mi.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_mi.xml.h
@@ -84,6 +84,8 @@ enum gen_mi_alu_operand {
 #define GEN7_MI_OPCODE_MI_PREDICATE				(0xc << 23)
 #define GEN7_MI_OPCODE_MI_URB_CLEAR				(0x19 << 23)
 #define GEN75_MI_OPCODE_MI_MATH					(0x1a << 23)
+#define GEN8_MI_OPCODE_MI_SEMAPHORE_SIGNAL			(0x1b << 23)
+#define GEN8_MI_OPCODE_MI_SEMAPHORE_WAIT			(0x1c << 23)
 #define GEN6_MI_OPCODE_MI_STORE_DATA_IMM			(0x20 << 23)
 #define GEN6_MI_OPCODE_MI_LOAD_REGISTER_IMM			(0x22 << 23)
 #define GEN6_MI_OPCODE_MI_STORE_REGISTER_MEM			(0x24 << 23)
@@ -91,8 +93,11 @@ enum gen_mi_alu_operand {
 #define GEN6_MI_OPCODE_MI_REPORT_PERF_COUNT			(0x28 << 23)
 #define GEN7_MI_OPCODE_MI_LOAD_REGISTER_MEM			(0x29 << 23)
 #define GEN75_MI_OPCODE_MI_LOAD_REGISTER_REG			(0x2a << 23)
+#define GEN75_MI_OPCODE_MI_RS_STORE_DATA_IMM			(0x2b << 23)
 #define GEN75_MI_OPCODE_MI_LOAD_URB_MEM				(0x2c << 23)
 #define GEN75_MI_OPCODE_MI_STORE_URB_MEM			(0x2d << 23)
+#define GEN8_MI_OPCODE_MI_COPY_MEM_MEM				(0x2e << 23)
+#define GEN8_MI_OPCODE_MI_ATOMIC				(0x2f << 23)
 #define GEN6_MI_OPCODE_MI_BATCH_BUFFER_START			(0x31 << 23)
 #define GEN6_MI_LENGTH__MASK					0x0000003f
 #define GEN6_MI_LENGTH__SHIFT					0
@@ -155,8 +160,41 @@ enum gen_mi_alu_operand {
 #define GEN75_MI_MATH_DW_SRC2__MASK				0x000007ff
 #define GEN75_MI_MATH_DW_SRC2__SHIFT				0
 
+#define GEN8_MI_SEMAPHORE_SIGNAL__SIZE				2
+#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_POST_SYNC_OP		(0x1 << 21)
+#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE__MASK		0x00038000
+#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE__SHIFT		15
+#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE_RCS			(0x0 << 15)
+#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE_VCS0		(0x1 << 15)
+#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE_BCS			(0x2 << 15)
+#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE_VECS		(0x3 << 15)
+#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE_VCS1		(0x4 << 15)
+
+
+#define GEN8_MI_SEMAPHORE_WAIT__SIZE				4
+#define GEN8_MI_SEMAPHORE_WAIT_DW0_USE_GGTT			(0x1 << 22)
+#define GEN8_MI_SEMAPHORE_WAIT_DW0_WAIT_MODE__MASK		0x00008000
+#define GEN8_MI_SEMAPHORE_WAIT_DW0_WAIT_MODE__SHIFT		15
+#define GEN8_MI_SEMAPHORE_WAIT_DW0_WAIT_MODE_SIGNAL		(0x0 << 15)
+#define GEN8_MI_SEMAPHORE_WAIT_DW0_WAIT_MODE_POLL		(0x1 << 15)
+#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP__MASK			0x00007000
+#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP__SHIFT			12
+#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_GREATER_THAN_SDD	(0x0 << 12)
+#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_GREATER_THAN_OR_EQUAL_SDD	(0x1 << 12)
+#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_LESS_THAN_SDD		(0x2 << 12)
+#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_LESS_THAN_OR_EQUAL_SDD	(0x3 << 12)
+#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_EQUAL_SDD		(0x4 << 12)
+#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_NO_EQUAL_SDD		(0x5 << 12)
+
+
+#define GEN8_MI_SEMAPHORE_WAIT_DW2_ADDR_ADDR__MASK		0xfffffffc
+#define GEN8_MI_SEMAPHORE_WAIT_DW2_ADDR_ADDR__SHIFT		2
+#define GEN8_MI_SEMAPHORE_WAIT_DW2_ADDR_ADDR__SHR		2
+
+
 #define GEN6_MI_STORE_DATA_IMM__SIZE				6
 #define GEN6_MI_STORE_DATA_IMM_DW0_USE_GGTT			(0x1 << 22)
+#define GEN8_MI_STORE_DATA_IMM_DW0_STORE_QWORD			(0x1 << 21)
 
 
 #define GEN6_MI_STORE_DATA_IMM_DW2_ADDR__MASK			0xfffffffc
@@ -188,7 +226,17 @@ enum gen_mi_alu_operand {
 #define GEN6_MI_STORE_REGISTER_MEM_DW2_ADDR__SHR		2
 
 
-#define GEN6_MI_FLUSH_DW__SIZE					4
+#define GEN6_MI_FLUSH_DW__SIZE					5
+#define GEN6_MI_FLUSH_DW_DW0_WRITE__MASK			0x0000c000
+#define GEN6_MI_FLUSH_DW_DW0_WRITE__SHIFT			14
+#define GEN6_MI_FLUSH_DW_DW0_WRITE_NONE				(0x0 << 14)
+#define GEN6_MI_FLUSH_DW_DW0_WRITE_IMM				(0x1 << 14)
+#define GEN6_MI_FLUSH_DW_DW0_WRITE_TIMESTAMP			(0x3 << 14)
+
+#define GEN6_MI_FLUSH_DW_DW1_USE_GGTT				(0x1 << 2)
+#define GEN6_MI_FLUSH_DW_DW1_ADDR__MASK				0xfffffff8
+#define GEN6_MI_FLUSH_DW_DW1_ADDR__SHIFT			3
+#define GEN6_MI_FLUSH_DW_DW1_ADDR__SHR				3
 
 
 
@@ -225,6 +273,17 @@ enum gen_mi_alu_operand {
 #define GEN75_MI_LOAD_REGISTER_REG_DW2_DST_REG__SHIFT		2
 #define GEN75_MI_LOAD_REGISTER_REG_DW2_DST_REG__SHR		2
 
+#define GEN75_MI_RS_STORE_DATA_IMM__SIZE			6
+#define GEN75_MI_RS_STORE_DATA_IMM_DW0_USE_GGTT			(0x1 << 22)
+
+
+#define GEN75_MI_RS_STORE_DATA_IMM_DW2_ADDR__MASK		0xfffffffc
+#define GEN75_MI_RS_STORE_DATA_IMM_DW2_ADDR__SHIFT		2
+#define GEN75_MI_RS_STORE_DATA_IMM_DW2_ADDR__SHR		2
+
+
+
+
 #define GEN75_MI_LOAD_URB_MEM__SIZE				4
 
 #define GEN75_MI_LOAD_URB_MEM_DW1_ADDR__MASK			0x00007ffc
@@ -247,12 +306,47 @@ enum gen_mi_alu_operand {
 #define GEN75_MI_STORE_URB_MEM_DW2_ADDR__SHR			6
 
 
+#define GEN8_MI_COPY_MEM_MEM__SIZE				5
+#define GEN8_MI_COPY_MEM_MEM_DW0_USE_GGTT_SRC			(0x1 << 22)
+#define GEN8_MI_COPY_MEM_MEM_DW0_USE_GGTT_DST			(0x1 << 21)
+
+#define GEN8_MI_COPY_MEM_MEM_DW1_DST_ADDR__MASK			0xfffffffc
+#define GEN8_MI_COPY_MEM_MEM_DW1_DST_ADDR__SHIFT		2
+#define GEN8_MI_COPY_MEM_MEM_DW1_DST_ADDR__SHR			2
+
+
+#define GEN8_MI_COPY_MEM_MEM_DW3_SRC_ADDR__MASK			0xfffffffc
+#define GEN8_MI_COPY_MEM_MEM_DW3_SRC_ADDR__SHIFT		2
+#define GEN8_MI_COPY_MEM_MEM_DW3_SRC_ADDR__SHR			2
+
+
+#define GEN8_MI_ATOMIC__SIZE					11
+#define GEN8_MI_ATOMIC_DW0_USE_GGTT				(0x1 << 22)
+#define GEN8_MI_ATOMIC_DW0_POST_SYNC_OP				(0x1 << 21)
+#define GEN8_MI_ATOMIC_DW0_SIZE__MASK				0x00180000
+#define GEN8_MI_ATOMIC_DW0_SIZE__SHIFT				19
+#define GEN8_MI_ATOMIC_DW0_SIZE_DWORD				(0x0 << 19)
+#define GEN8_MI_ATOMIC_DW0_SIZE_QWORD				(0x1 << 19)
+#define GEN8_MI_ATOMIC_DW0_SIZE_OWORD				(0x2 << 19)
+#define GEN8_MI_ATOMIC_DW0_INLINE_DATA				(0x1 << 18)
+#define GEN8_MI_ATOMIC_DW0_CS_STALL				(0x1 << 17)
+#define GEN8_MI_ATOMIC_DW0_RETURN_DATA_CONTROL			(0x1 << 16)
+#define GEN8_MI_ATOMIC_DW0_OP__MASK				0x0000ff00
+#define GEN8_MI_ATOMIC_DW0_OP__SHIFT				8
+
+#define GEN8_MI_ATOMIC_DW1_ADDR__MASK				0xfffffffc
+#define GEN8_MI_ATOMIC_DW1_ADDR__SHIFT				2
+#define GEN8_MI_ATOMIC_DW1_ADDR__SHR				2
+
+
+
 #define GEN6_MI_BATCH_BUFFER_START__SIZE			3
 #define GEN75_MI_BATCH_BUFFER_START_DW0_SECOND_LEVEL		(0x1 << 22)
 #define GEN75_MI_BATCH_BUFFER_START_DW0_ADD_OFFSET_ENABLE	(0x1 << 16)
 #define GEN75_MI_BATCH_BUFFER_START_DW0_PREDICATION_ENABLE	(0x1 << 15)
 #define GEN75_MI_BATCH_BUFFER_START_DW0_NON_PRIVILEGED		(0x1 << 13)
 #define GEN6_MI_BATCH_BUFFER_START_DW0_CLEAR_COMMAND_BUFFER	(0x1 << 11)
+#define GEN75_MI_BATCH_BUFFER_START_DW0_RS_ENABLE		(0x1 << 10)
 #define GEN6_MI_BATCH_BUFFER_START_DW0_USE_PPGTT		(0x1 << 8)
 
 #define GEN6_MI_BATCH_BUFFER_START_DW1_ADDR__MASK		0xfffffffc
diff --git a/src/gallium/drivers/ilo/genhw/gen_regs.xml.h b/src/gallium/drivers/ilo/genhw/gen_regs.xml.h
index c51e4f78bc0..54ec13eaafa 100644
--- a/src/gallium/drivers/ilo/genhw/gen_regs.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_regs.xml.h
@@ -37,6 +37,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define GEN6_REG__SIZE						0x400000
 #define GEN6_REG_NOPID						0x2094
 
+
+#define GEN6_REG_SO_PRIM_STORAGE_NEEDED				0x2280
+
+#define GEN6_REG_SO_NUM_PRIMS_WRITTEN				0x2288
+
+
+#define GEN7_REG_TS_GPGPU_THREADS_DISPATCHED			0x2290
+
 #define GEN7_REG_HS_INVOCATION_COUNT				0x2300
 
 #define GEN7_REG_DS_INVOCATION_COUNT				0x2308
@@ -95,10 +103,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define GEN75_REG_CS_GPR__ESIZE					0x8
 #define GEN75_REG_CS_GPR__LEN					0x10
 
+#define GEN7_REG_GPGPU_DISPATCHDIMX				0x2500
 
-#define GEN6_REG_SO_PRIM_STORAGE_NEEDED				0x2280
+#define GEN7_REG_GPGPU_DISPATCHDIMY				0x2504
 
-#define GEN6_REG_SO_NUM_PRIMS_WRITTEN				0x2288
+#define GEN7_REG_GPGPU_DISPATCHDIMZ				0x2508
 
 
 #define GEN7_REG_SO_NUM_PRIMS_WRITTEN(i0)			(0x5200 + 0x8*(i0))
@@ -118,8 +127,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define GEN7_REG_CACHE_MODE_0_HIZ_RAW_STALL_OPT_DISABLE		(0x1 << 2)
 
 #define GEN7_REG_CACHE_MODE_1					0x7004
-#define GEN8_REG_CACHE_MODE_1_HIZ_NP_EARLY_Z_FAILS_DISABLE	(0x1 << 13)
-#define GEN8_REG_CACHE_MODE_1_HIZ_NP_PMA_FIX_ENABLE		(0x1 << 11)
+#define GEN8_REG_CACHE_MODE_1_NP_EARLY_Z_FAILS_DISABLE		(0x1 << 13)
+#define GEN8_REG_CACHE_MODE_1_NP_PMA_FIX_ENABLE			(0x1 << 11)
 
 
 #define GEN8_REG_L3CNTLREG					0x7034
diff --git a/src/gallium/drivers/ilo/genhw/gen_render.xml.h b/src/gallium/drivers/ilo/genhw/gen_render.xml.h
index 2e86ba96ae2..43d271d838a 100644
--- a/src/gallium/drivers/ilo/genhw/gen_render.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_render.xml.h
@@ -102,6 +102,16 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define GEN7_RENDER_OPCODE_3DSTATE_URB_HS			(0x31 << 16)
 #define GEN7_RENDER_OPCODE_3DSTATE_URB_DS			(0x32 << 16)
 #define GEN7_RENDER_OPCODE_3DSTATE_URB_GS			(0x33 << 16)
+#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_CONSTANT_VS		(0x34 << 16)
+#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_CONSTANT_GS		(0x35 << 16)
+#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_CONSTANT_HS		(0x36 << 16)
+#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_CONSTANT_DS		(0x37 << 16)
+#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_CONSTANT_PS		(0x38 << 16)
+#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_EDIT_VS	(0x43 << 16)
+#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_EDIT_GS	(0x44 << 16)
+#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_EDIT_HS	(0x45 << 16)
+#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_EDIT_DS	(0x46 << 16)
+#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_EDIT_PS	(0x47 << 16)
 #define GEN8_RENDER_OPCODE_3DSTATE_VF_INSTANCING		(0x49 << 16)
 #define GEN8_RENDER_OPCODE_3DSTATE_VF_SGVS			(0x4a << 16)
 #define GEN8_RENDER_OPCODE_3DSTATE_VF_TOPOLOGY			(0x4b << 16)
@@ -130,6 +140,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_PS	(0x116 << 16)
 #define GEN7_RENDER_OPCODE_3DSTATE_SO_DECL_LIST			(0x117 << 16)
 #define GEN7_RENDER_OPCODE_3DSTATE_SO_BUFFER			(0x118 << 16)
+#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_POOL_ALLOC	(0x119 << 16)
+#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_POOL_ALLOC		(0x11a << 16)
 #define GEN8_RENDER_OPCODE_3DSTATE_SAMPLE_PATTERN		(0x11c << 16)
 #define GEN6_RENDER_OPCODE_PIPE_CONTROL				(0x200 << 16)
 #define GEN6_RENDER_OPCODE_3DPRIMITIVE				(0x300 << 16)
@@ -178,6 +190,10 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define GEN8_SBA_MOCS__MASK					0x000007f0
 #define GEN8_SBA_MOCS__SHIFT					4
 #define GEN6_SBA_ADDR_MODIFIED					(0x1 << 0)
+#define GEN8_SBA_SIZE__MASK					0xfffff000
+#define GEN8_SBA_SIZE__SHIFT					12
+#define GEN8_SBA_SIZE__SHR					12
+#define GEN8_SBA_SIZE_MODIFIED					(0x1 << 0)
 #define GEN6_BINDING_TABLE_ADDR__MASK				0x0000ffe0
 #define GEN6_BINDING_TABLE_ADDR__SHIFT				5
 #define GEN6_BINDING_TABLE_ADDR__SHR				5
diff --git a/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h
index 52173fe5d07..c79a4f3a830 100644
--- a/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h
@@ -168,7 +168,6 @@ enum gen_sample_count {
     GEN8_NUMSAMPLES_2					      = 0x1,
     GEN6_NUMSAMPLES_4					      = 0x2,
     GEN7_NUMSAMPLES_8					      = 0x3,
-    GEN8_NUMSAMPLES_16					      = 0x4,
 };
 
 enum gen_inputattr_select {
@@ -297,11 +296,58 @@ enum gen_msrast_mode {
 
 #define GEN7_URB_DW1_OFFSET__MASK				0x3e000000
 #define GEN7_URB_DW1_OFFSET__SHIFT				25
+#define GEN75_URB_DW1_OFFSET__MASK				0x7e000000
+#define GEN75_URB_DW1_OFFSET__SHIFT				25
+#define GEN8_URB_DW1_OFFSET__MASK				0xfe000000
+#define GEN8_URB_DW1_OFFSET__SHIFT				25
 #define GEN7_URB_DW1_ENTRY_SIZE__MASK				0x01ff0000
 #define GEN7_URB_DW1_ENTRY_SIZE__SHIFT				16
 #define GEN7_URB_DW1_ENTRY_COUNT__MASK				0x0000ffff
 #define GEN7_URB_DW1_ENTRY_COUNT__SHIFT				0
 
+#define GEN75_3DSTATE_GATHER_CONSTANT_ANY__SIZE			130
+
+
+#define GEN75_GATHER_CONST_DW1_BT_VALID__MASK			0xffff0000
+#define GEN75_GATHER_CONST_DW1_BT_VALID__SHIFT			16
+#define GEN75_GATHER_CONST_DW1_BT_BLOCK__MASK			0x0000f000
+#define GEN75_GATHER_CONST_DW1_BT_BLOCK__SHIFT			12
+
+#define GEN75_GATHER_CONST_DW2_GATHER_BUFFER_OFFSET__MASK	0x007fffc0
+#define GEN75_GATHER_CONST_DW2_GATHER_BUFFER_OFFSET__SHIFT	6
+#define GEN75_GATHER_CONST_DW2_GATHER_BUFFER_OFFSET__SHR	6
+#define GEN8_GATHER_CONST_DW2_DX9_STALL				(0x1 << 5)
+#define GEN75_GATHER_CONST_DW2_DX9_ENABLE			(0x1 << 4)
+
+#define GEN75_GATHER_CONST_DW_ENTRY_HIGH__MASK			0xffff0000
+#define GEN75_GATHER_CONST_DW_ENTRY_HIGH__SHIFT			16
+#define GEN75_GATHER_CONST_DW_ENTRY_OFFSET__MASK		0x0000ff00
+#define GEN75_GATHER_CONST_DW_ENTRY_OFFSET__SHIFT		8
+#define GEN75_GATHER_CONST_DW_ENTRY_CHANNEL_MASK__MASK		0x000000f0
+#define GEN75_GATHER_CONST_DW_ENTRY_CHANNEL_MASK__SHIFT		4
+#define GEN75_GATHER_CONST_DW_ENTRY_BT_INDEX__MASK		0x0000001f
+#define GEN75_GATHER_CONST_DW_ENTRY_BT_INDEX__SHIFT		0
+
+#define GEN75_3DSTATE_BINDING_TABLE_EDIT_ANY__SIZE		258
+
+
+#define GEN75_BT_EDIT_DW1_BT_BLOCK_CLEAR__MASK			0xffff0000
+#define GEN75_BT_EDIT_DW1_BT_BLOCK_CLEAR__SHIFT			16
+#define GEN75_BT_EDIT_DW1_TARGET__MASK				0x00000003
+#define GEN75_BT_EDIT_DW1_TARGET__SHIFT				0
+#define GEN75_BT_EDIT_DW1_TARGET_CORE0				0x1
+#define GEN75_BT_EDIT_DW1_TARGET_CORE1				0x2
+#define GEN75_BT_EDIT_DW1_TARGET_ALL				0x3
+
+#define GEN75_BT_EDIT_DW_ENTRY_BT_INDEX__MASK			0x00ff0000
+#define GEN75_BT_EDIT_DW_ENTRY_BT_INDEX__SHIFT			16
+#define GEN75_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__MASK		0x0000ffff
+#define GEN75_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__SHIFT	0
+#define GEN75_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__SHR		5
+#define GEN8_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__MASK		0x0000ffff
+#define GEN8_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__SHIFT		0
+#define GEN8_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__SHR		6
+
 #define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_ANY__SIZE		2
 
 
@@ -315,6 +361,48 @@ enum gen_msrast_mode {
 #define GEN75_PCB_ALLOC_DW1_SIZE__MASK				0x0000003f
 #define GEN75_PCB_ALLOC_DW1_SIZE__SHIFT				0
 
+#define GEN75_3DSTATE_BINDING_TABLE_POOL_ALLOC__SIZE		3
+
+
+#define GEN75_BT_POOL_ALLOC_DW1_ADDR__MASK			0xfffff000
+#define GEN75_BT_POOL_ALLOC_DW1_ADDR__SHIFT			12
+#define GEN75_BT_POOL_ALLOC_DW1_ADDR__SHR			12
+#define GEN75_BT_POOL_ALLOC_DW1_ENABLE				(0x1 << 11)
+#define GEN75_BT_POOL_ALLOC_DW1_MOCS__MASK			0x00000780
+#define GEN75_BT_POOL_ALLOC_DW1_MOCS__SHIFT			7
+#define GEN8_BT_POOL_ALLOC_DW1_MOCS__MASK			0x0000007f
+#define GEN8_BT_POOL_ALLOC_DW1_MOCS__SHIFT			0
+
+#define GEN75_BT_POOL_ALLOC_DW2_END_ADDR__MASK			0xfffff000
+#define GEN75_BT_POOL_ALLOC_DW2_END_ADDR__SHIFT			12
+#define GEN75_BT_POOL_ALLOC_DW2_END_ADDR__SHR			12
+
+
+#define GEN8_BT_POOL_ALLOC_DW3_SIZE__MASK			0xfffff000
+#define GEN8_BT_POOL_ALLOC_DW3_SIZE__SHIFT			12
+#define GEN8_BT_POOL_ALLOC_DW3_SIZE__SHR			12
+
+#define GEN75_3DSTATE_GATHER_POOL_ALLOC__SIZE			3
+
+
+#define GEN75_GATHER_POOL_ALLOC_DW1_ADDR__MASK			0xfffff000
+#define GEN75_GATHER_POOL_ALLOC_DW1_ADDR__SHIFT			12
+#define GEN75_GATHER_POOL_ALLOC_DW1_ADDR__SHR			12
+#define GEN75_GATHER_POOL_ALLOC_DW1_ENABLE			(0x1 << 11)
+#define GEN75_GATHER_POOL_ALLOC_DW1_MOCS__MASK			0x0000000f
+#define GEN75_GATHER_POOL_ALLOC_DW1_MOCS__SHIFT			0
+#define GEN8_GATHER_POOL_ALLOC_DW1_MOCS__MASK			0x0000007f
+#define GEN8_GATHER_POOL_ALLOC_DW1_MOCS__SHIFT			0
+
+#define GEN75_GATHER_POOL_ALLOC_DW2_END_ADDR__MASK		0xfffff000
+#define GEN75_GATHER_POOL_ALLOC_DW2_END_ADDR__SHIFT		12
+#define GEN75_GATHER_POOL_ALLOC_DW2_END_ADDR__SHR		12
+
+
+#define GEN8_GATHER_POOL_ALLOC_DW3_SIZE__MASK			0xfffff000
+#define GEN8_GATHER_POOL_ALLOC_DW3_SIZE__SHIFT			12
+#define GEN8_GATHER_POOL_ALLOC_DW3_SIZE__SHR			12
+
 #define GEN6_3DSTATE_VERTEX_BUFFERS__SIZE			133
 
 
@@ -402,15 +490,15 @@ enum gen_msrast_mode {
 
 
 #define GEN8_SGVS_DW1_IID_ENABLE				(0x1 << 31)
-#define GEN8_SGVS_DW1_IID_VE_COMP__MASK				0x60000000
-#define GEN8_SGVS_DW1_IID_VE_COMP__SHIFT			29
-#define GEN8_SGVS_DW1_IID_VE_INDEX__MASK			0x003f0000
-#define GEN8_SGVS_DW1_IID_VE_INDEX__SHIFT			16
+#define GEN8_SGVS_DW1_IID_COMP__MASK				0x60000000
+#define GEN8_SGVS_DW1_IID_COMP__SHIFT				29
+#define GEN8_SGVS_DW1_IID_OFFSET__MASK				0x003f0000
+#define GEN8_SGVS_DW1_IID_OFFSET__SHIFT				16
 #define GEN8_SGVS_DW1_VID_ENABLE				(0x1 << 15)
-#define GEN8_SGVS_DW1_VID_VE_COMP__MASK				0x00006000
-#define GEN8_SGVS_DW1_VID_VE_COMP__SHIFT			13
-#define GEN8_SGVS_DW1_VID_VE_INDEX__MASK			0x0000003f
-#define GEN8_SGVS_DW1_VID_VE_INDEX__SHIFT			0
+#define GEN8_SGVS_DW1_VID_COMP__MASK				0x00006000
+#define GEN8_SGVS_DW1_VID_COMP__SHIFT				13
+#define GEN8_SGVS_DW1_VID_OFFSET__MASK				0x0000003f
+#define GEN8_SGVS_DW1_VID_OFFSET__SHIFT				0
 
 #define GEN8_3DSTATE_VF_TOPOLOGY__SIZE				2
 
@@ -464,6 +552,10 @@ enum gen_msrast_mode {
 #define GEN7_3DSTATE_POINTERS_ANY__SIZE				2
 
 
+#define GEN7_PTR_DW1_ADDR__MASK					0xffffffe0
+#define GEN7_PTR_DW1_ADDR__SHIFT				5
+#define GEN7_PTR_DW1_ADDR__SHR					5
+#define GEN8_PTR_DW1_CHANGED					(0x1 << 0)
 
 #define GEN6_3DSTATE_VS__SIZE					9
 
@@ -513,12 +605,14 @@ enum gen_msrast_mode {
 #define GEN8_VS_DW7_CACHE_DISABLE				(0x1 << 1)
 #define GEN8_VS_DW7_VS_ENABLE					(0x1 << 0)
 
-#define GEN8_VS_DW8_URB_WRITE_OFFSET__MASK			0x03e00000
-#define GEN8_VS_DW8_URB_WRITE_OFFSET__SHIFT			21
-#define GEN8_VS_DW8_URB_WRITE_LEN__MASK				0x001f0000
-#define GEN8_VS_DW8_URB_WRITE_LEN__SHIFT			16
+#define GEN8_VS_DW8_VUE_OUT_READ_OFFSET__MASK			0x07e00000
+#define GEN8_VS_DW8_VUE_OUT_READ_OFFSET__SHIFT			21
+#define GEN8_VS_DW8_VUE_OUT_LEN__MASK				0x001f0000
+#define GEN8_VS_DW8_VUE_OUT_LEN__SHIFT				16
 #define GEN8_VS_DW8_UCP_CLIP_ENABLES__MASK			0x0000ff00
 #define GEN8_VS_DW8_UCP_CLIP_ENABLES__SHIFT			8
+#define GEN8_VS_DW8_UCP_CULL_ENABLES__MASK			0x000000ff
+#define GEN8_VS_DW8_UCP_CULL_ENABLES__SHIFT			0
 
 #define GEN7_3DSTATE_HS__SIZE					9
 
@@ -558,11 +652,11 @@ enum gen_msrast_mode {
 
 
 
-#define GEN8_HS_DW1_DISPATCH_MAX_THREADS__MASK			0x000000ff
-#define GEN8_HS_DW1_DISPATCH_MAX_THREADS__SHIFT			0
 
 #define GEN8_HS_DW2_HS_ENABLE					(0x1 << 31)
 #define GEN8_HS_DW2_STATISTICS					(0x1 << 29)
+#define GEN8_HS_DW2_MAX_THREADS__MASK				0x0001ff00
+#define GEN8_HS_DW2_MAX_THREADS__SHIFT				8
 #define GEN8_HS_DW2_INSTANCE_COUNT__MASK			0x0000000f
 #define GEN8_HS_DW2_INSTANCE_COUNT__SHIFT			0
 
@@ -584,9 +678,6 @@ enum gen_msrast_mode {
 #define GEN8_HS_DW7_URB_READ_OFFSET__MASK			0x000003f0
 #define GEN8_HS_DW7_URB_READ_OFFSET__SHIFT			4
 
-#define GEN8_HS_DW8_URB_SEMAPHORE_ADDR__MASK			0x00001fff
-#define GEN8_HS_DW8_URB_SEMAPHORE_ADDR__SHIFT			0
-#define GEN8_HS_DW8_URB_SEMAPHORE_ADDR__SHR			6
 
 #define GEN7_3DSTATE_TE__SIZE					4
 
@@ -660,16 +751,19 @@ enum gen_msrast_mode {
 #define GEN8_DS_DW7_MAX_THREADS__MASK				0x3fe00000
 #define GEN8_DS_DW7_MAX_THREADS__SHIFT				21
 #define GEN8_DS_DW7_STATISTICS					(0x1 << 10)
+#define GEN8_DS_DW7_SIMD8_ENABLE				(0x1 << 3)
 #define GEN8_DS_DW7_COMPUTE_W					(0x1 << 2)
 #define GEN8_DS_DW7_CACHE_DISABLE				(0x1 << 1)
 #define GEN8_DS_DW7_DS_ENABLE					(0x1 << 0)
 
-#define GEN8_DS_DW8_URB_WRITE_OFFSET__MASK			0x03e00000
-#define GEN8_DS_DW8_URB_WRITE_OFFSET__SHIFT			21
-#define GEN8_DS_DW8_URB_WRITE_LEN__MASK				0x001f0000
-#define GEN8_DS_DW8_URB_WRITE_LEN__SHIFT			16
+#define GEN8_DS_DW8_VUE_OUT_READ_OFFSET__MASK			0x07e00000
+#define GEN8_DS_DW8_VUE_OUT_READ_OFFSET__SHIFT			21
+#define GEN8_DS_DW8_VUE_OUT_LEN__MASK				0x001f0000
+#define GEN8_DS_DW8_VUE_OUT_LEN__SHIFT				16
 #define GEN8_DS_DW8_UCP_CLIP_ENABLES__MASK			0x0000ff00
 #define GEN8_DS_DW8_UCP_CLIP_ENABLES__SHIFT			8
+#define GEN8_DS_DW8_UCP_CULL_ENABLES__MASK			0x000000ff
+#define GEN8_DS_DW8_UCP_CULL_ENABLES__SHIFT			0
 
 
 
@@ -771,7 +865,7 @@ enum gen_msrast_mode {
 #define GEN8_GS_DW1_KERNEL_ADDR__SHR				6
 
 
-#define GEN8_GS_DW3_EXPECTED_VERTEX_COUNT__MASK			0x0000007f
+#define GEN8_GS_DW3_EXPECTED_VERTEX_COUNT__MASK			0x0000003f
 #define GEN8_GS_DW3_EXPECTED_VERTEX_COUNT__SHIFT		0
 
 
@@ -815,18 +909,20 @@ enum gen_msrast_mode {
 #define GEN8_GS_DW8_GSCTRL__SHIFT				31
 #define GEN8_GS_DW8_GSCTRL_CUT					(0x0 << 31)
 #define GEN8_GS_DW8_GSCTRL_SID					(0x1 << 31)
-#define GEN8_GS_DW8_URB_SEMAPHORE_ADDR__MASK			0x00001fff
-#define GEN8_GS_DW8_URB_SEMAPHORE_ADDR__SHIFT			0
-#define GEN8_GS_DW8_URB_SEMAPHORE_ADDR__SHR			6
-#define GEN9_GS_DW8_MAX_THREADS__MASK				0x00001fff
+#define GEN8_GS_DW8_STATIC_OUTPUT				(0x1 << 30)
+#define GEN8_GS_DW8_STATIC_OUTPUT_VERTEX_COUNT__MASK		0x07ff0000
+#define GEN8_GS_DW8_STATIC_OUTPUT_VERTEX_COUNT__SHIFT		16
+#define GEN9_GS_DW8_MAX_THREADS__MASK				0x000001ff
 #define GEN9_GS_DW8_MAX_THREADS__SHIFT				0
 
-#define GEN8_GS_DW9_URB_WRITE_OFFSET__MASK			0x03e00000
-#define GEN8_GS_DW9_URB_WRITE_OFFSET__SHIFT			21
-#define GEN8_GS_DW9_URB_WRITE_LEN__MASK				0x001f0000
-#define GEN8_GS_DW9_URB_WRITE_LEN__SHIFT			16
+#define GEN8_GS_DW9_VUE_OUT_READ_OFFSET__MASK			0x07e00000
+#define GEN8_GS_DW9_VUE_OUT_READ_OFFSET__SHIFT			21
+#define GEN8_GS_DW9_VUE_OUT_LEN__MASK				0x001f0000
+#define GEN8_GS_DW9_VUE_OUT_LEN__SHIFT				16
 #define GEN8_GS_DW9_UCP_CLIP_ENABLES__MASK			0x0000ff00
 #define GEN8_GS_DW9_UCP_CLIP_ENABLES__SHIFT			8
+#define GEN8_GS_DW9_UCP_CULL_ENABLES__MASK			0x000000ff
+#define GEN8_GS_DW9_UCP_CULL_ENABLES__SHIFT			0
 
 #define GEN7_3DSTATE_STREAMOUT__SIZE				5
 
@@ -838,6 +934,11 @@ enum gen_msrast_mode {
 #define GEN7_SO_DW1_REORDER_MODE__MASK				0x04000000
 #define GEN7_SO_DW1_REORDER_MODE__SHIFT				26
 #define GEN7_SO_DW1_STATISTICS					(0x1 << 25)
+#define GEN8_SO_DW1_FORCE_RENDERING__MASK			0x01800000
+#define GEN8_SO_DW1_FORCE_RENDERING__SHIFT			23
+#define GEN8_SO_DW1_FORCE_RENDERING_NORMAL			(0x0 << 23)
+#define GEN8_SO_DW1_FORCE_RENDERING_OFF				(0x2 << 23)
+#define GEN8_SO_DW1_FORCE_RENDERING_ON				(0x3 << 23)
 #define GEN7_SO_DW1_BUFFER_ENABLES__MASK			0x00000f00
 #define GEN7_SO_DW1_BUFFER_ENABLES__SHIFT			8
 
@@ -928,9 +1029,9 @@ enum gen_msrast_mode {
 
 
 
-#define GEN8_SO_BUF_DW5_OFFSET_ADDR__MASK			0xfffffffc
-#define GEN8_SO_BUF_DW5_OFFSET_ADDR__SHIFT			2
-#define GEN8_SO_BUF_DW5_OFFSET_ADDR__SHR			2
+#define GEN8_SO_BUF_DW5_OFFSET_ADDR_ADDR__MASK			0xfffffffc
+#define GEN8_SO_BUF_DW5_OFFSET_ADDR_ADDR__SHIFT			2
+#define GEN8_SO_BUF_DW5_OFFSET_ADDR_ADDR__SHR			2
 
 
 
@@ -939,6 +1040,7 @@ enum gen_msrast_mode {
 
 #define GEN7_CLIP_DW1_FRONT_WINDING__MASK			0x00100000
 #define GEN7_CLIP_DW1_FRONT_WINDING__SHIFT			20
+#define GEN8_CLIP_DW1_FORCE_UCP_CULL_ENABLES			(0x1 << 20)
 #define GEN7_CLIP_DW1_SUBPIXEL__MASK				0x00080000
 #define GEN7_CLIP_DW1_SUBPIXEL__SHIFT				19
 #define GEN7_CLIP_DW1_SUBPIXEL_8BITS				(0x0 << 19)
@@ -946,6 +1048,8 @@ enum gen_msrast_mode {
 #define GEN7_CLIP_DW1_EARLY_CULL_ENABLE				(0x1 << 18)
 #define GEN7_CLIP_DW1_CULL_MODE__MASK				0x00030000
 #define GEN7_CLIP_DW1_CULL_MODE__SHIFT				16
+#define GEN8_CLIP_DW1_FORCE_UCP_CLIP_ENABLES			(0x1 << 17)
+#define GEN8_CLIP_DW1_FORCE_CLIP_MODE				(0x1 << 16)
 #define GEN6_CLIP_DW1_STATISTICS				(0x1 << 10)
 #define GEN6_CLIP_DW1_UCP_CULL_ENABLES__MASK			0x000000ff
 #define GEN6_CLIP_DW1_UCP_CULL_ENABLES__SHIFT			0
@@ -1026,6 +1130,7 @@ enum gen_msrast_mode {
 #define GEN7_SF_DW3_TRIFAN_PROVOKE__MASK			0x06000000
 #define GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT			25
 #define GEN7_SF_DW3_TRUE_AA_LINE_DISTANCE			(0x1 << 14)
+#define GEN8_SF_DW3_SMOOTH_POINT_ENABLE				(0x1 << 13)
 #define GEN7_SF_DW3_SUBPIXEL__MASK				0x00001000
 #define GEN7_SF_DW3_SUBPIXEL__SHIFT				12
 #define GEN7_SF_DW3_SUBPIXEL_8BITS				(0x0 << 12)
@@ -1037,8 +1142,8 @@ enum gen_msrast_mode {
 
 #define GEN7_3DSTATE_SBE_DW1__SIZE				13
 
-#define GEN8_SBE_DW1_USE_URB_READ_LEN				(0x1 << 29)
-#define GEN8_SBE_DW1_USE_URB_READ_OFFSET			(0x1 << 28)
+#define GEN8_SBE_DW1_FORCE_URB_READ_LEN				(0x1 << 29)
+#define GEN8_SBE_DW1_FORCE_URB_READ_OFFSET			(0x1 << 28)
 #define GEN7_SBE_DW1_ATTR_SWIZZLE__MASK				0x10000000
 #define GEN7_SBE_DW1_ATTR_SWIZZLE__SHIFT			28
 #define GEN7_SBE_DW1_ATTR_SWIZZLE_0_15				(0x0 << 28)
@@ -1050,21 +1155,28 @@ enum gen_msrast_mode {
 #define GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD__SHIFT		20
 #define GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_UPPERLEFT		(0x0 << 20)
 #define GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_LOWERLEFT		(0x1 << 20)
+#define GEN8_SBE_DW1_PID_OVERRIDE_W				(0x1 << 19)
+#define GEN8_SBE_DW1_PID_OVERRIDE_Z				(0x1 << 18)
+#define GEN8_SBE_DW1_PID_OVERRIDE_Y				(0x1 << 17)
+#define GEN8_SBE_DW1_PID_OVERRIDE_X				(0x1 << 16)
 #define GEN7_SBE_DW1_URB_READ_LEN__MASK				0x0000f800
 #define GEN7_SBE_DW1_URB_READ_LEN__SHIFT			11
 #define GEN7_SBE_DW1_URB_READ_OFFSET__MASK			0x000003f0
 #define GEN7_SBE_DW1_URB_READ_OFFSET__SHIFT			4
 #define GEN8_SBE_DW1_URB_READ_OFFSET__MASK			0x000007e0
 #define GEN8_SBE_DW1_URB_READ_OFFSET__SHIFT			5
+#define GEN8_SBE_DW1_PID_OVERRIDE_ATTR__MASK			0x0000001f
+#define GEN8_SBE_DW1_PID_OVERRIDE_ATTR__SHIFT			0
 
 #define GEN8_3DSTATE_SBE_SWIZ_DW1_DW8__SIZE			8
 
 #define GEN8_SBE_SWIZ_HIGH__MASK				0xffff0000
 #define GEN8_SBE_SWIZ_HIGH__SHIFT				16
-#define GEN8_SBE_SWIZ_OVERRIDE_W				(0x1 << 15)
-#define GEN8_SBE_SWIZ_OVERRIDE_Z				(0x1 << 14)
-#define GEN8_SBE_SWIZ_OVERRIDE_Y				(0x1 << 13)
-#define GEN8_SBE_SWIZ_OVERRIDE_X				(0x1 << 12)
+#define GEN8_SBE_SWIZ_CONST_OVERRIDE_W				(0x1 << 15)
+#define GEN8_SBE_SWIZ_CONST_OVERRIDE_Z				(0x1 << 14)
+#define GEN8_SBE_SWIZ_CONST_OVERRIDE_Y				(0x1 << 13)
+#define GEN8_SBE_SWIZ_CONST_OVERRIDE_X				(0x1 << 12)
+#define GEN8_SBE_SWIZ_SWIZZLE_CONTROL				(0x1 << 11)
 #define GEN8_SBE_SWIZ_CONST__MASK				0x00000600
 #define GEN8_SBE_SWIZ_CONST__SHIFT				9
 #define GEN8_SBE_SWIZ_CONST_0000				(0x0 << 9)
@@ -1126,12 +1238,28 @@ enum gen_msrast_mode {
 
 
 #define GEN9_RASTER_DW1_Z_TEST_FAR_ENABLE			(0x1 << 26)
+#define GEN8_RASTER_DW1_API__MASK				0x00c00000
+#define GEN8_RASTER_DW1_API__SHIFT				22
+#define GEN8_RASTER_DW1_API_DX9_OGL				(0x0 << 22)
+#define GEN8_RASTER_DW1_API_DX10				(0x1 << 22)
+#define GEN8_RASTER_DW1_API_DX10_1				(0x2 << 22)
 #define GEN8_RASTER_DW1_FRONT_WINDING__MASK			0x00200000
 #define GEN8_RASTER_DW1_FRONT_WINDING__SHIFT			21
+#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT__MASK		0x001c0000
+#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT__SHIFT		18
+#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_0	(0x0 << 18)
+#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_1	(0x1 << 18)
+#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_2	(0x2 << 18)
+#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_4	(0x3 << 18)
+#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_8	(0x4 << 18)
+#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_16	(0x5 << 18)
 #define GEN8_RASTER_DW1_CULL_MODE__MASK				0x00030000
 #define GEN8_RASTER_DW1_CULL_MODE__SHIFT			16
+#define GEN8_RASTER_DW1_FORCE_MULTISAMPLE_ENABLE		(0x1 << 14)
 #define GEN8_RASTER_DW1_SMOOTH_POINT_ENABLE			(0x1 << 13)
-#define GEN8_RASTER_DW1_API_MULTISAMPLE_ENABLE			(0x1 << 12)
+#define GEN8_RASTER_DW1_DX_MULTISAMPLE_ENABLE			(0x1 << 12)
+#define GEN8_RASTER_DW1_DX_MSRASTMODE__MASK			0x00000c00
+#define GEN8_RASTER_DW1_DX_MSRASTMODE__SHIFT			10
 #define GEN8_RASTER_DW1_DEPTH_OFFSET_SOLID			(0x1 << 9)
 #define GEN8_RASTER_DW1_DEPTH_OFFSET_WIREFRAME			(0x1 << 8)
 #define GEN8_RASTER_DW1_DEPTH_OFFSET_POINT			(0x1 << 7)
@@ -1223,10 +1351,10 @@ enum gen_msrast_mode {
 
 
 #define GEN7_WM_DW1_STATISTICS					(0x1 << 31)
-#define GEN7_WM_DW1_DEPTH_CLEAR					(0x1 << 30)
+#define GEN7_WM_DW1_LEGACY_DEPTH_CLEAR				(0x1 << 30)
 #define GEN7_WM_DW1_PS_DISPATCH_ENABLE				(0x1 << 29)
-#define GEN7_WM_DW1_DEPTH_RESOLVE				(0x1 << 28)
-#define GEN7_WM_DW1_HIZ_RESOLVE					(0x1 << 27)
+#define GEN7_WM_DW1_LEGACY_DEPTH_RESOLVE			(0x1 << 28)
+#define GEN7_WM_DW1_LEGACY_HIZ_RESOLVE				(0x1 << 27)
 #define GEN7_WM_DW1_LEGACY_LINE_RAST				(0x1 << 26)
 #define GEN7_WM_DW1_PS_KILL_PIXEL				(0x1 << 25)
 #define GEN7_WM_DW1_PSCDEPTH__MASK				0x01800000
@@ -1235,6 +1363,11 @@ enum gen_msrast_mode {
 #define GEN7_WM_DW1_EDSC__SHIFT					21
 #define GEN7_WM_DW1_PS_USE_DEPTH				(0x1 << 20)
 #define GEN7_WM_DW1_PS_USE_W					(0x1 << 19)
+#define GEN8_WM_DW1_FORCE_DISPATCH_ENABLE__MASK			0x00180000
+#define GEN8_WM_DW1_FORCE_DISPATCH_ENABLE__SHIFT		19
+#define GEN8_WM_DW1_FORCE_DISPATCH_ENABLE_NORMAL		(0x0 << 19)
+#define GEN8_WM_DW1_FORCE_DISPATCH_ENABLE_OFF			(0x1 << 19)
+#define GEN8_WM_DW1_FORCE_DISPATCH_ENABLE_ON			(0x2 << 19)
 #define GEN7_WM_DW1_ZW_INTERP__MASK				0x00060000
 #define GEN7_WM_DW1_ZW_INTERP__SHIFT				17
 #define GEN7_WM_DW1_BARYCENTRIC_INTERP__MASK			0x0001f800
@@ -1261,6 +1394,11 @@ enum gen_msrast_mode {
 #define GEN7_WM_DW1_POINT_RASTRULE_UPPER_RIGHT			(0x1 << 2)
 #define GEN7_WM_DW1_MSRASTMODE__MASK				0x00000003
 #define GEN7_WM_DW1_MSRASTMODE__SHIFT				0
+#define GEN8_WM_DW1_FORCE_KILL_PIXEL__MASK			0x00000003
+#define GEN8_WM_DW1_FORCE_KILL_PIXEL__SHIFT			0
+#define GEN8_WM_DW1_FORCE_KILL_PIXEL_NORMAL			0x0
+#define GEN8_WM_DW1_FORCE_KILL_PIXEL_OFF			0x1
+#define GEN8_WM_DW1_FORCE_KILL_PIXEL_ON				0x2
 
 #define GEN7_WM_DW2_MSDISPMODE__MASK				0x80000000
 #define GEN7_WM_DW2_MSDISPMODE__SHIFT				31
@@ -1271,6 +1409,7 @@ enum gen_msrast_mode {
 #define GEN8_3DSTATE_WM_CHROMAKEY__SIZE				2
 
 
+#define GEN8_CHROMAKEY_DW1_KILL_ENABLE				(0x1 << 31)
 
 #define GEN8_3DSTATE_WM_DEPTH_STENCIL__SIZE			4
 
@@ -1318,6 +1457,7 @@ enum gen_msrast_mode {
 
 #define GEN8_WM_HZ_DW1_STENCIL_CLEAR				(0x1 << 31)
 #define GEN8_WM_HZ_DW1_DEPTH_CLEAR				(0x1 << 30)
+#define GEN8_WM_HZ_DW1_SCISSOR_ENABLE				(0x1 << 29)
 #define GEN8_WM_HZ_DW1_DEPTH_RESOLVE				(0x1 << 28)
 #define GEN8_WM_HZ_DW1_HIZ_RESOLVE				(0x1 << 27)
 #define GEN8_WM_HZ_DW1_PIXEL_OFFSET_ENABLE			(0x1 << 26)
@@ -1443,17 +1583,17 @@ enum gen_msrast_mode {
 
 #define GEN8_PS_BLEND_DW1_ALPHA_TO_COVERAGE			(0x1 << 31)
 #define GEN8_PS_BLEND_DW1_WRITABLE_RT				(0x1 << 30)
-#define GEN8_PS_BLEND_DW1_BLEND_ENABLE				(0x1 << 29)
-#define GEN8_PS_BLEND_DW1_SRC_ALPHA_FACTOR__MASK		0x1f000000
-#define GEN8_PS_BLEND_DW1_SRC_ALPHA_FACTOR__SHIFT		24
-#define GEN8_PS_BLEND_DW1_DST_ALPHA_FACTOR__MASK		0x00f80000
-#define GEN8_PS_BLEND_DW1_DST_ALPHA_FACTOR__SHIFT		19
-#define GEN8_PS_BLEND_DW1_SRC_COLOR_FACTOR__MASK		0x0007c000
-#define GEN8_PS_BLEND_DW1_SRC_COLOR_FACTOR__SHIFT		14
-#define GEN8_PS_BLEND_DW1_DST_COLOR_FACTOR__MASK		0x00003e00
-#define GEN8_PS_BLEND_DW1_DST_COLOR_FACTOR__SHIFT		9
+#define GEN8_PS_BLEND_DW1_RT0_BLEND_ENABLE			(0x1 << 29)
+#define GEN8_PS_BLEND_DW1_RT0_SRC_ALPHA_FACTOR__MASK		0x1f000000
+#define GEN8_PS_BLEND_DW1_RT0_SRC_ALPHA_FACTOR__SHIFT		24
+#define GEN8_PS_BLEND_DW1_RT0_DST_ALPHA_FACTOR__MASK		0x00f80000
+#define GEN8_PS_BLEND_DW1_RT0_DST_ALPHA_FACTOR__SHIFT		19
+#define GEN8_PS_BLEND_DW1_RT0_SRC_COLOR_FACTOR__MASK		0x0007c000
+#define GEN8_PS_BLEND_DW1_RT0_SRC_COLOR_FACTOR__SHIFT		14
+#define GEN8_PS_BLEND_DW1_RT0_DST_COLOR_FACTOR__MASK		0x00003e00
+#define GEN8_PS_BLEND_DW1_RT0_DST_COLOR_FACTOR__SHIFT		9
 #define GEN8_PS_BLEND_DW1_ALPHA_TEST_ENABLE			(0x1 << 8)
-#define GEN8_PS_BLEND_DW1_INDEPENDENT_ALPHA_ENABLE		(0x1 << 7)
+#define GEN8_PS_BLEND_DW1_RT0_INDEPENDENT_ALPHA_ENABLE		(0x1 << 7)
 
 #define GEN6_3DSTATE_CONSTANT_ANY__SIZE				11
 
@@ -1469,6 +1609,8 @@ enum gen_msrast_mode {
 #define GEN6_CONSTANT_DW_ADDR_ADDR__SHR				5
 
 
+#define GEN8_CONSTANT_DW0_MOCS__MASK				0x00007f00
+#define GEN8_CONSTANT_DW0_MOCS__SHIFT				8
 
 #define GEN7_CONSTANT_DW1_BUFFER1_READ_LEN__MASK		0xffff0000
 #define GEN7_CONSTANT_DW1_BUFFER1_READ_LEN__SHIFT		16
@@ -1502,6 +1644,8 @@ enum gen_msrast_mode {
 
 #define GEN6_3DSTATE_DRAWING_RECTANGLE__SIZE			4
 
+#define GEN8_DRAWING_RECTANGLE_DW0_CORE_MODE_SELECT__MASK	0x0000c000
+#define GEN8_DRAWING_RECTANGLE_DW0_CORE_MODE_SELECT__SHIFT	14
 
 #define GEN6_DRAWING_RECTANGLE_DW1_MIN_Y__MASK			0xffff0000
 #define GEN6_DRAWING_RECTANGLE_DW1_MIN_Y__SHIFT			16
@@ -1624,15 +1768,12 @@ enum gen_msrast_mode {
 #define GEN8_DEPTH_DW5_MOCS__MASK				0x0000007f
 #define GEN8_DEPTH_DW5_MOCS__SHIFT				0
 
-#define GEN8_DEPTH_DW6_OFFSET_Y__MASK				0xffff0000
-#define GEN8_DEPTH_DW6_OFFSET_Y__SHIFT				16
-#define GEN8_DEPTH_DW6_OFFSET_X__MASK				0x0000ffff
-#define GEN8_DEPTH_DW6_OFFSET_X__SHIFT				0
 
 #define GEN8_DEPTH_DW7_RT_VIEW_EXTENT__MASK			0xffe00000
 #define GEN8_DEPTH_DW7_RT_VIEW_EXTENT__SHIFT			21
 #define GEN8_DEPTH_DW7_QPITCH__MASK				0x00007fff
 #define GEN8_DEPTH_DW7_QPITCH__SHIFT				0
+#define GEN8_DEPTH_DW7_QPITCH__SHR				2
 
 #define GEN6_3DSTATE_POLY_STIPPLE_OFFSET__SIZE			2
 
@@ -1649,6 +1790,11 @@ enum gen_msrast_mode {
 #define GEN6_3DSTATE_LINE_STIPPLE__SIZE				3
 
 
+#define GEN6_LINE_STIPPLE_DW1_CURRENT_MODIFY_ENABLE		(0x1 << 31)
+#define GEN6_LINE_STIPPLE_DW1_CURRENT_REPEAT_COUNTER__MASK	0x3fe00000
+#define GEN6_LINE_STIPPLE_DW1_CURRENT_REPEAT_COUNTER__SHIFT	21
+#define GEN6_LINE_STIPPLE_DW1_CURRENT_STIPPLE_INDEX__MASK	0x000f0000
+#define GEN6_LINE_STIPPLE_DW1_CURRENT_STIPPLE_INDEX__SHIFT	16
 #define GEN6_LINE_STIPPLE_DW1_PATTERN__MASK			0x0000ffff
 #define GEN6_LINE_STIPPLE_DW1_PATTERN__SHIFT			0
 
@@ -1664,16 +1810,28 @@ enum gen_msrast_mode {
 #define GEN6_3DSTATE_AA_LINE_PARAMETERS__SIZE			3
 
 
+#define GEN8_AA_LINE_DW1_POINT_BIAS__MASK			0xff000000
+#define GEN8_AA_LINE_DW1_POINT_BIAS__SHIFT			24
+#define GEN8_AA_LINE_DW1_POINT_BIAS__RADIX			8
 #define GEN6_AA_LINE_DW1_BIAS__MASK				0x00ff0000
 #define GEN6_AA_LINE_DW1_BIAS__SHIFT				16
 #define GEN6_AA_LINE_DW1_BIAS__RADIX				8
+#define GEN8_AA_LINE_DW1_POINT_SLOPE__MASK			0x0000ff00
+#define GEN8_AA_LINE_DW1_POINT_SLOPE__SHIFT			8
+#define GEN8_AA_LINE_DW1_POINT_SLOPE__RADIX			8
 #define GEN6_AA_LINE_DW1_SLOPE__MASK				0x000000ff
 #define GEN6_AA_LINE_DW1_SLOPE__SHIFT				0
 #define GEN6_AA_LINE_DW1_SLOPE__RADIX				8
 
+#define GEN8_AA_LINE_DW2_POINT_CAP_BIAS__MASK			0xff000000
+#define GEN8_AA_LINE_DW2_POINT_CAP_BIAS__SHIFT			24
+#define GEN8_AA_LINE_DW2_POINT_CAP_BIAS__RADIX			8
 #define GEN6_AA_LINE_DW2_CAP_BIAS__MASK				0x00ff0000
 #define GEN6_AA_LINE_DW2_CAP_BIAS__SHIFT			16
 #define GEN6_AA_LINE_DW2_CAP_BIAS__RADIX			8
+#define GEN8_AA_LINE_DW2_POINT_CAP_SLOPE__MASK			0x0000ff00
+#define GEN8_AA_LINE_DW2_POINT_CAP_SLOPE__SHIFT			8
+#define GEN8_AA_LINE_DW2_POINT_CAP_SLOPE__RADIX			8
 #define GEN6_AA_LINE_DW2_CAP_SLOPE__MASK			0x000000ff
 #define GEN6_AA_LINE_DW2_CAP_SLOPE__SHIFT			0
 #define GEN6_AA_LINE_DW2_CAP_SLOPE__RADIX			8
@@ -1690,7 +1848,7 @@ enum gen_msrast_mode {
 #define GEN6_3DSTATE_MULTISAMPLE__SIZE				4
 
 
-#define GEN75_MULTISAMPLE_DW1_DX9_MULTISAMPLE_ENABLE		(0x1 << 5)
+#define GEN75_MULTISAMPLE_DW1_PIXEL_OFFSET_ENABLE		(0x1 << 5)
 #define GEN6_MULTISAMPLE_DW1_PIXEL_LOCATION__MASK		0x00000010
 #define GEN6_MULTISAMPLE_DW1_PIXEL_LOCATION__SHIFT		4
 #define GEN6_MULTISAMPLE_DW1_NUM_SAMPLES__MASK			0x0000000e
@@ -1724,6 +1882,7 @@ enum gen_msrast_mode {
 
 #define GEN8_STENCIL_DW4_QPITCH__MASK				0x00007fff
 #define GEN8_STENCIL_DW4_QPITCH__SHIFT				0
+#define GEN8_STENCIL_DW4_QPITCH__SHR				2
 
 #define GEN6_3DSTATE_HIER_DEPTH_BUFFER__SIZE			5
 
@@ -1739,6 +1898,7 @@ enum gen_msrast_mode {
 
 #define GEN8_HIZ_DW4_QPITCH__MASK				0x00007fff
 #define GEN8_HIZ_DW4_QPITCH__SHIFT				0
+#define GEN8_HIZ_DW4_QPITCH__SHR				2
 
 #define GEN6_3DSTATE_CLEAR_PARAMS__SIZE				3
 
diff --git a/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h
index b65b704adc6..b2c2142af78 100644
--- a/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h
@@ -430,8 +430,10 @@ enum gen_key_filter {
 #define GEN7_SAMPLER_DW0_BORDER_COLOR_MODE_DX9			(0x1 << 29)
 #define GEN6_SAMPLER_DW0_LOD_PRECLAMP_ENABLE			(0x1 << 28)
 #define GEN6_SAMPLER_DW0_MIN_MAG_NOT_EQUAL			(0x1 << 27)
-#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_ENABLE__MASK		0x18000000
-#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_ENABLE__SHIFT		27
+#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_MODE__MASK		0x18000000
+#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_MODE__SHIFT		27
+#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_MODE_NONE			(0x0 << 27)
+#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_MODE_OGL			(0x2 << 27)
 #define GEN6_SAMPLER_DW0_BASE_LOD__MASK				0x07c00000
 #define GEN6_SAMPLER_DW0_BASE_LOD__SHIFT			22
 #define GEN6_SAMPLER_DW0_BASE_LOD__RADIX			1
@@ -493,23 +495,11 @@ enum gen_key_filter {
 #define GEN6_SAMPLER_DW2_BORDER_COLOR_ADDR__SHIFT		5
 #define GEN6_SAMPLER_DW2_BORDER_COLOR_ADDR__SHR			5
 
-#define GEN8_SAMPLER_DW2_SEP_FILTER_COEFF_TABLE_SIZE__MASK	0xc0000000
-#define GEN8_SAMPLER_DW2_SEP_FILTER_COEFF_TABLE_SIZE__SHIFT	30
-#define GEN8_SAMPLER_DW2_SEP_FILTER_WIDTH__MASK			0x30000000
-#define GEN8_SAMPLER_DW2_SEP_FILTER_WIDTH__SHIFT		28
-#define GEN8_SAMPLER_DW2_SEP_FILTER_HEIGHT__MASK		0x0c000000
-#define GEN8_SAMPLER_DW2_SEP_FILTER_HEIGHT__SHIFT		26
 #define GEN8_SAMPLER_DW2_INDIRECT_STATE_ADDR__MASK		0x00ffffc0
 #define GEN8_SAMPLER_DW2_INDIRECT_STATE_ADDR__SHIFT		6
 #define GEN8_SAMPLER_DW2_INDIRECT_STATE_ADDR__SHR		6
-#define GEN8_SAMPLER_DW2_FLEXIBLE_FILTER_MODE			(0x1 << 4)
-#define GEN8_SAMPLER_DW2_FLEXIBLE_FILTER_COEFF_SIZE		(0x1 << 3)
-#define GEN8_SAMPLER_DW2_FLEXIBLE_FILTER_HALIGN			(0x1 << 2)
-#define GEN8_SAMPLER_DW2_FLEXIBLE_FILTER_VALIGN			(0x1 << 1)
 #define GEN8_SAMPLER_DW2_LOD_CLAMP_MAG_MODE			(0x1 << 0)
 
-#define GEN8_SAMPLER_DW3_NON_SEP_FILTER_FOOTPRINT_MASK__MASK	0xff000000
-#define GEN8_SAMPLER_DW3_NON_SEP_FILTER_FOOTPRINT_MASK__SHIFT	24
 #define GEN6_SAMPLER_DW3_CHROMAKEY_ENABLE			(0x1 << 25)
 #define GEN6_SAMPLER_DW3_CHROMAKEY_INDEX__MASK			0x01800000
 #define GEN6_SAMPLER_DW3_CHROMAKEY_INDEX__SHIFT			23
diff --git a/src/gallium/drivers/ilo/genhw/gen_render_media.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_media.xml.h
index 55d830bad32..2476002ec91 100644
--- a/src/gallium/drivers/ilo/genhw/gen_render_media.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_render_media.xml.h
@@ -111,6 +111,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define GEN8_IDRT_DW5_CURBE_READ_LEN__MASK			0xffff0000
 #define GEN8_IDRT_DW5_CURBE_READ_LEN__SHIFT			16
+#define GEN8_IDRT_DW5_CURBE_READ_OFFSET__MASK			0x0000ffff
+#define GEN8_IDRT_DW5_CURBE_READ_OFFSET__SHIFT			0
 
 #define GEN8_IDRT_DW6_ROUNDING_MODE__MASK			0x00c00000
 #define GEN8_IDRT_DW6_ROUNDING_MODE__SHIFT			22
@@ -121,7 +123,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define GEN8_IDRT_DW6_BARRIER_ENABLE				(0x1 << 21)
 #define GEN8_IDRT_DW6_SLM_SIZE__MASK				0x001f0000
 #define GEN8_IDRT_DW6_SLM_SIZE__SHIFT				16
-#define GEN8_IDRT_DW6_THREAD_GROUP_SIZE__MASK			0x000000ff
+#define GEN8_IDRT_DW6_THREAD_GROUP_SIZE__MASK			0x000003ff
 #define GEN8_IDRT_DW6_THREAD_GROUP_SIZE__SHIFT			0
 
 #define GEN8_IDRT_DW7_CROSS_THREAD_CURBE_READ_LEN__MASK		0x000000ff
@@ -280,6 +282,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define GEN8_GPGPU_DW1_IDRT_OFFSET__MASK			0x0000003f
 #define GEN8_GPGPU_DW1_IDRT_OFFSET__SHIFT			0
 
+#define GEN8_GPGPU_DW2_INDIRECT_LEN__MASK			0x0001ffff
+#define GEN8_GPGPU_DW2_INDIRECT_LEN__SHIFT			0
 
 #define GEN8_GPGPU_DW3_INDIRECT_ADDR__MASK			0xffffffe0
 #define GEN8_GPGPU_DW3_INDIRECT_ADDR__SHIFT			5
diff --git a/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h
index b5d09f64429..c180450ce27 100644
--- a/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h
@@ -388,7 +388,7 @@ enum gen_surface_scs {
 #define GEN8_SURFACE_DW0_TILING__SHIFT				12
 #define GEN8_SURFACE_DW0_VSTRIDE				(0x1 << 11)
 #define GEN8_SURFACE_DW0_VSTRIDE_OFFSET				(0x1 << 10)
-#define GEN8_SURFACE_DW0_SAMPLER_L2_BYPASS_MODE			(0x1 << 9)
+#define GEN8_SURFACE_DW0_SAMPLER_L2_BYPASS_DISABLE		(0x1 << 9)
 #define GEN7_SURFACE_DW0_RENDER_CACHE_RW			(0x1 << 8)
 #define GEN7_SURFACE_DW0_MEDIA_BOUNDARY_PIXEL_MODE__MASK	0x000000c0
 #define GEN7_SURFACE_DW0_MEDIA_BOUNDARY_PIXEL_MODE__SHIFT	6
@@ -402,6 +402,7 @@ enum gen_surface_scs {
 #define GEN8_SURFACE_DW1_BASE_LOD__SHIFT			19
 #define GEN8_SURFACE_DW1_QPITCH__MASK				0x00007fff
 #define GEN8_SURFACE_DW1_QPITCH__SHIFT				0
+#define GEN8_SURFACE_DW1_QPITCH__SHR				2
 
 #define GEN7_SURFACE_DW2_HEIGHT__MASK				0x3fff0000
 #define GEN7_SURFACE_DW2_HEIGHT__SHIFT				16
@@ -434,7 +435,6 @@ enum gen_surface_scs {
 #define GEN8_SURFACE_DW4_MULTISAMPLECOUNT_2			(0x1 << 3)
 #define GEN7_SURFACE_DW4_MULTISAMPLECOUNT_4			(0x2 << 3)
 #define GEN7_SURFACE_DW4_MULTISAMPLECOUNT_8			(0x3 << 3)
-#define GEN8_SURFACE_DW4_MULTISAMPLECOUNT_16			(0x4 << 3)
 #define GEN7_SURFACE_DW4_MSPOS_INDEX__MASK			0x00000007
 #define GEN7_SURFACE_DW4_MSPOS_INDEX__SHIFT			0
 #define GEN7_SURFACE_DW4_MIN_ARRAY_ELEMENT_STRBUF__MASK		0x07ffffff
@@ -451,8 +451,11 @@ enum gen_surface_scs {
 #define GEN8_SURFACE_DW5_Y_OFFSET__MASK				0x00e00000
 #define GEN8_SURFACE_DW5_Y_OFFSET__SHIFT			21
 #define GEN8_SURFACE_DW5_Y_OFFSET__SHR				1
-#define GEN8_SURFACE_DW5_CUBE_EWA				(0x1 << 20)
-#define GEN8_SURFACE_DW5_COHERENCY_TYPE				(0x1 << 14)
+#define GEN8_SURFACE_DW5_CUBE_EWA_DISABLE			(0x1 << 20)
+#define GEN8_SURFACE_DW5_COHERENCY_TYPE__MASK			0x00004000
+#define GEN8_SURFACE_DW5_COHERENCY_TYPE__SHIFT			14
+#define GEN8_SURFACE_DW5_COHERENCY_TYPE_GPU			(0x0 << 14)
+#define GEN8_SURFACE_DW5_COHERENCY_TYPE_IA			(0x1 << 14)
 #define GEN7_SURFACE_DW5_MIN_LOD__MASK				0x000000f0
 #define GEN7_SURFACE_DW5_MIN_LOD__SHIFT				4
 #define GEN7_SURFACE_DW5_MIP_COUNT_LOD__MASK			0x0000000f
@@ -463,22 +466,23 @@ enum gen_surface_scs {
 #define GEN7_SURFACE_DW6_UV_X_OFFSET__SHIFT			16
 #define GEN7_SURFACE_DW6_UV_Y_OFFSET__MASK			0x00003fff
 #define GEN7_SURFACE_DW6_UV_Y_OFFSET__SHIFT			0
+#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__MASK		0xffffffc0
+#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__SHIFT		6
+#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__SHR		6
 #define GEN7_SURFACE_DW6_MCS_ADDR__MASK				0xfffff000
 #define GEN7_SURFACE_DW6_MCS_ADDR__SHIFT			12
 #define GEN7_SURFACE_DW6_MCS_ADDR__SHR				12
 #define GEN8_SURFACE_DW6_AUX_QPITCH__MASK			0x7fff0000
 #define GEN8_SURFACE_DW6_AUX_QPITCH__SHIFT			16
+#define GEN8_SURFACE_DW6_AUX_QPITCH__SHR			2
 #define GEN7_SURFACE_DW6_AUX_PITCH__MASK			0x00000ff8
 #define GEN7_SURFACE_DW6_AUX_PITCH__SHIFT			3
-#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__MASK		0xffffffc0
-#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__SHIFT		6
-#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__SHR		6
-#define GEN7_SURFACE_DW6_AUX_MODE__MASK				0x00000007
-#define GEN7_SURFACE_DW6_AUX_MODE__SHIFT			0
-#define GEN7_SURFACE_DW6_AUX_MODE_NONE				0x0
-#define GEN7_SURFACE_DW6_AUX_MODE_MCS				0x1
-#define GEN7_SURFACE_DW6_AUX_MODE_APPEND			0x2
-#define GEN8_SURFACE_DW6_AUX_MODE_HIZ				0x3
+#define GEN7_SURFACE_DW6_AUX__MASK				0x00000007
+#define GEN7_SURFACE_DW6_AUX__SHIFT				0
+#define GEN7_SURFACE_DW6_AUX_NONE				0x0
+#define GEN7_SURFACE_DW6_AUX_MCS				0x1
+#define GEN7_SURFACE_DW6_AUX_APPEND				0x2
+#define GEN8_SURFACE_DW6_AUX_HIZ				0x3
 
 #define GEN7_SURFACE_DW7_CC_R__MASK				0x80000000
 #define GEN7_SURFACE_DW7_CC_R__SHIFT				31
@@ -504,6 +508,12 @@ enum gen_surface_scs {
 
 
 
+#define GEN8_SURFACE_DW11_V_X_OFFSET__MASK			0x3fff0000
+#define GEN8_SURFACE_DW11_V_X_OFFSET__SHIFT			16
+#define GEN8_SURFACE_DW11_V_Y_OFFSET__MASK			0x00003fff
+#define GEN8_SURFACE_DW11_V_Y_OFFSET__SHIFT			0
+#define GEN8_SURFACE_DW11_AUX_ADDR_HI__MASK			0xffffffff
+#define GEN8_SURFACE_DW11_AUX_ADDR_HI__SHIFT			0
 
 
 
diff --git a/src/gallium/drivers/ilo/ilo_common.h b/src/gallium/drivers/ilo/ilo_common.h
index 3dbe79fb872..d3016590551 100644
--- a/src/gallium/drivers/ilo/ilo_common.h
+++ b/src/gallium/drivers/ilo/ilo_common.h
@@ -34,6 +34,7 @@
 #include "util/list.h"
 #include "util/u_format.h"
 #include "util/u_inlines.h"
+#include "util/u_memory.h"
 #include "util/u_pointer.h"
 
 #include "core/ilo_core.h"
diff --git a/src/gallium/drivers/ilo/ilo_shader.c b/src/gallium/drivers/ilo/ilo_shader.c
index 5f2b01017e2..73b625e9de4 100644
--- a/src/gallium/drivers/ilo/ilo_shader.c
+++ b/src/gallium/drivers/ilo/ilo_shader.c
@@ -987,15 +987,6 @@ ilo_shader_destroy(struct ilo_shader_state *shader)
 }
 
 /**
- * Return the type (PIPE_SHADER_x) of the shader.
- */
-int
-ilo_shader_get_type(const struct ilo_shader_state *shader)
-{
-   return shader->info.type;
-}
-
-/**
  * Select a kernel for the given context.  This will compile a new kernel if
  * none of the existing kernels work with the context.
  *
@@ -1257,9 +1248,6 @@ ilo_shader_get_kernel_param(const struct ilo_shader_state *shader,
    case ILO_KERNEL_SAMPLER_COUNT:
       val = shader->info.num_samplers;
       break;
-   case ILO_KERNEL_URB_DATA_START_REG:
-      val = kernel->in.start_grf;
-      break;
    case ILO_KERNEL_SKIP_CBUF0_UPLOAD:
       val = kernel->skip_cbuf0_upload;
       break;
@@ -1311,9 +1299,6 @@ ilo_shader_get_kernel_param(const struct ilo_shader_state *shader,
    case ILO_KERNEL_VS_GEN6_SO:
       val = kernel->stream_output;
       break;
-   case ILO_KERNEL_VS_GEN6_SO_START_REG:
-      val = kernel->gs_start_grf;
-      break;
    case ILO_KERNEL_VS_GEN6_SO_POINT_OFFSET:
       val = kernel->gs_offsets[0];
       break;
@@ -1340,16 +1325,6 @@ ilo_shader_get_kernel_param(const struct ilo_shader_state *shader,
       val = kernel->bt.gen6_so_count;
       break;
 
-   case ILO_KERNEL_FS_INPUT_Z:
-   case ILO_KERNEL_FS_INPUT_W:
-      val = kernel->in.has_pos;
-      break;
-   case ILO_KERNEL_FS_OUTPUT_Z:
-      val = kernel->out.has_pos;
-      break;
-   case ILO_KERNEL_FS_USE_KILL:
-      val = kernel->has_kill;
-      break;
    case ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS:
       val = kernel->in.barycentric_interpolation_mode;
       break;
diff --git a/src/gallium/drivers/ilo/ilo_shader.h b/src/gallium/drivers/ilo/ilo_shader.h
index d9f02a4746a..01de54146b1 100644
--- a/src/gallium/drivers/ilo/ilo_shader.h
+++ b/src/gallium/drivers/ilo/ilo_shader.h
@@ -36,7 +36,6 @@ enum ilo_kernel_param {
    ILO_KERNEL_INPUT_COUNT,
    ILO_KERNEL_OUTPUT_COUNT,
    ILO_KERNEL_SAMPLER_COUNT,
-   ILO_KERNEL_URB_DATA_START_REG,
    ILO_KERNEL_SKIP_CBUF0_UPLOAD,
    ILO_KERNEL_PCB_CBUF0_SIZE,
 
@@ -53,7 +52,6 @@ enum ilo_kernel_param {
    ILO_KERNEL_VS_INPUT_EDGEFLAG,
    ILO_KERNEL_VS_PCB_UCP_SIZE,
    ILO_KERNEL_VS_GEN6_SO,
-   ILO_KERNEL_VS_GEN6_SO_START_REG,
    ILO_KERNEL_VS_GEN6_SO_POINT_OFFSET,
    ILO_KERNEL_VS_GEN6_SO_LINE_OFFSET,
    ILO_KERNEL_VS_GEN6_SO_TRI_OFFSET,
@@ -64,10 +62,6 @@ enum ilo_kernel_param {
    ILO_KERNEL_GS_GEN6_SURFACE_SO_BASE,
    ILO_KERNEL_GS_GEN6_SURFACE_SO_COUNT,
 
-   ILO_KERNEL_FS_INPUT_Z,
-   ILO_KERNEL_FS_INPUT_W,
-   ILO_KERNEL_FS_OUTPUT_Z,
-   ILO_KERNEL_FS_USE_KILL,
    ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS,
    ILO_KERNEL_FS_DISPATCH_16_OFFSET,
    ILO_KERNEL_FS_SURFACE_RT_BASE,
@@ -149,9 +143,6 @@ ilo_shader_create_cs(const struct ilo_dev *dev,
 void
 ilo_shader_destroy(struct ilo_shader_state *shader);
 
-int
-ilo_shader_get_type(const struct ilo_shader_state *shader);
-
 bool
 ilo_shader_select_kernel(struct ilo_shader_state *shader,
                          const struct ilo_state_vector *vec,
diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources
index 9346ea3204d..c18e9f5b435 100644
--- a/src/gallium/drivers/nouveau/Makefile.sources
+++ b/src/gallium/drivers/nouveau/Makefile.sources
@@ -151,6 +151,15 @@ NVC0_C_SOURCES := \
 	nvc0/nvc0_program.c \
 	nvc0/nvc0_program.h \
 	nvc0/nvc0_query.c \
+	nvc0/nvc0_query.h \
+	nvc0/nvc0_query_hw.c \
+	nvc0/nvc0_query_hw.h \
+	nvc0/nvc0_query_hw_metric.c \
+	nvc0/nvc0_query_hw_metric.h \
+	nvc0/nvc0_query_hw_sm.c \
+	nvc0/nvc0_query_hw_sm.h \
+	nvc0/nvc0_query_sw.c \
+	nvc0/nvc0_query_sw.h \
 	nvc0/nvc0_resource.c \
 	nvc0/nvc0_resource.h \
 	nvc0/nvc0_screen.c \
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index 400b9f09e51..7859c8e79bd 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -25,10 +25,24 @@
 
 #include <stack>
 #include <limits>
+#if __cplusplus >= 201103L
+#include <unordered_map>
+#else
 #include <tr1/unordered_map>
+#endif
 
 namespace nv50_ir {
 
+#if __cplusplus >= 201103L
+using std::hash;
+using std::unordered_map;
+#elif !defined(ANDROID)
+using std::tr1::hash;
+using std::tr1::unordered_map;
+#else
+#error Android release before Lollipop is not supported!
+#endif
+
 #define MAX_REGISTER_FILE_SIZE 256
 
 class RegisterSet
@@ -349,12 +363,12 @@ RegAlloc::PhiMovesPass::needNewElseBlock(BasicBlock *b, BasicBlock *p)
 
 struct PhiMapHash {
    size_t operator()(const std::pair<Instruction *, BasicBlock *>& val) const {
-      return std::tr1::hash<Instruction*>()(val.first) * 31 +
-         std::tr1::hash<BasicBlock*>()(val.second);
+      return hash<Instruction*>()(val.first) * 31 +
+         hash<BasicBlock*>()(val.second);
    }
 };
 
-typedef std::tr1::unordered_map<
+typedef unordered_map<
    std::pair<Instruction *, BasicBlock *>, Value *, PhiMapHash> PhiMap;
 
 // Critical edges need to be split up so that work can be inserted along
diff --git a/src/gallium/drivers/nouveau/nouveau_fence.c b/src/gallium/drivers/nouveau/nouveau_fence.c
index ee4e08dd520..21cf2b9ae5e 100644
--- a/src/gallium/drivers/nouveau/nouveau_fence.c
+++ b/src/gallium/drivers/nouveau/nouveau_fence.c
@@ -190,8 +190,14 @@ nouveau_fence_wait(struct nouveau_fence *fence)
    /* wtf, someone is waiting on a fence in flush_notify handler? */
    assert(fence->state != NOUVEAU_FENCE_STATE_EMITTING);
 
-   if (fence->state < NOUVEAU_FENCE_STATE_EMITTED)
-      nouveau_fence_emit(fence);
+   if (fence->state < NOUVEAU_FENCE_STATE_EMITTED) {
+      PUSH_SPACE(screen->pushbuf, 8);
+      /* The space allocation might trigger a flush, which could emit the
+       * current fence. So check again.
+       */
+      if (fence->state < NOUVEAU_FENCE_STATE_EMITTED)
+         nouveau_fence_emit(fence);
+   }
 
    if (fence->state < NOUVEAU_FENCE_STATE_FLUSHED)
       if (nouveau_pushbuf_kick(screen->pushbuf, screen->pushbuf->channel))
@@ -224,8 +230,12 @@ nouveau_fence_wait(struct nouveau_fence *fence)
 void
 nouveau_fence_next(struct nouveau_screen *screen)
 {
-   if (screen->fence.current->state < NOUVEAU_FENCE_STATE_EMITTING)
-      nouveau_fence_emit(screen->fence.current);
+   if (screen->fence.current->state < NOUVEAU_FENCE_STATE_EMITTING) {
+      if (screen->fence.current->ref > 1)
+         nouveau_fence_emit(screen->fence.current);
+      else
+         return;
+   }
 
    nouveau_fence_ref(NULL, &screen->fence.current);
 
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
index 5757eb1fb16..dbbb8baad79 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
@@ -1,3 +1,4 @@
+#include <strings.h>
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
index fdde11f4cd5..941555ffbf8 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -65,14 +65,9 @@ nv50_constbufs_validate(struct nv50_context *nv50)
                PUSH_DATA (push, (b << 12) | (i << 8) | p | 1);
             }
             while (words) {
-               unsigned nr;
-
-               if (!PUSH_SPACE(push, 16))
-                  break;
-               nr = PUSH_AVAIL(push);
-               assert(nr >= 16);
-               nr = MIN2(MIN2(nr - 3, words), NV04_PFIFO_MAX_PACKET_LEN);
+               unsigned nr = MIN2(words, NV04_PFIFO_MAX_PACKET_LEN);
 
+               PUSH_SPACE(push, nr + 3);
                BEGIN_NV04(push, NV50_3D(CB_ADDR), 1);
                PUSH_DATA (push, (start << 8) | b);
                BEGIN_NI04(push, NV50_3D(CB_DATA(0)), nr);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_transfer.c b/src/gallium/drivers/nouveau/nv50/nv50_transfer.c
index be514077d32..9a3fd1e705f 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_transfer.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_transfer.c
@@ -187,14 +187,7 @@ nv50_sifc_linear_u8(struct nouveau_context *nv,
    PUSH_DATA (push, 0);
 
    while (count) {
-      unsigned nr;
-
-      if (!PUSH_SPACE(push, 16))
-         break;
-      nr = PUSH_AVAIL(push);
-      assert(nr >= 16);
-      nr = MIN2(count, nr - 1);
-      nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN);
+      unsigned nr = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN);
 
       BEGIN_NI04(push, NV50_2D(SIFC_DATA), nr);
       PUSH_DATAp(push, src, nr);
@@ -395,12 +388,9 @@ nv50_cb_push(struct nouveau_context *nv,
    nouveau_pushbuf_validate(push);
 
    while (words) {
-      unsigned nr;
-
-      nr = PUSH_AVAIL(push);
-      nr = MIN2(nr - 7, words);
-      nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN - 1);
+      unsigned nr = MIN2(words, NV04_PFIFO_MAX_PACKET_LEN);
 
+      PUSH_SPACE(push, nr + 7);
       BEGIN_NV04(push, NV50_3D(CB_DEF_ADDRESS_HIGH), 3);
       PUSH_DATAh(push, bo->offset + base);
       PUSH_DATA (push, bo->offset + base);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
index 47bd123621b..e33af042620 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
@@ -37,12 +37,9 @@ nvc0_screen_compute_setup(struct nvc0_screen *screen,
 
    switch (dev->chipset & ~0xf) {
    case 0xc0:
-      if (dev->chipset == 0xc8)
-         obj_class = NVC8_COMPUTE_CLASS;
-      else
-         obj_class = NVC0_COMPUTE_CLASS;
-      break;
    case 0xd0:
+      /* In theory, GF110+ should also support NVC8_COMPUTE_CLASS, but in
+       * practice using it triggers an ILLEGAL_CLASS error in dmesg. */
       obj_class = NVC0_COMPUTE_CLASS;
       break;
    default:
@@ -108,14 +105,6 @@ nvc0_screen_compute_setup(struct nvc0_screen *screen,
    PUSH_DATAh(push, screen->text->offset);
    PUSH_DATA (push, screen->text->offset);
 
-   /* bind parameters buffer */
-   BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3);
-   PUSH_DATA (push, screen->parm->size);
-   PUSH_DATAh(push, screen->parm->offset);
-   PUSH_DATA (push, screen->parm->offset);
-   BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1);
-   PUSH_DATA (push, (0 << 8) | 1);
-
    /* TODO: textures & samplers */
 
    return 0;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 30bee3a0f8c..4af83c53224 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -15,6 +15,7 @@
 #include "nvc0/nvc0_screen.h"
 #include "nvc0/nvc0_program.h"
 #include "nvc0/nvc0_resource.h"
+#include "nvc0/nvc0_query.h"
 
 #include "nv50/nv50_transfer.h"
 
@@ -231,17 +232,6 @@ uint32_t nvc0_program_symbol_offset(const struct nvc0_program *,
                                     uint32_t label);
 void nvc0_program_init_tcp_empty(struct nvc0_context *);
 
-/* nvc0_query.c */
-void nvc0_init_query_functions(struct nvc0_context *);
-void nvc0_query_pushbuf_submit(struct nouveau_pushbuf *,
-                               struct pipe_query *, unsigned result_offset);
-void nvc0_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *);
-void nvc0_so_target_save_offset(struct pipe_context *,
-                                struct pipe_stream_output_target *, unsigned i,
-                                bool *serialize);
-
-#define NVC0_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
-
 /* nvc0_shader_state.c */
 void nvc0_vertprog_validate(struct nvc0_context *);
 void nvc0_tctlprog_validate(struct nvc0_context *);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
index b13df6a9485..e4752e2dbc5 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -25,519 +25,51 @@
 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
 
 #include "nvc0/nvc0_context.h"
-#include "nv_object.xml.h"
-#include "nvc0/nve4_compute.xml.h"
-#include "nvc0/nvc0_compute.xml.h"
-
-#define NVC0_QUERY_STATE_READY   0
-#define NVC0_QUERY_STATE_ACTIVE  1
-#define NVC0_QUERY_STATE_ENDED   2
-#define NVC0_QUERY_STATE_FLUSHED 3
-
-struct nvc0_query {
-   uint32_t *data;
-   uint16_t type;
-   uint16_t index;
-   int8_t ctr[4];
-   uint32_t sequence;
-   struct nouveau_bo *bo;
-   uint32_t base;
-   uint32_t offset; /* base + i * rotate */
-   uint8_t state;
-   bool is64bit;
-   uint8_t rotate;
-   int nesting; /* only used for occlusion queries */
-   union {
-      struct nouveau_mm_allocation *mm;
-      uint64_t value;
-   } u;
-   struct nouveau_fence *fence;
-};
-
-#define NVC0_QUERY_ALLOC_SPACE 256
-
-static boolean nvc0_hw_sm_query_begin(struct nvc0_context *,
-                                      struct nvc0_query *);
-static void nvc0_hw_sm_query_end(struct nvc0_context *, struct nvc0_query *);
-static boolean nvc0_hw_sm_query_result(struct nvc0_context *,
-                                       struct nvc0_query *, void *, boolean);
-
-static inline struct nvc0_query *
-nvc0_query(struct pipe_query *pipe)
-{
-   return (struct nvc0_query *)pipe;
-}
-
-static bool
-nvc0_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, int size)
-{
-   struct nvc0_screen *screen = nvc0->screen;
-   int ret;
-
-   if (q->bo) {
-      nouveau_bo_ref(NULL, &q->bo);
-      if (q->u.mm) {
-         if (q->state == NVC0_QUERY_STATE_READY)
-            nouveau_mm_free(q->u.mm);
-         else
-            nouveau_fence_work(screen->base.fence.current,
-                               nouveau_mm_free_work, q->u.mm);
-      }
-   }
-   if (size) {
-      q->u.mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base);
-      if (!q->bo)
-         return false;
-      q->offset = q->base;
-
-      ret = nouveau_bo_map(q->bo, 0, screen->base.client);
-      if (ret) {
-         nvc0_query_allocate(nvc0, q, 0);
-         return false;
-      }
-      q->data = (uint32_t *)((uint8_t *)q->bo->map + q->base);
-   }
-   return true;
-}
-
-static void
-nvc0_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
-{
-   nvc0_query_allocate(nvc0_context(pipe), nvc0_query(pq), 0);
-   nouveau_fence_ref(NULL, &nvc0_query(pq)->fence);
-   FREE(nvc0_query(pq));
-}
+#include "nvc0/nvc0_query.h"
+#include "nvc0/nvc0_query_sw.h"
+#include "nvc0/nvc0_query_hw.h"
+#include "nvc0/nvc0_query_hw_sm.h"
 
 static struct pipe_query *
-nvc0_query_create(struct pipe_context *pipe, unsigned type, unsigned index)
+nvc0_create_query(struct pipe_context *pipe, unsigned type, unsigned index)
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    struct nvc0_query *q;
-   unsigned space = NVC0_QUERY_ALLOC_SPACE;
 
-   q = CALLOC_STRUCT(nvc0_query);
+   q = nvc0_sw_create_query(nvc0, type, index);
    if (!q)
-      return NULL;
-
-   switch (type) {
-   case PIPE_QUERY_OCCLUSION_COUNTER:
-   case PIPE_QUERY_OCCLUSION_PREDICATE:
-      q->rotate = 32;
-      space = NVC0_QUERY_ALLOC_SPACE;
-      break;
-   case PIPE_QUERY_PIPELINE_STATISTICS:
-      q->is64bit = true;
-      space = 512;
-      break;
-   case PIPE_QUERY_SO_STATISTICS:
-   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-      q->is64bit = true;
-      space = 64;
-      break;
-   case PIPE_QUERY_PRIMITIVES_GENERATED:
-   case PIPE_QUERY_PRIMITIVES_EMITTED:
-      q->is64bit = true;
-      q->index = index;
-      space = 32;
-      break;
-   case PIPE_QUERY_TIME_ELAPSED:
-   case PIPE_QUERY_TIMESTAMP:
-   case PIPE_QUERY_TIMESTAMP_DISJOINT:
-   case PIPE_QUERY_GPU_FINISHED:
-      space = 32;
-      break;
-   case NVC0_QUERY_TFB_BUFFER_OFFSET:
-      space = 16;
-      break;
-   default:
-#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
-      if (type >= NVC0_QUERY_DRV_STAT(0) && type <= NVC0_QUERY_DRV_STAT_LAST) {
-         space = 0;
-         q->is64bit = true;
-         q->index = type - NVC0_QUERY_DRV_STAT(0);
-         break;
-      } else
-#endif
-      if (nvc0->screen->base.device->drm_version >= 0x01000101) {
-         if (type >= NVE4_HW_SM_QUERY(0) && type <= NVE4_HW_SM_QUERY_LAST) {
-            /* for each MP:
-             * [00] = WS0.C0
-             * [04] = WS0.C1
-             * [08] = WS0.C2
-             * [0c] = WS0.C3
-             * [10] = WS1.C0
-             * [14] = WS1.C1
-             * [18] = WS1.C2
-             * [1c] = WS1.C3
-             * [20] = WS2.C0
-             * [24] = WS2.C1
-             * [28] = WS2.C2
-             * [2c] = WS2.C3
-             * [30] = WS3.C0
-             * [34] = WS3.C1
-             * [38] = WS3.C2
-             * [3c] = WS3.C3
-             * [40] = MP.C4
-             * [44] = MP.C5
-             * [48] = MP.C6
-             * [4c] = MP.C7
-             * [50] = WS0.sequence
-             * [54] = WS1.sequence
-             * [58] = WS2.sequence
-             * [5c] = WS3.sequence
-             */
-            space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
-            break;
-         } else
-         if (type >= NVC0_HW_SM_QUERY(0) && type <= NVC0_HW_SM_QUERY_LAST) {
-            /* for each MP:
-             * [00] = MP.C0
-             * [04] = MP.C1
-             * [08] = MP.C2
-             * [0c] = MP.C3
-             * [10] = MP.C4
-             * [14] = MP.C5
-             * [18] = MP.C6
-             * [1c] = MP.C7
-             * [20] = MP.sequence
-             */
-            space = (8 + 1) * nvc0->screen->mp_count * sizeof(uint32_t);
-            break;
-         }
-      }
-      debug_printf("invalid query type: %u\n", type);
-      FREE(q);
-      return NULL;
-   }
-   if (!nvc0_query_allocate(nvc0, q, space)) {
-      FREE(q);
-      return NULL;
-   }
-
-   q->type = type;
-
-   if (q->rotate) {
-      /* we advance before query_begin ! */
-      q->offset -= q->rotate;
-      q->data -= q->rotate / sizeof(*q->data);
-   } else
-   if (!q->is64bit)
-      q->data[0] = 0; /* initialize sequence */
+      q = nvc0_hw_create_query(nvc0, type, index);
 
    return (struct pipe_query *)q;
 }
 
 static void
-nvc0_query_get(struct nouveau_pushbuf *push, struct nvc0_query *q,
-               unsigned offset, uint32_t get)
+nvc0_destroy_query(struct pipe_context *pipe, struct pipe_query *pq)
 {
-   offset += q->offset;
-
-   PUSH_SPACE(push, 5);
-   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR);
-   BEGIN_NVC0(push, NVC0_3D(QUERY_ADDRESS_HIGH), 4);
-   PUSH_DATAh(push, q->bo->offset + offset);
-   PUSH_DATA (push, q->bo->offset + offset);
-   PUSH_DATA (push, q->sequence);
-   PUSH_DATA (push, get);
-}
-
-static void
-nvc0_query_rotate(struct nvc0_context *nvc0, struct nvc0_query *q)
-{
-   q->offset += q->rotate;
-   q->data += q->rotate / sizeof(*q->data);
-   if (q->offset - q->base == NVC0_QUERY_ALLOC_SPACE)
-      nvc0_query_allocate(nvc0, q, NVC0_QUERY_ALLOC_SPACE);
+   struct nvc0_query *q = nvc0_query(pq);
+   q->funcs->destroy_query(nvc0_context(pipe), q);
 }
 
 static boolean
-nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
+nvc0_begin_query(struct pipe_context *pipe, struct pipe_query *pq)
 {
-   struct nvc0_context *nvc0 = nvc0_context(pipe);
-   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nvc0_query *q = nvc0_query(pq);
-   bool ret = true;
-
-   /* For occlusion queries we have to change the storage, because a previous
-    * query might set the initial render conition to false even *after* we re-
-    * initialized it to true.
-    */
-   if (q->rotate) {
-      nvc0_query_rotate(nvc0, q);
-
-      /* XXX: can we do this with the GPU, and sync with respect to a previous
-       *  query ?
-       */
-      q->data[0] = q->sequence; /* initialize sequence */
-      q->data[1] = 1; /* initial render condition = true */
-      q->data[4] = q->sequence + 1; /* for comparison COND_MODE */
-      q->data[5] = 0;
-   }
-   q->sequence++;
-
-   switch (q->type) {
-   case PIPE_QUERY_OCCLUSION_COUNTER:
-   case PIPE_QUERY_OCCLUSION_PREDICATE:
-      q->nesting = nvc0->screen->num_occlusion_queries_active++;
-      if (q->nesting) {
-         nvc0_query_get(push, q, 0x10, 0x0100f002);
-      } else {
-         PUSH_SPACE(push, 3);
-         BEGIN_NVC0(push, NVC0_3D(COUNTER_RESET), 1);
-         PUSH_DATA (push, NVC0_3D_COUNTER_RESET_SAMPLECNT);
-         IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1);
-      }
-      break;
-   case PIPE_QUERY_PRIMITIVES_GENERATED:
-      nvc0_query_get(push, q, 0x10, 0x09005002 | (q->index << 5));
-      break;
-   case PIPE_QUERY_PRIMITIVES_EMITTED:
-      nvc0_query_get(push, q, 0x10, 0x05805002 | (q->index << 5));
-      break;
-   case PIPE_QUERY_SO_STATISTICS:
-      nvc0_query_get(push, q, 0x20, 0x05805002 | (q->index << 5));
-      nvc0_query_get(push, q, 0x30, 0x06805002 | (q->index << 5));
-      break;
-   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-      nvc0_query_get(push, q, 0x10, 0x03005002 | (q->index << 5));
-      break;
-   case PIPE_QUERY_TIME_ELAPSED:
-      nvc0_query_get(push, q, 0x10, 0x00005002);
-      break;
-   case PIPE_QUERY_PIPELINE_STATISTICS:
-      nvc0_query_get(push, q, 0xc0 + 0x00, 0x00801002); /* VFETCH, VERTICES */
-      nvc0_query_get(push, q, 0xc0 + 0x10, 0x01801002); /* VFETCH, PRIMS */
-      nvc0_query_get(push, q, 0xc0 + 0x20, 0x02802002); /* VP, LAUNCHES */
-      nvc0_query_get(push, q, 0xc0 + 0x30, 0x03806002); /* GP, LAUNCHES */
-      nvc0_query_get(push, q, 0xc0 + 0x40, 0x04806002); /* GP, PRIMS_OUT */
-      nvc0_query_get(push, q, 0xc0 + 0x50, 0x07804002); /* RAST, PRIMS_IN */
-      nvc0_query_get(push, q, 0xc0 + 0x60, 0x08804002); /* RAST, PRIMS_OUT */
-      nvc0_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */
-      nvc0_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */
-      nvc0_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */
-      break;
-   default:
-#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
-      if (q->type >= NVC0_QUERY_DRV_STAT(0) &&
-          q->type <= NVC0_QUERY_DRV_STAT_LAST) {
-         if (q->index >= 5)
-            q->u.value = nvc0->screen->base.stats.v[q->index];
-         else
-            q->u.value = 0;
-      } else
-#endif
-      if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) ||
-          (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) {
-         ret = nvc0_hw_sm_query_begin(nvc0, q);
-      }
-      break;
-   }
-   q->state = NVC0_QUERY_STATE_ACTIVE;
-   return ret;
+   return q->funcs->begin_query(nvc0_context(pipe), q);
 }
 
 static void
-nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq)
+nvc0_end_query(struct pipe_context *pipe, struct pipe_query *pq)
 {
-   struct nvc0_context *nvc0 = nvc0_context(pipe);
-   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nvc0_query *q = nvc0_query(pq);
-
-   if (q->state != NVC0_QUERY_STATE_ACTIVE) {
-      /* some queries don't require 'begin' to be called (e.g. GPU_FINISHED) */
-      if (q->rotate)
-         nvc0_query_rotate(nvc0, q);
-      q->sequence++;
-   }
-   q->state = NVC0_QUERY_STATE_ENDED;
-
-   switch (q->type) {
-   case PIPE_QUERY_OCCLUSION_COUNTER:
-   case PIPE_QUERY_OCCLUSION_PREDICATE:
-      nvc0_query_get(push, q, 0, 0x0100f002);
-      if (--nvc0->screen->num_occlusion_queries_active == 0) {
-         PUSH_SPACE(push, 1);
-         IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 0);
-      }
-      break;
-   case PIPE_QUERY_PRIMITIVES_GENERATED:
-      nvc0_query_get(push, q, 0, 0x09005002 | (q->index << 5));
-      break;
-   case PIPE_QUERY_PRIMITIVES_EMITTED:
-      nvc0_query_get(push, q, 0, 0x05805002 | (q->index << 5));
-      break;
-   case PIPE_QUERY_SO_STATISTICS:
-      nvc0_query_get(push, q, 0x00, 0x05805002 | (q->index << 5));
-      nvc0_query_get(push, q, 0x10, 0x06805002 | (q->index << 5));
-      break;
-   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-      /* TODO: How do we sum over all streams for render condition ? */
-      /* PRIMS_DROPPED doesn't write sequence, use a ZERO query to sync on */
-      nvc0_query_get(push, q, 0x00, 0x03005002 | (q->index << 5));
-      nvc0_query_get(push, q, 0x20, 0x00005002);
-      break;
-   case PIPE_QUERY_TIMESTAMP:
-   case PIPE_QUERY_TIME_ELAPSED:
-      nvc0_query_get(push, q, 0, 0x00005002);
-      break;
-   case PIPE_QUERY_GPU_FINISHED:
-      nvc0_query_get(push, q, 0, 0x1000f010);
-      break;
-   case PIPE_QUERY_PIPELINE_STATISTICS:
-      nvc0_query_get(push, q, 0x00, 0x00801002); /* VFETCH, VERTICES */
-      nvc0_query_get(push, q, 0x10, 0x01801002); /* VFETCH, PRIMS */
-      nvc0_query_get(push, q, 0x20, 0x02802002); /* VP, LAUNCHES */
-      nvc0_query_get(push, q, 0x30, 0x03806002); /* GP, LAUNCHES */
-      nvc0_query_get(push, q, 0x40, 0x04806002); /* GP, PRIMS_OUT */
-      nvc0_query_get(push, q, 0x50, 0x07804002); /* RAST, PRIMS_IN */
-      nvc0_query_get(push, q, 0x60, 0x08804002); /* RAST, PRIMS_OUT */
-      nvc0_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
-      nvc0_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */
-      nvc0_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */
-      break;
-   case NVC0_QUERY_TFB_BUFFER_OFFSET:
-      /* indexed by TFB buffer instead of by vertex stream */
-      nvc0_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5));
-      break;
-   case PIPE_QUERY_TIMESTAMP_DISJOINT:
-      /* This query is not issued on GPU because disjoint is forced to false */
-      q->state = NVC0_QUERY_STATE_READY;
-      break;
-   default:
-#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
-      if (q->type >= NVC0_QUERY_DRV_STAT(0) &&
-          q->type <= NVC0_QUERY_DRV_STAT_LAST) {
-         q->u.value = nvc0->screen->base.stats.v[q->index] - q->u.value;
-         return;
-      } else
-#endif
-      if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) ||
-          (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) {
-         nvc0_hw_sm_query_end(nvc0, q);
-      }
-      break;
-   }
-   if (q->is64bit)
-      nouveau_fence_ref(nvc0->screen->base.fence.current, &q->fence);
-}
-
-static inline void
-nvc0_query_update(struct nouveau_client *cli, struct nvc0_query *q)
-{
-   if (q->is64bit) {
-      if (nouveau_fence_signalled(q->fence))
-         q->state = NVC0_QUERY_STATE_READY;
-   } else {
-      if (q->data[0] == q->sequence)
-         q->state = NVC0_QUERY_STATE_READY;
-   }
+   q->funcs->end_query(nvc0_context(pipe), q);
 }
 
 static boolean
-nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
-                  boolean wait, union pipe_query_result *result)
+nvc0_get_query_result(struct pipe_context *pipe, struct pipe_query *pq,
+                      boolean wait, union pipe_query_result *result)
 {
-   struct nvc0_context *nvc0 = nvc0_context(pipe);
    struct nvc0_query *q = nvc0_query(pq);
-   uint64_t *res64 = (uint64_t*)result;
-   uint32_t *res32 = (uint32_t*)result;
-   uint8_t *res8 = (uint8_t*)result;
-   uint64_t *data64 = (uint64_t *)q->data;
-   unsigned i;
-
-#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
-   if (q->type >= NVC0_QUERY_DRV_STAT(0) &&
-       q->type <= NVC0_QUERY_DRV_STAT_LAST) {
-      res64[0] = q->u.value;
-      return true;
-   } else
-#endif
-   if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) ||
-       (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) {
-      return nvc0_hw_sm_query_result(nvc0, q, result, wait);
-   }
-
-   if (q->state != NVC0_QUERY_STATE_READY)
-      nvc0_query_update(nvc0->screen->base.client, q);
-
-   if (q->state != NVC0_QUERY_STATE_READY) {
-      if (!wait) {
-         if (q->state != NVC0_QUERY_STATE_FLUSHED) {
-            q->state = NVC0_QUERY_STATE_FLUSHED;
-            /* flush for silly apps that spin on GL_QUERY_RESULT_AVAILABLE */
-            PUSH_KICK(nvc0->base.pushbuf);
-         }
-         return false;
-      }
-      if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->screen->base.client))
-         return false;
-      NOUVEAU_DRV_STAT(&nvc0->screen->base, query_sync_count, 1);
-   }
-   q->state = NVC0_QUERY_STATE_READY;
-
-   switch (q->type) {
-   case PIPE_QUERY_GPU_FINISHED:
-      res8[0] = true;
-      break;
-   case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
-      res64[0] = q->data[1] - q->data[5];
-      break;
-   case PIPE_QUERY_OCCLUSION_PREDICATE:
-      res8[0] = q->data[1] != q->data[5];
-      break;
-   case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
-   case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */
-      res64[0] = data64[0] - data64[2];
-      break;
-   case PIPE_QUERY_SO_STATISTICS:
-      res64[0] = data64[0] - data64[4];
-      res64[1] = data64[2] - data64[6];
-      break;
-   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-      res8[0] = data64[0] != data64[2];
-      break;
-   case PIPE_QUERY_TIMESTAMP:
-      res64[0] = data64[1];
-      break;
-   case PIPE_QUERY_TIMESTAMP_DISJOINT:
-      res64[0] = 1000000000;
-      res8[8] = false;
-      break;
-   case PIPE_QUERY_TIME_ELAPSED:
-      res64[0] = data64[1] - data64[3];
-      break;
-   case PIPE_QUERY_PIPELINE_STATISTICS:
-      for (i = 0; i < 10; ++i)
-         res64[i] = data64[i * 2] - data64[24 + i * 2];
-      break;
-   case NVC0_QUERY_TFB_BUFFER_OFFSET:
-      res32[0] = q->data[1];
-      break;
-   default:
-      assert(0); /* can't happen, we don't create queries with invalid type */
-      return false;
-   }
-
-   return true;
-}
-
-void
-nvc0_query_fifo_wait(struct nouveau_pushbuf *push, struct pipe_query *pq)
-{
-   struct nvc0_query *q = nvc0_query(pq);
-   unsigned offset = q->offset;
-
-   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) offset += 0x20;
-
-   PUSH_SPACE(push, 5);
-   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-   BEGIN_NVC0(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4);
-   PUSH_DATAh(push, q->bo->offset + offset);
-   PUSH_DATA (push, q->bo->offset + offset);
-   PUSH_DATA (push, q->sequence);
-   PUSH_DATA (push, (1 << 12) |
-              NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
+   return q->funcs->get_query_result(nvc0_context(pipe), q, wait, result);
 }
 
 static void
@@ -547,7 +79,8 @@ nvc0_render_condition(struct pipe_context *pipe,
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   struct nvc0_query *q;
+   struct nvc0_query *q = nvc0_query(pq);
+   struct nvc0_hw_query *hq = nvc0_hw_query(q);
    uint32_t cond;
    bool wait =
       mode != PIPE_RENDER_COND_NO_WAIT &&
@@ -557,7 +90,6 @@ nvc0_render_condition(struct pipe_context *pipe,
       cond = NVC0_3D_COND_MODE_ALWAYS;
    }
    else {
-      q = nvc0_query(pq);
       /* NOTE: comparison of 2 queries only works if both have completed */
       switch (q->type) {
       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
@@ -568,7 +100,7 @@ nvc0_render_condition(struct pipe_context *pipe,
       case PIPE_QUERY_OCCLUSION_COUNTER:
       case PIPE_QUERY_OCCLUSION_PREDICATE:
          if (likely(!condition)) {
-            if (unlikely(q->nesting))
+            if (unlikely(hq->nesting))
                cond = wait ? NVC0_3D_COND_MODE_NOT_EQUAL :
                              NVC0_3D_COND_MODE_ALWAYS;
             else
@@ -596,805 +128,17 @@ nvc0_render_condition(struct pipe_context *pipe,
    }
 
    if (wait)
-      nvc0_query_fifo_wait(push, pq);
+      nvc0_hw_query_fifo_wait(push, q);
 
    PUSH_SPACE(push, 7);
-   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+   PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
    BEGIN_NVC0(push, NVC0_3D(COND_ADDRESS_HIGH), 3);
-   PUSH_DATAh(push, q->bo->offset + q->offset);
-   PUSH_DATA (push, q->bo->offset + q->offset);
+   PUSH_DATAh(push, hq->bo->offset + hq->offset);
+   PUSH_DATA (push, hq->bo->offset + hq->offset);
    PUSH_DATA (push, cond);
    BEGIN_NVC0(push, NVC0_2D(COND_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, q->bo->offset + q->offset);
-   PUSH_DATA (push, q->bo->offset + q->offset);
-}
-
-void
-nvc0_query_pushbuf_submit(struct nouveau_pushbuf *push,
-                          struct pipe_query *pq, unsigned result_offset)
-{
-   struct nvc0_query *q = nvc0_query(pq);
-
-#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
-
-   PUSH_REFN(push, q->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART);
-   nouveau_pushbuf_space(push, 0, 0, 1);
-   nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 |
-                        NVC0_IB_ENTRY_1_NO_PREFETCH);
-}
-
-void
-nvc0_so_target_save_offset(struct pipe_context *pipe,
-                           struct pipe_stream_output_target *ptarg,
-                           unsigned index, bool *serialize)
-{
-   struct nvc0_so_target *targ = nvc0_so_target(ptarg);
-
-   if (*serialize) {
-      *serialize = false;
-      PUSH_SPACE(nvc0_context(pipe)->base.pushbuf, 1);
-      IMMED_NVC0(nvc0_context(pipe)->base.pushbuf, NVC0_3D(SERIALIZE), 0);
-
-      NOUVEAU_DRV_STAT(nouveau_screen(pipe->screen), gpu_serialize_count, 1);
-   }
-
-   nvc0_query(targ->pq)->index = index;
-
-   nvc0_query_end(pipe, targ->pq);
-}
-
-
-/* === DRIVER STATISTICS === */
-
-#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
-
-static const char *nvc0_drv_stat_names[] =
-{
-   "drv-tex_obj_current_count",
-   "drv-tex_obj_current_bytes",
-   "drv-buf_obj_current_count",
-   "drv-buf_obj_current_bytes_vid",
-   "drv-buf_obj_current_bytes_sys",
-   "drv-tex_transfers_rd",
-   "drv-tex_transfers_wr",
-   "drv-tex_copy_count",
-   "drv-tex_blit_count",
-   "drv-tex_cache_flush_count",
-   "drv-buf_transfers_rd",
-   "drv-buf_transfers_wr",
-   "drv-buf_read_bytes_staging_vid",
-   "drv-buf_write_bytes_direct",
-   "drv-buf_write_bytes_staging_vid",
-   "drv-buf_write_bytes_staging_sys",
-   "drv-buf_copy_bytes",
-   "drv-buf_non_kernel_fence_sync_count",
-   "drv-any_non_kernel_fence_sync_count",
-   "drv-query_sync_count",
-   "drv-gpu_serialize_count",
-   "drv-draw_calls_array",
-   "drv-draw_calls_indexed",
-   "drv-draw_calls_fallback_count",
-   "drv-user_buffer_upload_bytes",
-   "drv-constbuf_upload_count",
-   "drv-constbuf_upload_bytes",
-   "drv-pushbuf_count",
-   "drv-resource_validate_count"
-};
-
-#endif /* NOUVEAU_ENABLE_DRIVER_STATISTICS */
-
-
-/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
-
-/* Code to read out MP counters: They are accessible via mmio, too, but let's
- * just avoid mapping registers in userspace. We'd have to know which MPs are
- * enabled/present, too, and that information is not presently exposed.
- * We could add a kernel interface for it, but reading the counters like this
- * has the advantage of being async (if get_result isn't called immediately).
- */
-static const uint64_t nve4_read_hw_sm_counters_code[] =
-{
-   /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
-    * mov b32 $r8 $tidx
-    * mov b32 $r12 $physid
-    * mov b32 $r0 $pm0
-    * mov b32 $r1 $pm1
-    * mov b32 $r2 $pm2
-    * mov b32 $r3 $pm3
-    * mov b32 $r4 $pm4
-    * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
-    * mov b32 $r5 $pm5
-    * mov b32 $r6 $pm6
-    * mov b32 $r7 $pm7
-    * set $p0 0x1 eq u32 $r8 0x0
-    * mov b32 $r10 c0[0x0]
-    * ext u32 $r8 $r12 0x414
-    * mov b32 $r11 c0[0x4]
-    * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
-    * ext u32 $r9 $r12 0x208
-    * (not $p0) exit
-    * set $p1 0x1 eq u32 $r9 0x0
-    * mul $r8 u32 $r8 u32 96
-    * mul $r12 u32 $r9 u32 16
-    * mul $r13 u32 $r9 u32 4
-    * add b32 $r9 $r8 $r13
-    * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
-    * add b32 $r8 $r8 $r12
-    * mov b32 $r12 $r10
-    * add b32 $r10 $c $r10 $r8
-    * mov b32 $r13 $r11
-    * add b32 $r11 $r11 0x0 $c
-    * add b32 $r12 $c $r12 $r9
-    * st b128 wt g[$r10d] $r0q
-    * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
-    * mov b32 $r0 c0[0x8]
-    * add b32 $r13 $r13 0x0 $c
-    * $p1 st b128 wt g[$r12d+0x40] $r4q
-    * st b32 wt g[$r12d+0x50] $r0
-    * exit */
-   0x2202020202020207ULL,
-   0x2c00000084021c04ULL,
-   0x2c0000000c031c04ULL,
-   0x2c00000010001c04ULL,
-   0x2c00000014005c04ULL,
-   0x2c00000018009c04ULL,
-   0x2c0000001c00dc04ULL,
-   0x2c00000020011c04ULL,
-   0x22b0420042320207ULL,
-   0x2c00000024015c04ULL,
-   0x2c00000028019c04ULL,
-   0x2c0000002c01dc04ULL,
-   0x190e0000fc81dc03ULL,
-   0x2800400000029de4ULL,
-   0x7000c01050c21c03ULL,
-   0x280040001002dde4ULL,
-   0x204282020042e047ULL,
-   0x7000c00820c25c03ULL,
-   0x80000000000021e7ULL,
-   0x190e0000fc93dc03ULL,
-   0x1000000180821c02ULL,
-   0x1000000040931c02ULL,
-   0x1000000010935c02ULL,
-   0x4800000034825c03ULL,
-   0x22c042c042c04287ULL,
-   0x4800000030821c03ULL,
-   0x2800000028031de4ULL,
-   0x4801000020a29c03ULL,
-   0x280000002c035de4ULL,
-   0x0800000000b2dc42ULL,
-   0x4801000024c31c03ULL,
-   0x9400000000a01fc5ULL,
-   0x200002e04202c047ULL,
-   0x2800400020001de4ULL,
-   0x0800000000d35c42ULL,
-   0x9400000100c107c5ULL,
-   0x9400000140c01f85ULL,
-   0x8000000000001de7ULL
-};
-
-/* NOTE: intentionally using the same names as NV */
-static const char *nve4_pm_query_names[] =
-{
-   /* MP counters */
-   "active_cycles",
-   "active_warps",
-   "atom_count",
-   "branch",
-   "divergent_branch",
-   "gld_request",
-   "global_ld_mem_divergence_replays",
-   "global_store_transaction",
-   "global_st_mem_divergence_replays",
-   "gred_count",
-   "gst_request",
-   "inst_executed",
-   "inst_issued",
-   "inst_issued1",
-   "inst_issued2",
-   "l1_global_load_hit",
-   "l1_global_load_miss",
-   "l1_local_load_hit",
-   "l1_local_load_miss",
-   "l1_local_store_hit",
-   "l1_local_store_miss",
-   "l1_shared_load_transactions",
-   "l1_shared_store_transactions",
-   "local_load",
-   "local_load_transactions",
-   "local_store",
-   "local_store_transactions",
-   "prof_trigger_00",
-   "prof_trigger_01",
-   "prof_trigger_02",
-   "prof_trigger_03",
-   "prof_trigger_04",
-   "prof_trigger_05",
-   "prof_trigger_06",
-   "prof_trigger_07",
-   "shared_load",
-   "shared_load_replay",
-   "shared_store",
-   "shared_store_replay",
-   "sm_cta_launched",
-   "threads_launched",
-   "uncached_global_load_transaction",
-   "warps_launched",
-   /* metrics, i.e. functions of the MP counters */
-   "metric-ipc",                   /* inst_executed, clock */
-   "metric-ipac",                  /* inst_executed, active_cycles */
-   "metric-ipec",                  /* inst_executed, (bool)inst_executed */
-   "metric-achieved_occupancy",    /* active_warps, active_cycles */
-   "metric-sm_efficiency",         /* active_cycles, clock */
-   "metric-inst_replay_overhead"   /* inst_issued, inst_executed */
-};
-
-/* For simplicity, we will allocate as many group slots as we allocate counter
- * slots. This means that a single counter which wants to source from 2 groups
- * will have to be declared as using 2 counter slots. This shouldn't really be
- * a problem because such queries don't make much sense ... (unless someone is
- * really creative).
- */
-struct nvc0_mp_counter_cfg
-{
-   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
-   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
-   uint32_t num_src : 3;  /* number of sources (1 - 6, only for NVC0:NVE4) */
-   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
-   uint32_t sig_sel : 8;  /* signal group */
-   uint64_t src_sel;      /* signal selection for up to 6 sources (48 bit) */
-};
-
-#define NVC0_COUNTER_OPn_SUM            0
-#define NVC0_COUNTER_OPn_OR             1
-#define NVC0_COUNTER_OPn_AND            2
-#define NVC0_COUNTER_OP2_REL_SUM_MM     3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
-#define NVC0_COUNTER_OP2_DIV_SUM_M0     4 /* sum(ctr0) / ctr1 of MP[0]) */
-#define NVC0_COUNTER_OP2_AVG_DIV_MM     5 /* avg(ctr0 / ctr1) */
-#define NVC0_COUNTER_OP2_AVG_DIV_M0     6 /* avg(ctr0) / ctr1 of MP[0]) */
-
-struct nvc0_hw_sm_query_cfg
-{
-   struct nvc0_mp_counter_cfg ctr[4];
-   uint8_t num_counters;
-   uint8_t op;
-   uint8_t norm[2]; /* normalization num,denom */
-};
-
-#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
-#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
-#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
-   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
-   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, s1 }, \
-   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
-#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
-   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, s0 }, \
-   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
-   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
-#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
-   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
-   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
-   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
-
-/* NOTES:
- * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
- * inst_executed etc.: we only count a single warp scheduler
- * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
- *  this is inaccurate !
- */
-static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] =
-{
-   _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
-   _Q1B(ACTIVE_WARPS,  0x003f, B6, WARP, 0x31483104, 2, 1),
-   _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1),
-   _Q1A(BRANCH,           0x0001, B6, BRANCH, 0x0000000c, 1, 1),
-   _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
-   _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
-   _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1),
-   _Q1B(GST_TRANSACTIONS,          0x0001, B6, MEM, 0x00000004, 1, 1),
-   _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1),
-   _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1),
-   _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
-   _Q1A(INST_EXECUTED, 0x0003, B6, EXEC,  0x00000398, 1, 1),
-   _Q1A(INST_ISSUED,   0x0003, B6, ISSUE, 0x00000104, 1, 1),
-   _Q1A(INST_ISSUED1,  0x0001, B6, ISSUE, 0x00000004, 1, 1),
-   _Q1A(INST_ISSUED2,  0x0001, B6, ISSUE, 0x00000008, 1, 1),
-   _Q1B(L1_GLD_HIT,  0x0001, B6, L1, 0x00000010, 1, 1),
-   _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
-   _Q1B(L1_LOCAL_LD_HIT,   0x0001, B6, L1, 0x00000000, 1, 1),
-   _Q1B(L1_LOCAL_LD_MISS,  0x0001, B6, L1, 0x00000004, 1, 1),
-   _Q1B(L1_LOCAL_ST_HIT,  0x0001, B6, L1, 0x00000008, 1, 1),
-   _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
-   _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
-   _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
-   _Q1A(LOCAL_LD,    0x0001, B6, LDST, 0x00000008, 1, 1),
-   _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
-   _Q1A(LOCAL_ST,    0x0001, B6, LDST, 0x0000000c, 1, 1),
-   _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
-   _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1),
-   _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1),
-   _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1),
-   _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1),
-   _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1),
-   _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1),
-   _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1),
-   _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1),
-   _Q1A(SHARED_LD,   0x0001, B6, LDST, 0x00000000, 1, 1),
-   _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1),
-   _Q1A(SHARED_ST,   0x0001, B6, LDST, 0x00000004, 1, 1),
-   _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1),
-   _Q1B(SM_CTA_LAUNCHED,      0x0001, B6, WARP, 0x0000001c, 1, 1),
-   _Q1A(THREADS_LAUNCHED,  0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
-   _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1),
-   _Q1A(WARPS_LAUNCHED,    0x0001, B6, LAUNCH, 0x00000004, 1, 1),
-   _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1),
-   _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1),
-   _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1),
-   _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1),
-   _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64),
-   _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1),
-};
-
-#undef _Q1A
-#undef _Q1B
-#undef _M2A
-#undef _M2B
-
-/* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
-static const uint64_t nvc0_read_hw_sm_counters_code[] =
-{
-   /* mov b32 $r8 $tidx
-    * mov b32 $r9 $physid
-    * mov b32 $r0 $pm0
-    * mov b32 $r1 $pm1
-    * mov b32 $r2 $pm2
-    * mov b32 $r3 $pm3
-    * mov b32 $r4 $pm4
-    * mov b32 $r5 $pm5
-    * mov b32 $r6 $pm6
-    * mov b32 $r7 $pm7
-    * set $p0 0x1 eq u32 $r8 0x0
-    * mov b32 $r10 c0[0x0]
-    * mov b32 $r11 c0[0x4]
-    * ext u32 $r8 $r9 0x414
-    * (not $p0) exit
-    * mul $r8 u32 $r8 u32 36
-    * add b32 $r10 $c $r10 $r8
-    * add b32 $r11 $r11 0x0 $c
-    * mov b32 $r8 c0[0x8]
-    * st b128 wt g[$r10d+0x00] $r0q
-    * st b128 wt g[$r10d+0x10] $r4q
-    * st b32 wt g[$r10d+0x20] $r8
-    * exit */
-   0x2c00000084021c04ULL,
-   0x2c0000000c025c04ULL,
-   0x2c00000010001c04ULL,
-   0x2c00000014005c04ULL,
-   0x2c00000018009c04ULL,
-   0x2c0000001c00dc04ULL,
-   0x2c00000020011c04ULL,
-   0x2c00000024015c04ULL,
-   0x2c00000028019c04ULL,
-   0x2c0000002c01dc04ULL,
-   0x190e0000fc81dc03ULL,
-   0x2800400000029de4ULL,
-   0x280040001002dde4ULL,
-   0x7000c01050921c03ULL,
-   0x80000000000021e7ULL,
-   0x1000000090821c02ULL,
-   0x4801000020a29c03ULL,
-   0x0800000000b2dc42ULL,
-   0x2800400020021de4ULL,
-   0x9400000000a01fc5ULL,
-   0x9400000040a11fc5ULL,
-   0x9400000080a21f85ULL,
-   0x8000000000001de7ULL
-};
-
-static const char *nvc0_pm_query_names[] =
-{
-   /* MP counters */
-   "active_cycles",
-   "active_warps",
-   "atom_count",
-   "branch",
-   "divergent_branch",
-   "gld_request",
-   "gred_count",
-   "gst_request",
-   "inst_executed",
-   "inst_issued1_0",
-   "inst_issued1_1",
-   "inst_issued2_0",
-   "inst_issued2_1",
-   "local_load",
-   "local_store",
-   "prof_trigger_00",
-   "prof_trigger_01",
-   "prof_trigger_02",
-   "prof_trigger_03",
-   "prof_trigger_04",
-   "prof_trigger_05",
-   "prof_trigger_06",
-   "prof_trigger_07",
-   "shared_load",
-   "shared_store",
-   "threads_launched",
-   "thread_inst_executed_0",
-   "thread_inst_executed_1",
-   "thread_inst_executed_2",
-   "thread_inst_executed_3",
-   "warps_launched",
-};
-
-#define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) [NVC0_HW_SM_QUERY_##n] = { { { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 0, g, s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL << 32)|(s5##ULL << 40) }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { 1, 1 } }
-
-static const struct nvc0_hw_sm_query_cfg nvc0_hw_sm_queries[] =
-{
-   _Q(ACTIVE_CYCLES,       0xaaaa, LOGOP, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(ACTIVE_WARPS,        0xaaaa, LOGOP, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
-   _Q(ATOM_COUNT,          0xaaaa, LOGOP, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(BRANCH,              0xaaaa, LOGOP, 0x1a, 2, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00),
-   _Q(DIVERGENT_BRANCH,    0xaaaa, LOGOP, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00),
-   _Q(GLD_REQUEST,         0xaaaa, LOGOP, 0x64, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(GRED_COUNT,          0xaaaa, LOGOP, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(GST_REQUEST,         0xaaaa, LOGOP, 0x64, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(INST_EXECUTED,       0xaaaa, LOGOP, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00),
-   _Q(INST_ISSUED1_0,      0xaaaa, LOGOP, 0x7e, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(INST_ISSUED1_1,      0xaaaa, LOGOP, 0x7e, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(INST_ISSUED2_0,      0xaaaa, LOGOP, 0x7e, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(INST_ISSUED2_1,      0xaaaa, LOGOP, 0x7e, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(LOCAL_LD,            0xaaaa, LOGOP, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(LOCAL_ST,            0xaaaa, LOGOP, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(PROF_TRIGGER_0,      0xaaaa, LOGOP, 0x01, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(PROF_TRIGGER_1,      0xaaaa, LOGOP, 0x01, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(PROF_TRIGGER_2,      0xaaaa, LOGOP, 0x01, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(PROF_TRIGGER_3,      0xaaaa, LOGOP, 0x01, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(PROF_TRIGGER_4,      0xaaaa, LOGOP, 0x01, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(PROF_TRIGGER_5,      0xaaaa, LOGOP, 0x01, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(PROF_TRIGGER_6,      0xaaaa, LOGOP, 0x01, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(PROF_TRIGGER_7,      0xaaaa, LOGOP, 0x01, 1, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(SHARED_LD,           0xaaaa, LOGOP, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(SHARED_ST,           0xaaaa, LOGOP, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(THREADS_LAUNCHED,    0xaaaa, LOGOP, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
-   _Q(TH_INST_EXECUTED_0,  0xaaaa, LOGOP, 0xa3, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
-   _Q(TH_INST_EXECUTED_1,  0xaaaa, LOGOP, 0xa5, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
-   _Q(TH_INST_EXECUTED_2,  0xaaaa, LOGOP, 0xa4, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
-   _Q(TH_INST_EXECUTED_3,  0xaaaa, LOGOP, 0xa6, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
-   _Q(WARPS_LAUNCHED,      0xaaaa, LOGOP, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
-};
-
-#undef _Q
-
-static const struct nvc0_hw_sm_query_cfg *
-nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_query *q)
-{
-   struct nvc0_screen *screen = nvc0->screen;
-
-   if (screen->base.class_3d >= NVE4_3D_CLASS)
-      return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
-   return &nvc0_hw_sm_queries[q->type - NVC0_HW_SM_QUERY(0)];
-}
-
-boolean
-nvc0_hw_sm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q)
-{
-   struct nvc0_screen *screen = nvc0->screen;
-   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
-   const struct nvc0_hw_sm_query_cfg *cfg;
-   unsigned i, c;
-   unsigned num_ab[2] = { 0, 0 };
-
-   cfg = nvc0_hw_sm_query_get_cfg(nvc0, q);
-
-   /* check if we have enough free counter slots */
-   for (i = 0; i < cfg->num_counters; ++i)
-      num_ab[cfg->ctr[i].sig_dom]++;
-
-   if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
-       screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
-      NOUVEAU_ERR("Not enough free MP counter slots !\n");
-      return false;
-   }
-
-   assert(cfg->num_counters <= 4);
-   PUSH_SPACE(push, 4 * 8 * (is_nve4 ? 1 : 6) + 6);
-
-   if (!screen->pm.mp_counters_enabled) {
-      screen->pm.mp_counters_enabled = true;
-      BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
-      PUSH_DATA (push, 0x1fcb);
-   }
-
-   /* set sequence field to 0 (used to check if result is available) */
-   for (i = 0; i < screen->mp_count; ++i)
-      q->data[i * 10 + 10] = 0;
-
-   for (i = 0; i < cfg->num_counters; ++i) {
-      const unsigned d = cfg->ctr[i].sig_dom;
-
-      if (!screen->pm.num_hw_sm_active[d]) {
-         uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
-         if (screen->pm.num_hw_sm_active[!d])
-            m |= 1 << (7 + (8 * d));
-         BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
-         PUSH_DATA (push, m);
-      }
-      screen->pm.num_hw_sm_active[d]++;
-
-      for (c = d * 4; c < (d * 4 + 4); ++c) {
-         if (!screen->pm.mp_counter[c]) {
-            q->ctr[i] = c;
-            screen->pm.mp_counter[c] = (struct pipe_query *)q;
-            break;
-         }
-      }
-      assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
-
-      /* configure and reset the counter(s) */
-      if (is_nve4) {
-         if (d == 0)
-            BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
-         else
-            BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
-         PUSH_DATA (push, cfg->ctr[i].sig_sel);
-         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
-         PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
-         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
-         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
-         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
-         PUSH_DATA (push, 0);
-      } else {
-         unsigned s;
-
-         for (s = 0; s < cfg->ctr[i].num_src; s++) {
-            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(s)), 1);
-            PUSH_DATA (push, cfg->ctr[i].sig_sel);
-            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(s)), 1);
-            PUSH_DATA (push, (cfg->ctr[i].src_sel >> (s * 8)) & 0xff);
-            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(s)), 1);
-            PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
-            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(s)), 1);
-            PUSH_DATA (push, 0);
-         }
-      }
-   }
-   return true;
-}
-
-static void
-nvc0_hw_sm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
-{
-   struct nvc0_screen *screen = nvc0->screen;
-   struct pipe_context *pipe = &nvc0->base.pipe;
-   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
-   uint32_t mask;
-   uint32_t input[3];
-   const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
-   const uint grid[3] = { screen->mp_count, 1, 1 };
-   unsigned c;
-   const struct nvc0_hw_sm_query_cfg *cfg;
-
-   cfg = nvc0_hw_sm_query_get_cfg(nvc0, q);
-
-   if (unlikely(!screen->pm.prog)) {
-      struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
-      prog->type = PIPE_SHADER_COMPUTE;
-      prog->translated = true;
-      prog->num_gprs = 14;
-      prog->parm_size = 12;
-      if (is_nve4) {
-         prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
-         prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
-      } else {
-         prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
-         prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
-      }
-      screen->pm.prog = prog;
-   }
-
-   /* disable all counting */
-   PUSH_SPACE(push, 8);
-   for (c = 0; c < 8; ++c)
-      if (screen->pm.mp_counter[c]) {
-         if (is_nve4) {
-            IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0);
-         } else {
-            IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0);
-         }
-      }
-   /* release counters for this query */
-   for (c = 0; c < 8; ++c) {
-      if (nvc0_query(screen->pm.mp_counter[c]) == q) {
-         screen->pm.num_hw_sm_active[c / 4]--;
-         screen->pm.mp_counter[c] = NULL;
-      }
-   }
-
-   BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
-                q->bo);
-
-   PUSH_SPACE(push, 1);
-   IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);
-
-   pipe->bind_compute_state(pipe, screen->pm.prog);
-   input[0] = (q->bo->offset + q->base);
-   input[1] = (q->bo->offset + q->base) >> 32;
-   input[2] = q->sequence;
-   pipe->launch_grid(pipe, block, grid, 0, input);
-
-   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);
-
-   /* re-activate other counters */
-   PUSH_SPACE(push, 16);
-   mask = 0;
-   for (c = 0; c < 8; ++c) {
-      unsigned i;
-      q = nvc0_query(screen->pm.mp_counter[c]);
-      if (!q)
-         continue;
-      cfg = nvc0_hw_sm_query_get_cfg(nvc0, q);
-      for (i = 0; i < cfg->num_counters; ++i) {
-         if (mask & (1 << q->ctr[i]))
-            break;
-         mask |= 1 << q->ctr[i];
-         if (is_nve4) {
-            BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(q->ctr[i])), 1);
-         } else {
-            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(q->ctr[i])), 1);
-         }
-         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
-      }
-   }
-}
-
-static inline bool
-nvc0_hw_sm_query_read_data(uint32_t count[32][4],
-                           struct nvc0_context *nvc0, bool wait,
-                           struct nvc0_query *q,
-                           const struct nvc0_hw_sm_query_cfg *cfg,
-                           unsigned mp_count)
-{
-   unsigned p, c;
-
-   for (p = 0; p < mp_count; ++p) {
-      const unsigned b = (0x24 / 4) * p;
-
-      for (c = 0; c < cfg->num_counters; ++c) {
-         if (q->data[b + 8] != q->sequence) {
-            if (!wait)
-               return false;
-            if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client))
-               return false;
-         }
-         count[p][c] = q->data[b + q->ctr[c]];
-      }
-   }
-   return true;
-}
-
-static inline bool
-nve4_hw_sm_query_read_data(uint32_t count[32][4],
-                           struct nvc0_context *nvc0, bool wait,
-                           struct nvc0_query *q,
-                           const struct nvc0_hw_sm_query_cfg *cfg,
-                           unsigned mp_count)
-{
-   unsigned p, c, d;
-
-   for (p = 0; p < mp_count; ++p) {
-      const unsigned b = (0x60 / 4) * p;
-
-      for (c = 0; c < cfg->num_counters; ++c) {
-         count[p][c] = 0;
-         for (d = 0; d < ((q->ctr[c] & ~3) ? 1 : 4); ++d) {
-            if (q->data[b + 20 + d] != q->sequence) {
-               if (!wait)
-                  return false;
-               if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client))
-                  return false;
-            }
-            if (q->ctr[c] & ~0x3)
-               count[p][c] = q->data[b + 16 + (q->ctr[c] & 3)];
-            else
-               count[p][c] += q->data[b + d * 4 + q->ctr[c]];
-         }
-      }
-   }
-   return true;
-}
-
-/* Metric calculations:
- * sum(x) ... sum of x over all MPs
- * avg(x) ... average of x over all MPs
- *
- * IPC              : sum(inst_executed) / clock
- * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
- * MP_OCCUPANCY     : avg((active_warps / 64) / active_cycles)
- * MP_EFFICIENCY    : avg(active_cycles / clock)
- *
- * NOTE: Interpretation of IPC requires knowledge of MP count.
- */
-static boolean
-nvc0_hw_sm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
-                        void *result, boolean wait)
-{
-   uint32_t count[32][4];
-   uint64_t value = 0;
-   unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
-   unsigned p, c;
-   const struct nvc0_hw_sm_query_cfg *cfg;
-   bool ret;
-
-   cfg = nvc0_hw_sm_query_get_cfg(nvc0, q);
-
-   if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
-      ret = nve4_hw_sm_query_read_data(count, nvc0, wait, q, cfg, mp_count);
-   else
-      ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, q, cfg, mp_count);
-   if (!ret)
-      return false;
-
-   if (cfg->op == NVC0_COUNTER_OPn_SUM) {
-      for (c = 0; c < cfg->num_counters; ++c)
-         for (p = 0; p < mp_count; ++p)
-            value += count[p][c];
-      value = (value * cfg->norm[0]) / cfg->norm[1];
-   } else
-   if (cfg->op == NVC0_COUNTER_OPn_OR) {
-      uint32_t v = 0;
-      for (c = 0; c < cfg->num_counters; ++c)
-         for (p = 0; p < mp_count; ++p)
-            v |= count[p][c];
-      value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
-   } else
-   if (cfg->op == NVC0_COUNTER_OPn_AND) {
-      uint32_t v = ~0;
-      for (c = 0; c < cfg->num_counters; ++c)
-         for (p = 0; p < mp_count; ++p)
-            v &= count[p][c];
-      value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
-   } else
-   if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) {
-      uint64_t v[2] = { 0, 0 };
-      for (p = 0; p < mp_count; ++p) {
-         v[0] += count[p][0];
-         v[1] += count[p][1];
-      }
-      if (v[0])
-         value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]);
-   } else
-   if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) {
-      for (p = 0; p < mp_count; ++p)
-         value += count[p][0];
-      if (count[0][1])
-         value = (value * cfg->norm[0]) / (count[0][1] * cfg->norm[1]);
-      else
-         value = 0;
-   } else
-   if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) {
-      unsigned mp_used = 0;
-      for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
-         if (count[p][1])
-            value += (count[p][0] * cfg->norm[0]) / count[p][1];
-      if (mp_used)
-         value /= (uint64_t)mp_used * cfg->norm[1];
-   } else
-   if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) {
-      unsigned mp_used = 0;
-      for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
-         value += count[p][0];
-      if (count[0][1] && mp_used) {
-         value *= cfg->norm[0];
-         value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1];
-      } else {
-         value = 0;
-      }
-   }
-
-   *(uint64_t *)result = value;
-   return true;
+   PUSH_DATAh(push, hq->bo->offset + hq->offset);
+   PUSH_DATA (push, hq->bo->offset + hq->offset);
 }
 
 int
@@ -1403,24 +147,13 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen,
                                   struct pipe_driver_query_info *info)
 {
    struct nvc0_screen *screen = nvc0_screen(pscreen);
-   int count = 0;
+   int num_sw_queries = 0, num_hw_queries = 0;
 
-   count += NVC0_QUERY_DRV_STAT_COUNT;
-
-   if (screen->base.device->drm_version >= 0x01000101) {
-      if (screen->compute) {
-         if (screen->base.class_3d == NVE4_3D_CLASS) {
-            count += NVE4_HW_SM_QUERY_COUNT;
-         } else
-         if (screen->base.class_3d < NVE4_3D_CLASS) {
-            /* NVC0_COMPUTE is not always enabled */
-            count += NVC0_HW_SM_QUERY_COUNT;
-         }
-      }
-   }
+   num_sw_queries = nvc0_sw_get_driver_query_info(screen, 0, NULL);
+   num_hw_queries = nvc0_hw_get_driver_query_info(screen, 0, NULL);
 
    if (!info)
-      return count;
+      return num_sw_queries + num_hw_queries;
 
    /* Init default values. */
    info->name = "this_is_not_the_query_you_are_looking_for";
@@ -1430,36 +163,11 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen,
    info->group_id = -1;
 
 #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
-   if (id < NVC0_QUERY_DRV_STAT_COUNT) {
-      info->name = nvc0_drv_stat_names[id];
-      info->query_type = NVC0_QUERY_DRV_STAT(id);
-      info->max_value.u64 = 0;
-      if (strstr(info->name, "bytes"))
-         info->type = PIPE_DRIVER_QUERY_TYPE_BYTES;
-      info->group_id = NVC0_QUERY_DRV_STAT_GROUP;
-      return 1;
-   } else
+   if (id < num_sw_queries)
+      return nvc0_sw_get_driver_query_info(screen, id, info);
 #endif
-   if (id < count) {
-      if (screen->compute) {
-         if (screen->base.class_3d == NVE4_3D_CLASS) {
-            info->name = nve4_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT];
-            info->query_type = NVE4_HW_SM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
-            info->max_value.u64 =
-               (id < NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY) ? 0 : 100;
-            info->group_id = NVC0_QUERY_MP_COUNTER_GROUP;
-            return 1;
-         } else
-         if (screen->base.class_3d < NVE4_3D_CLASS) {
-            info->name = nvc0_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT];
-            info->query_type = NVC0_HW_SM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
-            info->group_id = NVC0_QUERY_MP_COUNTER_GROUP;
-            return 1;
-         }
-      }
-   }
-   /* user asked for info about non-existing query */
-   return 0;
+
+   return nvc0_hw_get_driver_query_info(screen, id - num_sw_queries, info);
 }
 
 int
@@ -1480,7 +188,7 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
             count++;
          } else
          if (screen->base.class_3d < NVE4_3D_CLASS) {
-            count++; /* NVC0_COMPUTE is not always enabled */
+            count++;
          }
       }
    }
@@ -1488,37 +196,35 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
    if (!info)
       return count;
 
-   if (id == NVC0_QUERY_MP_COUNTER_GROUP) {
+   if (id == NVC0_HW_SM_QUERY_GROUP) {
       if (screen->compute) {
          info->name = "MP counters";
          info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU;
 
+         /* Because we can't expose the number of hardware counters needed for
+          * each different query, we don't want to allow more than one active
+          * query simultaneously to avoid failure when the maximum number of
+          * counters is reached. Note that these groups of GPU counters are
+          * currently only used by AMD_performance_monitor.
+          */
+         info->max_active_queries = 1;
+
          if (screen->base.class_3d == NVE4_3D_CLASS) {
             info->num_queries = NVE4_HW_SM_QUERY_COUNT;
-
-             /* On NVE4+, each multiprocessor have 8 hardware counters separated
-              * in two distinct domains, but we allow only one active query
-              * simultaneously because some of them use more than one hardware
-              * counter and this will result in an undefined behaviour. */
-             info->max_active_queries = 1; /* TODO: handle multiple hw counters */
-             return 1;
+            return 1;
          } else
          if (screen->base.class_3d < NVE4_3D_CLASS) {
             info->num_queries = NVC0_HW_SM_QUERY_COUNT;
-
-            /* On NVC0:NVE4, each multiprocessor have 8 hardware counters
-             * in a single domain. */
-            info->max_active_queries = 8;
             return 1;
          }
       }
    }
 #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
-   else if (id == NVC0_QUERY_DRV_STAT_GROUP) {
+   else if (id == NVC0_SW_QUERY_DRV_STAT_GROUP) {
       info->name = "Driver statistics";
       info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_CPU;
-      info->max_active_queries = NVC0_QUERY_DRV_STAT_COUNT;
-      info->num_queries = NVC0_QUERY_DRV_STAT_COUNT;
+      info->max_active_queries = NVC0_SW_QUERY_DRV_STAT_COUNT;
+      info->num_queries = NVC0_SW_QUERY_DRV_STAT_COUNT;
       return 1;
    }
 #endif
@@ -1536,10 +242,10 @@ nvc0_init_query_functions(struct nvc0_context *nvc0)
 {
    struct pipe_context *pipe = &nvc0->base.pipe;
 
-   pipe->create_query = nvc0_query_create;
-   pipe->destroy_query = nvc0_query_destroy;
-   pipe->begin_query = nvc0_query_begin;
-   pipe->end_query = nvc0_query_end;
-   pipe->get_query_result = nvc0_query_result;
+   pipe->create_query = nvc0_create_query;
+   pipe->destroy_query = nvc0_destroy_query;
+   pipe->begin_query = nvc0_begin_query;
+   pipe->end_query = nvc0_end_query;
+   pipe->get_query_result = nvc0_get_query_result;
    pipe->render_condition = nvc0_render_condition;
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h
new file mode 100644
index 00000000000..6883ab6ab9d
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h
@@ -0,0 +1,39 @@
+#ifndef __NVC0_QUERY_H__
+#define __NVC0_QUERY_H__
+
+#include "pipe/p_context.h"
+
+#include "nouveau_context.h"
+
+struct nvc0_context;
+struct nvc0_query;
+
+struct nvc0_query_funcs {
+   void (*destroy_query)(struct nvc0_context *, struct nvc0_query *);
+   boolean (*begin_query)(struct nvc0_context *, struct nvc0_query *);
+   void (*end_query)(struct nvc0_context *, struct nvc0_query *);
+   boolean (*get_query_result)(struct nvc0_context *, struct nvc0_query *,
+                               boolean, union pipe_query_result *);
+};
+
+struct nvc0_query {
+   const struct nvc0_query_funcs *funcs;
+   uint16_t type;
+   uint16_t index;
+};
+
+static inline struct nvc0_query *
+nvc0_query(struct pipe_query *pipe)
+{
+   return (struct nvc0_query *)pipe;
+}
+
+/*
+ * Driver queries groups:
+ */
+#define NVC0_HW_SM_QUERY_GROUP       0
+#define NVC0_SW_QUERY_DRV_STAT_GROUP 1
+
+void nvc0_init_query_functions(struct nvc0_context *);
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
new file mode 100644
index 00000000000..90ee82f21e5
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
@@ -0,0 +1,491 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ * Copyright 2015 Samuel Pitoiset
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
+
+#include "nvc0/nvc0_context.h"
+#include "nvc0/nvc0_query_hw.h"
+#include "nvc0/nvc0_query_hw_metric.h"
+#include "nvc0/nvc0_query_hw_sm.h"
+
+#define NVC0_HW_QUERY_STATE_READY   0
+#define NVC0_HW_QUERY_STATE_ACTIVE  1
+#define NVC0_HW_QUERY_STATE_ENDED   2
+#define NVC0_HW_QUERY_STATE_FLUSHED 3
+
+#define NVC0_HW_QUERY_ALLOC_SPACE 256
+
+bool
+nvc0_hw_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q,
+                       int size) /* size == 0: only release the storage */
+{
+   struct nvc0_hw_query *hq = nvc0_hw_query(q);
+   struct nvc0_screen *screen = nvc0->screen;
+   int ret;
+
+   if (hq->bo) { /* drop whatever storage the query currently holds */
+      nouveau_bo_ref(NULL, &hq->bo);
+      if (hq->mm) {
+         if (hq->state == NVC0_HW_QUERY_STATE_READY)
+            nouveau_mm_free(hq->mm);
+         else /* not READY: hand the free to work queued on the current fence */
+            nouveau_fence_work(screen->base.fence.current,
+                               nouveau_mm_free_work, hq->mm);
+      }
+   }
+   if (size) {
+      hq->mm = nouveau_mm_allocate(screen->base.mm_GART, size, &hq->bo,
+                                   &hq->base_offset);
+      if (!hq->bo) /* no buffer was handed back */
+         return false;
+      hq->offset = hq->base_offset;
+
+      ret = nouveau_bo_map(hq->bo, 0, screen->base.client);
+      if (ret) {
+         nvc0_hw_query_allocate(nvc0, q, 0); /* undo the allocation above */
+         return false;
+      }
+      hq->data = (uint32_t *)((uint8_t *)hq->bo->map + hq->base_offset);
+   }
+   return true; /* also reached when size == 0 */
+}
+
+static void
+nvc0_hw_query_get(struct nouveau_pushbuf *push, struct nvc0_query *q,
+                  unsigned offset, uint32_t get)
+{
+   struct nvc0_hw_query *hq = nvc0_hw_query(q);
+
+   offset += hq->offset;
+
+   PUSH_SPACE(push, 5);
+   PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR);
+   BEGIN_NVC0(push, NVC0_3D(QUERY_ADDRESS_HIGH), 4);
+   PUSH_DATAh(push, hq->bo->offset + offset);
+   PUSH_DATA (push, hq->bo->offset + offset);
+   PUSH_DATA (push, hq->sequence);
+   PUSH_DATA (push, get);
+}
+
+static void
+nvc0_hw_query_rotate(struct nvc0_context *nvc0, struct nvc0_query *q)
+{
+   struct nvc0_hw_query *hq = nvc0_hw_query(q);
+
+   hq->offset += hq->rotate; /* rotate is a byte step */
+   hq->data += hq->rotate / sizeof(*hq->data); /* same step in uint32_t units */
+   if (hq->offset - hq->base_offset == NVC0_HW_QUERY_ALLOC_SPACE)
+      nvc0_hw_query_allocate(nvc0, q, NVC0_HW_QUERY_ALLOC_SPACE); /* NOTE(review): result ignored */
+}
+
+static inline void
+nvc0_hw_query_update(struct nouveau_client *cli, struct nvc0_query *q) /* cli is unused */
+{
+   struct nvc0_hw_query *hq = nvc0_hw_query(q);
+
+   if (hq->is64bit) { /* READY once the fence referenced in end_query signalled */
+      if (nouveau_fence_signalled(hq->fence))
+         hq->state = NVC0_HW_QUERY_STATE_READY;
+   } else { /* READY once the first result word equals our sequence number */
+      if (hq->data[0] == hq->sequence)
+         hq->state = NVC0_HW_QUERY_STATE_READY;
+   }
+}
+
+static void
+nvc0_hw_destroy_query(struct nvc0_context *nvc0, struct nvc0_query *q)
+{
+   struct nvc0_hw_query *hq = nvc0_hw_query(q);
+   nvc0_hw_query_allocate(nvc0, q, 0);
+   nouveau_fence_ref(NULL, &hq->fence);
+   FREE(hq);
+}
+
+static boolean
+nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_hw_query *hq = nvc0_hw_query(q);
+   bool ret = true; /* never modified below */
+
+   if (hq->funcs && hq->funcs->begin_query) /* sub-type hook takes over */
+      return hq->funcs->begin_query(nvc0, hq);
+
+   /* For occlusion queries we have to change the storage, because a previous
+    * query might set the initial render condition to false even *after* we
+    * re-initialized it to true.
+    */
+   if (hq->rotate) {
+      nvc0_hw_query_rotate(nvc0, q);
+
+      /* XXX: can we do this with the GPU, and sync with respect to a previous
+       *  query ?
+       */
+      hq->data[0] = hq->sequence; /* initialize sequence */
+      hq->data[1] = 1; /* initial render condition = true */
+      hq->data[4] = hq->sequence + 1; /* for comparison COND_MODE */
+      hq->data[5] = 0;
+   }
+   hq->sequence++;
+
+   switch (q->type) {
+   case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
+      hq->nesting = nvc0->screen->num_occlusion_queries_active++;
+      if (hq->nesting) { /* another occlusion query is active: no reset */
+         nvc0_hw_query_get(push, q, 0x10, 0x0100f002);
+      } else {
+         PUSH_SPACE(push, 3);
+         BEGIN_NVC0(push, NVC0_3D(COUNTER_RESET), 1);
+         PUSH_DATA (push, NVC0_3D_COUNTER_RESET_SAMPLECNT);
+         IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1);
+      }
+      break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+      nvc0_hw_query_get(push, q, 0x10, 0x09005002 | (q->index << 5));
+      break;
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
+      nvc0_hw_query_get(push, q, 0x10, 0x05805002 | (q->index << 5));
+      break;
+   case PIPE_QUERY_SO_STATISTICS:
+      nvc0_hw_query_get(push, q, 0x20, 0x05805002 | (q->index << 5));
+      nvc0_hw_query_get(push, q, 0x30, 0x06805002 | (q->index << 5));
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      nvc0_hw_query_get(push, q, 0x10, 0x03005002 | (q->index << 5));
+      break;
+   case PIPE_QUERY_TIME_ELAPSED:
+      nvc0_hw_query_get(push, q, 0x10, 0x00005002);
+      break;
+   case PIPE_QUERY_PIPELINE_STATISTICS: /* start values live 0xc0 past the end values */
+      nvc0_hw_query_get(push, q, 0xc0 + 0x00, 0x00801002); /* VFETCH, VERTICES */
+      nvc0_hw_query_get(push, q, 0xc0 + 0x10, 0x01801002); /* VFETCH, PRIMS */
+      nvc0_hw_query_get(push, q, 0xc0 + 0x20, 0x02802002); /* VP, LAUNCHES */
+      nvc0_hw_query_get(push, q, 0xc0 + 0x30, 0x03806002); /* GP, LAUNCHES */
+      nvc0_hw_query_get(push, q, 0xc0 + 0x40, 0x04806002); /* GP, PRIMS_OUT */
+      nvc0_hw_query_get(push, q, 0xc0 + 0x50, 0x07804002); /* RAST, PRIMS_IN */
+      nvc0_hw_query_get(push, q, 0xc0 + 0x60, 0x08804002); /* RAST, PRIMS_OUT */
+      nvc0_hw_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */
+      nvc0_hw_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */
+      nvc0_hw_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */
+      break;
+   default: /* remaining types emit nothing at begin time */
+      break;
+   }
+   hq->state = NVC0_HW_QUERY_STATE_ACTIVE;
+   return ret;
+}
+
+static void
+nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_hw_query *hq = nvc0_hw_query(q);
+
+   if (hq->funcs && hq->funcs->end_query) {
+      hq->funcs->end_query(nvc0, hq);
+      return;
+   }
+
+   if (hq->state != NVC0_HW_QUERY_STATE_ACTIVE) {
+      /* some queries don't require 'begin' to be called (e.g. GPU_FINISHED) */
+      if (hq->rotate)
+         nvc0_hw_query_rotate(nvc0, q);
+      hq->sequence++;
+   }
+   hq->state = NVC0_HW_QUERY_STATE_ENDED;
+
+   switch (q->type) {
+   case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
+      nvc0_hw_query_get(push, q, 0, 0x0100f002);
+      if (--nvc0->screen->num_occlusion_queries_active == 0) {
+         PUSH_SPACE(push, 1);
+         IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 0);
+      }
+      break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+      nvc0_hw_query_get(push, q, 0, 0x09005002 | (q->index << 5));
+      break;
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
+      nvc0_hw_query_get(push, q, 0, 0x05805002 | (q->index << 5));
+      break;
+   case PIPE_QUERY_SO_STATISTICS:
+      nvc0_hw_query_get(push, q, 0x00, 0x05805002 | (q->index << 5));
+      nvc0_hw_query_get(push, q, 0x10, 0x06805002 | (q->index << 5));
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      /* TODO: How do we sum over all streams for render condition ? */
+      /* PRIMS_DROPPED doesn't write sequence, use a ZERO query to sync on */
+      nvc0_hw_query_get(push, q, 0x00, 0x03005002 | (q->index << 5));
+      nvc0_hw_query_get(push, q, 0x20, 0x00005002);
+      break;
+   case PIPE_QUERY_TIMESTAMP:
+   case PIPE_QUERY_TIME_ELAPSED:
+      nvc0_hw_query_get(push, q, 0, 0x00005002);
+      break;
+   case PIPE_QUERY_GPU_FINISHED:
+      nvc0_hw_query_get(push, q, 0, 0x1000f010);
+      break;
+   case PIPE_QUERY_PIPELINE_STATISTICS:
+      nvc0_hw_query_get(push, q, 0x00, 0x00801002); /* VFETCH, VERTICES */
+      nvc0_hw_query_get(push, q, 0x10, 0x01801002); /* VFETCH, PRIMS */
+      nvc0_hw_query_get(push, q, 0x20, 0x02802002); /* VP, LAUNCHES */
+      nvc0_hw_query_get(push, q, 0x30, 0x03806002); /* GP, LAUNCHES */
+      nvc0_hw_query_get(push, q, 0x40, 0x04806002); /* GP, PRIMS_OUT */
+      nvc0_hw_query_get(push, q, 0x50, 0x07804002); /* RAST, PRIMS_IN */
+      nvc0_hw_query_get(push, q, 0x60, 0x08804002); /* RAST, PRIMS_OUT */
+      nvc0_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
+      nvc0_hw_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */
+      nvc0_hw_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */
+      break;
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+      /* This query is not issued on GPU because disjoint is forced to false */
+      hq->state = NVC0_HW_QUERY_STATE_READY;
+      break;
+   case NVC0_HW_QUERY_TFB_BUFFER_OFFSET:
+      /* indexed by TFB buffer instead of by vertex stream */
+      nvc0_hw_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5));
+      break;
+   default:
+      break;
+   }
+   if (hq->is64bit)
+      nouveau_fence_ref(nvc0->screen->base.fence.current, &hq->fence);
+}
+
+static boolean
+nvc0_hw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
+                         boolean wait, union pipe_query_result *result)
+{
+   struct nvc0_hw_query *hq = nvc0_hw_query(q);
+   uint64_t *res64 = (uint64_t*)result;
+   uint32_t *res32 = (uint32_t*)result;
+   uint8_t *res8 = (uint8_t*)result;
+   uint64_t *data64 = (uint64_t *)hq->data;
+   unsigned i;
+
+   if (hq->funcs && hq->funcs->get_query_result)
+      return hq->funcs->get_query_result(nvc0, hq, wait, result);
+
+   if (hq->state != NVC0_HW_QUERY_STATE_READY)
+      nvc0_hw_query_update(nvc0->screen->base.client, q);
+
+   if (hq->state != NVC0_HW_QUERY_STATE_READY) {
+      if (!wait) {
+         if (hq->state != NVC0_HW_QUERY_STATE_FLUSHED) {
+            hq->state = NVC0_HW_QUERY_STATE_FLUSHED;
+            /* flush for silly apps that spin on GL_QUERY_RESULT_AVAILABLE */
+            PUSH_KICK(nvc0->base.pushbuf);
+         }
+         return false;
+      }
+      if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->screen->base.client))
+         return false;
+      NOUVEAU_DRV_STAT(&nvc0->screen->base, query_sync_count, 1);
+   }
+   hq->state = NVC0_HW_QUERY_STATE_READY;
+
+   switch (q->type) {
+   case PIPE_QUERY_GPU_FINISHED:
+      res8[0] = true;
+      break;
+   case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
+      res64[0] = hq->data[1] - hq->data[5];
+      break;
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
+      res8[0] = hq->data[1] != hq->data[5];
+      break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
+   case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */
+      res64[0] = data64[0] - data64[2];
+      break;
+   case PIPE_QUERY_SO_STATISTICS:
+      res64[0] = data64[0] - data64[4];
+      res64[1] = data64[2] - data64[6];
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      res8[0] = data64[0] != data64[2];
+      break;
+   case PIPE_QUERY_TIMESTAMP:
+      res64[0] = data64[1];
+      break;
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+      res64[0] = 1000000000;
+      res8[8] = false;
+      break;
+   case PIPE_QUERY_TIME_ELAPSED:
+      res64[0] = data64[1] - data64[3];
+      break;
+   case PIPE_QUERY_PIPELINE_STATISTICS:
+      for (i = 0; i < 10; ++i)
+         res64[i] = data64[i * 2] - data64[24 + i * 2];
+      break;
+   case NVC0_HW_QUERY_TFB_BUFFER_OFFSET:
+      res32[0] = hq->data[1];
+      break;
+   default:
+      assert(0); /* can't happen, we don't create queries with invalid type */
+      return false;
+   }
+
+   return true;
+}
+
+static const struct nvc0_query_funcs hw_query_funcs = {
+   .destroy_query = nvc0_hw_destroy_query,
+   .begin_query = nvc0_hw_begin_query,
+   .end_query = nvc0_hw_end_query,
+   .get_query_result = nvc0_hw_get_query_result,
+};
+
+/* Create a hardware query object for the given type; NULL on failure.
+ * The SM counter and metric constructors get the first chance at the type.
+ */
+struct nvc0_query *
+nvc0_hw_create_query(struct nvc0_context *nvc0, unsigned type, unsigned index)
+{
+   struct nvc0_hw_query *hq;
+   struct nvc0_query *q;
+   unsigned space = NVC0_HW_QUERY_ALLOC_SPACE;
+
+   hq = nvc0_hw_sm_create_query(nvc0, type);
+   if (!hq)
+      hq = nvc0_hw_metric_create_query(nvc0, type);
+   if (hq) {
+      hq->base.funcs = &hw_query_funcs;
+      return &hq->base;
+   }
+
+   hq = CALLOC_STRUCT(nvc0_hw_query);
+   if (!hq)
+      return NULL;
+
+   q = &hq->base;
+   q->funcs = &hw_query_funcs;
+   q->type = type;
+   /* Keep the index for every type: begin/end also encode q->index for
+    * PIPE_QUERY_SO_STATISTICS and PIPE_QUERY_SO_OVERFLOW_PREDICATE. */
+   q->index = index;
+
+   switch (q->type) {
+   case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
+      hq->rotate = 32;
+      break;
+   case PIPE_QUERY_PIPELINE_STATISTICS:
+      hq->is64bit = true;
+      space = 512;
+      break;
+   case PIPE_QUERY_SO_STATISTICS:
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      hq->is64bit = true;
+      space = 64;
+      break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
+      hq->is64bit = true;
+      space = 32;
+      break;
+   case PIPE_QUERY_TIME_ELAPSED:
+   case PIPE_QUERY_TIMESTAMP:
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+   case PIPE_QUERY_GPU_FINISHED:
+      space = 32;
+      break;
+   case NVC0_HW_QUERY_TFB_BUFFER_OFFSET:
+      space = 16;
+      break;
+   default:
+      debug_printf("invalid query type: %u\n", type);
+      FREE(hq);
+      return NULL;
+   }
+
+   if (!nvc0_hw_query_allocate(nvc0, q, space)) {
+      FREE(hq);
+      return NULL;
+   }
+
+   if (hq->rotate) {
+      /* we advance before query_begin ! */
+      hq->offset -= hq->rotate;
+      hq->data -= hq->rotate / sizeof(*hq->data);
+   } else
+   if (!hq->is64bit)
+      hq->data[0] = 0; /* initialize sequence */
+
+   return q;
+}
+
+/* Report the SM counters first, then the metrics; a NULL info asks for the
+ * total number of hardware driver queries. */
+int
+nvc0_hw_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
+                              struct pipe_driver_query_info *info)
+{
+   const int sm_count = nvc0_hw_sm_get_driver_query_info(screen, 0, NULL);
+   const int metric_count =
+      nvc0_hw_metric_get_driver_query_info(screen, 0, NULL);
+
+   if (!info)
+      return sm_count + metric_count;
+
+   /* ids past the SM range are rebased onto the metric list */
+   if (id >= sm_count)
+      return nvc0_hw_metric_get_driver_query_info(screen, id - sm_count, info);
+
+   return nvc0_hw_sm_get_driver_query_info(screen, id, info);
+}
+
+void
+nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *push,
+                             struct nvc0_query *q, unsigned result_offset)
+{
+   struct nvc0_hw_query *hq = nvc0_hw_query(q);
+
+#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
+
+   PUSH_REFN(push, hq->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART);
+   nouveau_pushbuf_space(push, 0, 0, 1);
+   nouveau_pushbuf_data(push, hq->bo, hq->offset + result_offset, 4 |
+                        NVC0_IB_ENTRY_1_NO_PREFETCH);
+}
+
+void
+nvc0_hw_query_fifo_wait(struct nouveau_pushbuf *push, struct nvc0_query *q)
+{
+   struct nvc0_hw_query *hq = nvc0_hw_query(q);
+   unsigned offset = hq->offset;
+
+   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) offset += 0x20;
+
+   PUSH_SPACE(push, 5);
+   PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+   BEGIN_NVC0(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4);
+   PUSH_DATAh(push, hq->bo->offset + offset);
+   PUSH_DATA (push, hq->bo->offset + offset);
+   PUSH_DATA (push, hq->sequence);
+   PUSH_DATA (push, (1 << 12) |
+              NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h
new file mode 100644
index 00000000000..3701eb7100f
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h
@@ -0,0 +1,56 @@
+#ifndef __NVC0_QUERY_HW_H__
+#define __NVC0_QUERY_HW_H__
+
+#include "nouveau_fence.h"
+#include "nouveau_mm.h"
+
+#include "nvc0_query.h"
+
+#define NVC0_HW_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
+
+struct nvc0_hw_query;
+
+struct nvc0_hw_query_funcs {
+   void (*destroy_query)(struct nvc0_context *, struct nvc0_hw_query *);
+   boolean (*begin_query)(struct nvc0_context *, struct nvc0_hw_query *);
+   void (*end_query)(struct nvc0_context *, struct nvc0_hw_query *);
+   boolean (*get_query_result)(struct nvc0_context *, struct nvc0_hw_query *,
+                               boolean, union pipe_query_result *);
+};
+
+struct nvc0_hw_query {
+   struct nvc0_query base; /* must stay first: nvc0_hw_query() casts to it */
+   const struct nvc0_hw_query_funcs *funcs; /* optional hooks, may be NULL */
+   uint32_t *data; /* points into bo->map at the current slot */
+   uint32_t sequence; /* pushed with every query GET */
+   struct nouveau_bo *bo; /* buffer the results are written to */
+   uint32_t base_offset; /* start of this query's allocation in bo */
+   uint32_t offset; /* base_offset + i * rotate */
+   uint8_t state; /* NVC0_HW_QUERY_STATE_* from nvc0_query_hw.c */
+   boolean is64bit; /* readiness is checked via fence, not data[0] */
+   uint8_t rotate; /* bytes to advance per begin; 0 = no rotation */
+   int nesting; /* only used for occlusion queries */
+   struct nouveau_mm_allocation *mm; /* sub-allocation returned with bo */
+   struct nouveau_fence *fence; /* referenced in end_query when is64bit */
+};
+
+static inline struct nvc0_hw_query *
+nvc0_hw_query(struct nvc0_query *q)
+{
+   return (struct nvc0_hw_query *)q;
+}
+
+struct nvc0_query *
+nvc0_hw_create_query(struct nvc0_context *, unsigned, unsigned);
+int
+nvc0_hw_get_driver_query_info(struct nvc0_screen *, unsigned,
+                              struct pipe_driver_query_info *);
+bool
+nvc0_hw_query_allocate(struct nvc0_context *, struct nvc0_query *, int);
+void
+nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *, struct nvc0_query *,
+                             unsigned);
+void
+nvc0_hw_query_fifo_wait(struct nouveau_pushbuf *, struct nvc0_query *);
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
new file mode 100644
index 00000000000..25aa09be42a
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
@@ -0,0 +1,440 @@
+/*
+ * Copyright 2015 Samuel Pitoiset
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nvc0/nvc0_context.h"
+#include "nvc0/nvc0_query_hw_metric.h"
+#include "nvc0/nvc0_query_hw_sm.h"
+
+/* === PERFORMANCE MONITORING METRICS for NVC0:NVE4 === */
+static const char *nvc0_hw_metric_names[] =
+{
+   "metric-achieved_occupancy",
+   "metric-branch_efficiency",
+   "metric-inst_issued",
+   "metric-inst_per_wrap",
+   "metric-inst_replay_overhead",
+   "metric-issued_ipc",
+   "metric-issue_slots",
+   "metric-issue_slot_utilization",
+   "metric-ipc",
+};
+
+struct nvc0_hw_metric_query_cfg {
+   uint32_t queries[8];
+   uint32_t num_queries;
+};
+
+#define _SM(n) NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_ ##n)
+#define _M(n, c) [NVC0_HW_METRIC_QUERY_##n] = c
+
+/* ==== Compute capability 2.0 (GF100/GF110) ==== */
+static const struct nvc0_hw_metric_query_cfg
+sm20_achieved_occupancy =
+{
+   .queries[0]  = _SM(ACTIVE_WARPS),
+   .queries[1]  = _SM(ACTIVE_CYCLES),
+   .num_queries = 2,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm20_branch_efficiency =
+{
+   .queries[0]  = _SM(BRANCH),
+   .queries[1]  = _SM(DIVERGENT_BRANCH),
+   .num_queries = 2,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm20_inst_per_wrap =
+{
+   .queries[0]  = _SM(INST_EXECUTED),
+   .queries[1]  = _SM(WARPS_LAUNCHED),
+   .num_queries = 2,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm20_inst_replay_overhead =
+{
+   .queries[0]  = _SM(INST_ISSUED),
+   .queries[1]  = _SM(INST_EXECUTED),
+   .num_queries = 2,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm20_issued_ipc =
+{
+   .queries[0]  = _SM(INST_ISSUED),
+   .queries[1]  = _SM(ACTIVE_CYCLES),
+   .num_queries = 2,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm20_ipc =
+{
+   .queries[0]  = _SM(INST_EXECUTED),
+   .queries[1]  = _SM(ACTIVE_CYCLES),
+   .num_queries = 2,
+};
+
+static const struct nvc0_hw_metric_query_cfg *sm20_hw_metric_queries[] =
+{
+   _M(ACHIEVED_OCCUPANCY,     &sm20_achieved_occupancy),
+   _M(BRANCH_EFFICIENCY,      &sm20_branch_efficiency),
+   _M(INST_ISSUED,            NULL),
+   _M(INST_PER_WRAP,          &sm20_inst_per_wrap),
+   _M(INST_REPLAY_OVERHEAD,   &sm20_inst_replay_overhead),
+   _M(ISSUED_IPC,             &sm20_issued_ipc),
+   _M(ISSUE_SLOTS,            NULL),
+   _M(ISSUE_SLOT_UTILIZATION, &sm20_issued_ipc),
+   _M(IPC,                    &sm20_ipc),
+};
+
+/* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
+static const struct nvc0_hw_metric_query_cfg
+sm21_inst_issued =
+{
+   .queries[0]  = _SM(INST_ISSUED1_0),
+   .queries[1]  = _SM(INST_ISSUED1_1),
+   .queries[2]  = _SM(INST_ISSUED2_0),
+   .queries[3]  = _SM(INST_ISSUED2_1),
+   .num_queries = 4,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm21_inst_replay_overhead =
+{
+   .queries[0]  = _SM(INST_ISSUED1_0),
+   .queries[1]  = _SM(INST_ISSUED1_1),
+   .queries[2]  = _SM(INST_ISSUED2_0),
+   .queries[3]  = _SM(INST_ISSUED2_1),
+   .queries[4]  = _SM(INST_EXECUTED),
+   .num_queries = 5,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm21_issued_ipc =
+{
+   .queries[0]  = _SM(INST_ISSUED1_0),
+   .queries[1]  = _SM(INST_ISSUED1_1),
+   .queries[2]  = _SM(INST_ISSUED2_0),
+   .queries[3]  = _SM(INST_ISSUED2_1),
+   .queries[4]  = _SM(ACTIVE_CYCLES),
+   .num_queries = 5,
+};
+
+static const struct nvc0_hw_metric_query_cfg *sm21_hw_metric_queries[] =
+{
+   _M(ACHIEVED_OCCUPANCY,     &sm20_achieved_occupancy),
+   _M(BRANCH_EFFICIENCY,      &sm20_branch_efficiency),
+   _M(INST_ISSUED,            &sm21_inst_issued),
+   _M(INST_PER_WRAP,          &sm20_inst_per_wrap),
+   _M(INST_REPLAY_OVERHEAD,   &sm21_inst_replay_overhead),
+   _M(ISSUED_IPC,             &sm21_issued_ipc),
+   _M(ISSUE_SLOTS,            &sm21_inst_issued),
+   _M(ISSUE_SLOT_UTILIZATION, &sm21_issued_ipc),
+   _M(IPC,                    &sm20_ipc),
+};
+
+#undef _SM
+#undef _M
+
+static inline const struct nvc0_hw_metric_query_cfg **
+nvc0_hw_metric_get_queries(struct nvc0_screen *screen)
+{
+   struct nouveau_device *dev = screen->base.device;
+
+   if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
+      return sm20_hw_metric_queries;
+   return sm21_hw_metric_queries;
+}
+
+static const struct nvc0_hw_metric_query_cfg *
+nvc0_hw_metric_query_get_cfg(struct nvc0_context *nvc0,
+                             struct nvc0_hw_query *hq)
+{
+   const struct nvc0_hw_metric_query_cfg **queries;
+   struct nvc0_screen *screen = nvc0->screen;
+   struct nvc0_query *q = &hq->base;
+
+   queries = nvc0_hw_metric_get_queries(screen);
+   return queries[q->type - NVC0_HW_METRIC_QUERY(0)];
+}
+
+static void
+nvc0_hw_metric_destroy_query(struct nvc0_context *nvc0,
+                             struct nvc0_hw_query *hq)
+{
+   struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
+   unsigned i;
+
+   for (i = 0; i < hmq->num_queries; i++)
+      hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]);
+   FREE(hmq);
+}
+
+static boolean
+nvc0_hw_metric_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
+{
+   struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
+   boolean ret = false;
+   unsigned i;
+
+   for (i = 0; i < hmq->num_queries; i++) {
+      ret = hmq->queries[i]->funcs->begin_query(nvc0, hmq->queries[i]);
+      if (!ret)
+         return ret;
+   }
+   return ret;
+}
+
+static void
+nvc0_hw_metric_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
+{
+   struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
+   unsigned i;
+
+   for (i = 0; i < hmq->num_queries; i++)
+      hmq->queries[i]->funcs->end_query(nvc0, hmq->queries[i]);
+}
+
+static uint64_t /* NOTE(review): double results are truncated on return */
+sm20_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
+{
+   switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
+   case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
+      /* (active_warps / active_cycles) / max. number of warps on a MP */
+      if (res64[1])
+         return (res64[0] / (double)res64[1]) / 48;
+      break;
+   case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
+      /* (branch / (branch + divergent_branch)) * 100 */
+      if (res64[0] + res64[1])
+         return (res64[0] / (double)(res64[0] + res64[1])) * 100;
+      break;
+   case NVC0_HW_METRIC_QUERY_INST_PER_WRAP:
+      /* inst_executed / warps_launched */
+      if (res64[1])
+         return res64[0] / (double)res64[1];
+      break;
+   case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD:
+      /* (inst_issued - inst_executed) / inst_executed; unsigned subtract */
+      if (res64[1])
+         return (res64[0] - res64[1]) / (double)res64[1];
+      break;
+   case NVC0_HW_METRIC_QUERY_ISSUED_IPC:
+      /* inst_issued / active_cycles */
+      if (res64[1])
+         return res64[0] / (double)res64[1];
+      break;
+   case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION:
+      /* ((inst_issued / 2) / active_cycles) * 100; the / 2 is integer */
+      if (res64[1])
+         return ((res64[0] / 2) / (double)res64[1]) * 100;
+      break;
+   case NVC0_HW_METRIC_QUERY_IPC:
+      /* inst_executed / active_cycles */
+      if (res64[1])
+         return res64[0] / (double)res64[1];
+      break;
+   default:
+      debug_printf("invalid metric type: %d\n",
+                   hq->base.type - NVC0_HW_METRIC_QUERY(0));
+      break;
+   }
+   return 0; /* zero denominator or unknown metric */
+}
+
+static uint64_t
+sm21_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
+{
+   /* Metric formulas for compute capability 2.1; res64[] holds the results
+    * of the sub-queries in config order.  Two sums of the first four results
+    * (issued1_0, issued1_1, issued2_0, issued2_1) are shared by several
+    * metrics:
+    *  - slots:  each of the four results counts once;
+    *  - issued: the issued2_* results count twice. */
+   const uint64_t slots = res64[0] + res64[1] + res64[2] + res64[3];
+   const uint64_t issued = res64[0] + res64[1] + (res64[2] + res64[3]) * 2;
+
+   switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
+   case NVC0_HW_METRIC_QUERY_INST_ISSUED:
+      return issued;
+   case NVC0_HW_METRIC_QUERY_ISSUE_SLOTS:
+      return slots;
+   case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD:
+      /* (metric-inst_issued - inst_executed) / inst_executed */
+      if (res64[4])
+         return (issued - res64[4]) / (double)res64[4];
+      break;
+   case NVC0_HW_METRIC_QUERY_ISSUED_IPC:
+      /* metric-inst_issued / active_cycles */
+      if (res64[4])
+         return issued / (double)res64[4];
+      break;
+   case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION:
+      /* ((metric-issue_slots / 2) / active_cycles) * 100 */
+      if (res64[4])
+         return ((slots / 2) / (double)res64[4]) * 100;
+      break;
+   case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
+   case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
+   case NVC0_HW_METRIC_QUERY_INST_PER_WRAP:
+   case NVC0_HW_METRIC_QUERY_IPC:
+      /* these use the same formula as compute capability 2.0 */
+      return sm20_hw_metric_calc_result(hq, res64);
+   default:
+      debug_printf("invalid metric type: %d\n",
+                   hq->base.type - NVC0_HW_METRIC_QUERY(0));
+      break;
+   }
+   return 0;
+}
+
+static boolean
+nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0,
+                                struct nvc0_hw_query *hq, boolean wait,
+                                union pipe_query_result *result)
+{
+   struct nvc0_hw_metric_query *metric = nvc0_hw_metric_query(hq);
+   const struct nouveau_device *dev = nvc0->screen->base.device;
+   union pipe_query_result sub_results[8] = {};
+   uint64_t counters[8] = {};
+   boolean ok = false;
+   unsigned idx;
+
+   /* Collect the raw 64-bit value of every sub-query; give up as soon as
+    * one of them fails to deliver a result (result is left untouched). */
+   for (idx = 0; idx < metric->num_queries; idx++) {
+      struct nvc0_hw_query *sub = metric->queries[idx];
+
+      ok = sub->funcs->get_query_result(nvc0, sub, wait, &sub_results[idx]);
+      if (!ok)
+         return ok;
+      counters[idx] = *(uint64_t *)&sub_results[idx];
+   }
+
+   /* Chipsets 0xc0/0xc8 use the sm20 formulas, all others the sm21 ones. */
+   if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
+      *(uint64_t *)result = sm20_hw_metric_calc_result(hq, counters);
+   else
+      *(uint64_t *)result = sm21_hw_metric_calc_result(hq, counters);
+   return ok;
+}
+
+static const struct nvc0_hw_query_funcs hw_metric_query_funcs = {
+   .begin_query = nvc0_hw_metric_begin_query, /* fans out to sub-queries */
+   .end_query = nvc0_hw_metric_end_query,
+   .get_query_result = nvc0_hw_metric_get_query_result,
+   .destroy_query = nvc0_hw_metric_destroy_query,
+};
+
+struct nvc0_hw_query *
+nvc0_hw_metric_create_query(struct nvc0_context *nvc0, unsigned type)
+{
+   const struct nvc0_hw_metric_query_cfg *cfg;
+   struct nvc0_hw_metric_query *hmq;
+   unsigned i;
+
+   /* Returns a metric query wrapping its SM sub-queries, or NULL on error. */
+   if (type < NVC0_HW_METRIC_QUERY(0) || type > NVC0_HW_METRIC_QUERY_LAST)
+      return NULL; /* not a metric query type */
+   hmq = CALLOC_STRUCT(nvc0_hw_metric_query);
+   if (!hmq)
+      return NULL;
+   hmq->base.funcs = &hw_metric_query_funcs;
+   hmq->base.base.type = type;
+
+   cfg = nvc0_hw_metric_query_get_cfg(nvc0, &hmq->base);
+   if (!cfg)
+      goto fail; /* NULL table entry: metric unsupported on this chipset */
+
+   for (i = 0; i < cfg->num_queries; i++) {
+      hmq->queries[i] = nvc0_hw_sm_create_query(nvc0, cfg->queries[i]);
+      if (!hmq->queries[i])
+         goto fail;
+      hmq->num_queries++; /* only count sub-queries that really exist */
+   }
+   return &hmq->base;
+
+fail:
+   nvc0_hw_metric_destroy_query(nvc0, &hmq->base); /* frees hmq as well */
+   return NULL;
+}
+
+static int
+nvc0_hw_metric_get_next_query_id(const struct nvc0_hw_metric_query_cfg **queries,
+                                 unsigned id)
+{
+   unsigned i, avail = 0;
+
+   /* Return the table index of the id-th non-NULL (available) entry, so
+    * that consecutive ids enumerate exactly the available metrics even
+    * when the NULL slots are scattered through the table. */
+   for (i = 0; i < NVC0_HW_METRIC_QUERY_COUNT; i++) {
+      if (queries[i] && avail++ == id)
+         return i;
+   }
+   /* id must be below the number of available entries; clamp otherwise */
+   return NVC0_HW_METRIC_QUERY_COUNT - 1;
+}
+
+int
+nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
+                                     struct pipe_driver_query_info *info)
+{
+   const struct nvc0_hw_metric_query_cfg **queries;
+   uint16_t class_3d = screen->base.class_3d;
+   int count = 0;
+   unsigned i;
+
+   /* With info == NULL, return the number of available metrics; otherwise
+    * describe metric number id and return 1, or 0 if there is none.
+    * Metrics are only exposed when all of the following hold:
+    *  - the DRM version is at least 0x01000101,
+    *  - the screen has a compute object,
+    *  - the 3D class is below NVE4_3D_CLASS. */
+   if (screen->base.device->drm_version < 0x01000101)
+      return 0;
+   if (!screen->compute || class_3d >= NVE4_3D_CLASS)
+      return 0;
+
+   /* NULL entries of the per-chipset table are unavailable metrics. */
+   queries = nvc0_hw_metric_get_queries(screen);
+   for (i = 0; i < NVC0_HW_METRIC_QUERY_COUNT; i++) {
+      if (queries[i])
+         count++;
+   }
+
+   /* Without an info struct the caller only wants the number of metrics. */
+   if (!info)
+      return count;
+
+   if (id >= count)
+      return 0;
+
+   /* id enumerates available metrics only; turn it into a table index. */
+   id = nvc0_hw_metric_get_next_query_id(queries, id);
+   info->name = nvc0_hw_metric_names[id];
+   info->query_type = NVC0_HW_METRIC_QUERY(id);
+   info->group_id = -1;
+   return 1;
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h
new file mode 100644
index 00000000000..95675fd19b7
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h
@@ -0,0 +1,42 @@
+#ifndef __NVC0_QUERY_HW_METRIC_H__
+#define __NVC0_QUERY_HW_METRIC_H__
+
+#include "nvc0_query_hw.h"
+
+struct nvc0_hw_metric_query {
+   struct nvc0_hw_query base; /* first member: nvc0_hw_metric_query() casts */
+   struct nvc0_hw_query *queries[8]; /* sub-queries feeding the metric */
+   unsigned num_queries; /* number of valid entries in queries[] */
+};
+
+static inline struct nvc0_hw_metric_query *
+nvc0_hw_metric_query(struct nvc0_hw_query *hq)
+{
+   return (struct nvc0_hw_metric_query *)hq; /* hq must be the base member */
+}
+
+/*
+ * Driver metric queries; the query type is NVC0_HW_METRIC_QUERY(enum value).
+ */
+#define NVC0_HW_METRIC_QUERY(i)   (PIPE_QUERY_DRIVER_SPECIFIC + 3072 + (i))
+#define NVC0_HW_METRIC_QUERY_LAST  NVC0_HW_METRIC_QUERY(NVC0_HW_METRIC_QUERY_COUNT - 1)
+enum nvc0_hw_metric_queries
+{
+    NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY = 0,
+    NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY,
+    NVC0_HW_METRIC_QUERY_INST_ISSUED,
+    NVC0_HW_METRIC_QUERY_INST_PER_WRAP, /* sic: inst_executed / warps_launched */
+    NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD,
+    NVC0_HW_METRIC_QUERY_ISSUED_IPC,
+    NVC0_HW_METRIC_QUERY_ISSUE_SLOTS,
+    NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION,
+    NVC0_HW_METRIC_QUERY_IPC,
+    NVC0_HW_METRIC_QUERY_COUNT /* number of metrics; not a query itself */
+};
+
+struct nvc0_hw_query *
+nvc0_hw_metric_create_query(struct nvc0_context *nvc0, unsigned type);
+int
+nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
+                                     struct pipe_driver_query_info *info);
+#endif
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
new file mode 100644
index 00000000000..44b222e5134
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
@@ -0,0 +1,1387 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ * Copyright 2015 Samuel Pitoiset
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
+
+#include "nvc0/nvc0_context.h"
+#include "nvc0/nvc0_query_hw_sm.h"
+
+#include "nv_object.xml.h"
+#include "nvc0/nve4_compute.xml.h"
+#include "nvc0/nvc0_compute.xml.h"
+
+/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
+
+/* NOTE: intentionally using the same names as NV */
+static const char *nve4_hw_sm_query_names[] =
+{
+   /* MP counters */
+   "active_cycles",
+   "active_warps",
+   "atom_count",
+   "branch",
+   "divergent_branch",
+   "gld_request",
+   "global_ld_mem_divergence_replays",
+   "global_store_transaction",
+   "global_st_mem_divergence_replays",
+   "gred_count",
+   "gst_request",
+   "inst_executed",
+   "inst_issued",
+   "inst_issued1",
+   "inst_issued2",
+   "l1_global_load_hit",
+   "l1_global_load_miss",
+   "l1_local_load_hit",
+   "l1_local_load_miss",
+   "l1_local_store_hit",
+   "l1_local_store_miss",
+   "l1_shared_load_transactions",
+   "l1_shared_store_transactions",
+   "local_load",
+   "local_load_transactions",
+   "local_store",
+   "local_store_transactions",
+   "prof_trigger_00",
+   "prof_trigger_01",
+   "prof_trigger_02",
+   "prof_trigger_03",
+   "prof_trigger_04",
+   "prof_trigger_05",
+   "prof_trigger_06",
+   "prof_trigger_07",
+   "shared_load",
+   "shared_load_replay",
+   "shared_store",
+   "shared_store_replay",
+   "sm_cta_launched",
+   "threads_launched",
+   "uncached_global_load_transaction",
+   "warps_launched",
+   /* metrics, i.e. functions of the MP counters */
+   "metric-ipc",                   /* inst_executed, clock */
+   "metric-ipac",                  /* inst_executed, active_cycles */
+   "metric-ipec",                  /* inst_executed, (bool)inst_executed */
+   "metric-achieved_occupancy",    /* active_warps, active_cycles */
+   "metric-sm_efficiency",         /* active_cycles, clock */
+   "metric-inst_replay_overhead"   /* inst_issued, inst_executed */
+};
+
+/* Code to read out MP counters: They are accessible via mmio, too, but let's
+ * just avoid mapping registers in userspace. We'd have to know which MPs are
+ * enabled/present, too, and that information is not presently exposed.
+ * We could add a kernel interface for it, but reading the counters like this
+ * has the advantage of being async (if get_result isn't called immediately).
+ */
+static const uint64_t nve4_read_hw_sm_counters_code[] =
+{
+   /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
+    * mov b32 $r8 $tidx
+    * mov b32 $r12 $physid
+    * mov b32 $r0 $pm0
+    * mov b32 $r1 $pm1
+    * mov b32 $r2 $pm2
+    * mov b32 $r3 $pm3
+    * mov b32 $r4 $pm4
+    * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
+    * mov b32 $r5 $pm5
+    * mov b32 $r6 $pm6
+    * mov b32 $r7 $pm7
+    * set $p0 0x1 eq u32 $r8 0x0
+    * mov b32 $r10 c0[0x0]
+    * ext u32 $r8 $r12 0x414
+    * mov b32 $r11 c0[0x4]
+    * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
+    * ext u32 $r9 $r12 0x208
+    * (not $p0) exit
+    * set $p1 0x1 eq u32 $r9 0x0
+    * mul $r8 u32 $r8 u32 96
+    * mul $r12 u32 $r9 u32 16
+    * mul $r13 u32 $r9 u32 4
+    * add b32 $r9 $r8 $r13
+    * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
+    * add b32 $r8 $r8 $r12
+    * mov b32 $r12 $r10
+    * add b32 $r10 $c $r10 $r8
+    * mov b32 $r13 $r11
+    * add b32 $r11 $r11 0x0 $c
+    * add b32 $r12 $c $r12 $r9
+    * st b128 wt g[$r10d] $r0q
+    * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
+    * mov b32 $r0 c0[0x8]
+    * add b32 $r13 $r13 0x0 $c
+    * $p1 st b128 wt g[$r12d+0x40] $r4q
+    * st b32 wt g[$r12d+0x50] $r0
+    * exit */
+   0x2202020202020207ULL,
+   0x2c00000084021c04ULL,
+   0x2c0000000c031c04ULL,
+   0x2c00000010001c04ULL,
+   0x2c00000014005c04ULL,
+   0x2c00000018009c04ULL,
+   0x2c0000001c00dc04ULL,
+   0x2c00000020011c04ULL,
+   0x22b0420042320207ULL,
+   0x2c00000024015c04ULL,
+   0x2c00000028019c04ULL,
+   0x2c0000002c01dc04ULL,
+   0x190e0000fc81dc03ULL,
+   0x2800400000029de4ULL,
+   0x7000c01050c21c03ULL,
+   0x280040001002dde4ULL,
+   0x204282020042e047ULL,
+   0x7000c00820c25c03ULL,
+   0x80000000000021e7ULL,
+   0x190e0000fc93dc03ULL,
+   0x1000000180821c02ULL,
+   0x1000000040931c02ULL,
+   0x1000000010935c02ULL,
+   0x4800000034825c03ULL,
+   0x22c042c042c04287ULL,
+   0x4800000030821c03ULL,
+   0x2800000028031de4ULL,
+   0x4801000020a29c03ULL,
+   0x280000002c035de4ULL,
+   0x0800000000b2dc42ULL,
+   0x4801000024c31c03ULL,
+   0x9400000000a01fc5ULL,
+   0x200002e04202c047ULL,
+   0x2800400020001de4ULL,
+   0x0800000000d35c42ULL,
+   0x9400000100c107c5ULL,
+   0x9400000140c01f85ULL,
+   0x8000000000001de7ULL
+};
+
+/* For simplicity, we will allocate as many group slots as we allocate counter
+ * slots. This means that a single counter which wants to source from 2 groups
+ * will have to be declared as using 2 counter slots. This shouldn't really be
+ * a problem because such queries don't make much sense ... (unless someone is
+ * really creative).
+ */
+struct nvc0_hw_sm_counter_cfg /* filled positionally: keep the field order */
+{
+   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
+   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
+   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
+   uint32_t sig_sel : 8;  /* signal group */
+   uint32_t src_mask;     /* mask for signal selection (only for NVC0:NVE4) */
+   uint32_t src_sel;      /* signal selection for up to 4 sources */
+};
+
+#define NVC0_COUNTER_OPn_SUM            0
+#define NVC0_COUNTER_OPn_OR             1
+#define NVC0_COUNTER_OPn_AND            2
+#define NVC0_COUNTER_OP2_REL_SUM_MM     3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
+#define NVC0_COUNTER_OP2_DIV_SUM_M0     4 /* sum(ctr0) / ctr1 of MP[0] */
+#define NVC0_COUNTER_OP2_AVG_DIV_MM     5 /* avg(ctr0 / ctr1) */
+#define NVC0_COUNTER_OP2_AVG_DIV_M0     6 /* avg(ctr0) / ctr1 of MP[0] */
+
+struct nvc0_hw_sm_query_cfg
+{
+   struct nvc0_hw_sm_counter_cfg ctr[8]; /* counter setups, filled from 0 */
+   uint8_t num_counters; /* number of ctr[] entries the tables fill in */
+   uint8_t op; /* one of the NVC0_COUNTER_OP* values */
+   uint8_t norm[2]; /* normalization num,denom */
+};
+
+#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
+#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
+#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
+   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \
+   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, 0, s1 }, \
+   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
+#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
+   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, 0, s0 }, \
+   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \
+   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
+#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
+   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \
+   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \
+   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
+
+/* NOTES:
+ * active_warps: bit 0 alternates between 0 and 1 for an odd number of warps
+ * inst_executed etc.: we only count a single warp scheduler
+ * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
+ *  this is inaccurate!
+ */
+static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] =
+{
+   _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
+   _Q1B(ACTIVE_WARPS,  0x003f, B6, WARP, 0x31483104, 2, 1),
+   _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1),
+   _Q1A(BRANCH,           0x0001, B6, BRANCH, 0x0000000c, 1, 1),
+   _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
+   _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
+   _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1),
+   _Q1B(GST_TRANSACTIONS,          0x0001, B6, MEM, 0x00000004, 1, 1),
+   _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1),
+   _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1),
+   _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
+   _Q1A(INST_EXECUTED, 0x0003, B6, EXEC,  0x00000398, 1, 1),
+   _Q1A(INST_ISSUED,   0x0003, B6, ISSUE, 0x00000104, 1, 1),
+   _Q1A(INST_ISSUED1,  0x0001, B6, ISSUE, 0x00000004, 1, 1),
+   _Q1A(INST_ISSUED2,  0x0001, B6, ISSUE, 0x00000008, 1, 1),
+   _Q1B(L1_GLD_HIT,  0x0001, B6, L1, 0x00000010, 1, 1),
+   _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
+   _Q1B(L1_LOCAL_LD_HIT,   0x0001, B6, L1, 0x00000000, 1, 1),
+   _Q1B(L1_LOCAL_LD_MISS,  0x0001, B6, L1, 0x00000004, 1, 1),
+   _Q1B(L1_LOCAL_ST_HIT,  0x0001, B6, L1, 0x00000008, 1, 1),
+   _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
+   _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
+   _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
+   _Q1A(LOCAL_LD,    0x0001, B6, LDST, 0x00000008, 1, 1),
+   _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
+   _Q1A(LOCAL_ST,    0x0001, B6, LDST, 0x0000000c, 1, 1),
+   _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
+   _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1),
+   _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1),
+   _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1),
+   _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1),
+   _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1),
+   _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1),
+   _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1),
+   _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1),
+   _Q1A(SHARED_LD,   0x0001, B6, LDST, 0x00000000, 1, 1),
+   _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1),
+   _Q1A(SHARED_ST,   0x0001, B6, LDST, 0x00000004, 1, 1),
+   _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1),
+   _Q1B(SM_CTA_LAUNCHED,      0x0001, B6, WARP, 0x0000001c, 1, 1),
+   _Q1A(THREADS_LAUNCHED,  0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
+   _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1),
+   _Q1A(WARPS_LAUNCHED,    0x0001, B6, LAUNCH, 0x00000004, 1, 1),
+   _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1),
+   _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1),
+   _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1),
+   _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1),
+   _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64),
+   _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1),
+};
+
+#undef _Q1A /* the table helper macros are local to the table above */
+#undef _Q1B
+#undef _M2A
+#undef _M2B
+#undef _M2AB
+/* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
+/* NOTES:
+ * - MP counters on GF100/GF110 (compute capability 2.0) are buggy
+ *   because there is a context-switch problem that we need to fix.
+ *   Results might be wrong sometimes, be careful!
+ */
+static const char *nvc0_hw_sm_query_names[] =
+{
+   /* MP counters */
+   "active_cycles",
+   "active_warps",
+   "atom_count",
+   "branch",
+   "divergent_branch",
+   "gld_request",
+   "gred_count",
+   "gst_request",
+   "inst_executed",
+   "inst_issued",
+   "inst_issued1_0",
+   "inst_issued1_1",
+   "inst_issued2_0",
+   "inst_issued2_1",
+   "local_load",
+   "local_store",
+   "prof_trigger_00",
+   "prof_trigger_01",
+   "prof_trigger_02",
+   "prof_trigger_03",
+   "prof_trigger_04",
+   "prof_trigger_05",
+   "prof_trigger_06",
+   "prof_trigger_07",
+   "shared_load",
+   "shared_store",
+   "threads_launched",
+   "thread_inst_executed_0",
+   "thread_inst_executed_1",
+   "thread_inst_executed_2",
+   "thread_inst_executed_3",
+   "warps_launched",
+};
+
+static const uint64_t nvc0_read_hw_sm_counters_code[] =
+{
+   /* mov b32 $r8 $tidx
+    * mov b32 $r9 $physid
+    * mov b32 $r0 $pm0
+    * mov b32 $r1 $pm1
+    * mov b32 $r2 $pm2
+    * mov b32 $r3 $pm3
+    * mov b32 $r4 $pm4
+    * mov b32 $r5 $pm5
+    * mov b32 $r6 $pm6
+    * mov b32 $r7 $pm7
+    * set $p0 0x1 eq u32 $r8 0x0
+    * mov b32 $r10 c0[0x0]
+    * mov b32 $r11 c0[0x4]
+    * ext u32 $r8 $r9 0x414
+    * (not $p0) exit
+    * mul $r8 u32 $r8 u32 48
+    * add b32 $r10 $c $r10 $r8
+    * add b32 $r11 $r11 0x0 $c
+    * mov b32 $r8 c0[0x8]
+    * st b128 wt g[$r10d+0x00] $r0q
+    * st b128 wt g[$r10d+0x10] $r4q
+    * st b32 wt g[$r10d+0x20] $r8
+    * exit */
+   0x2c00000084021c04ULL,
+   0x2c0000000c025c04ULL,
+   0x2c00000010001c04ULL,
+   0x2c00000014005c04ULL,
+   0x2c00000018009c04ULL,
+   0x2c0000001c00dc04ULL,
+   0x2c00000020011c04ULL,
+   0x2c00000024015c04ULL,
+   0x2c00000028019c04ULL,
+   0x2c0000002c01dc04ULL,
+   0x190e0000fc81dc03ULL,
+   0x2800400000029de4ULL,
+   0x280040001002dde4ULL,
+   0x7000c01050921c03ULL,
+   0x80000000000021e7ULL,
+   0x10000000c0821c02ULL,
+   0x4801000020a29c03ULL,
+   0x0800000000b2dc42ULL,
+   0x2800400020021de4ULL,
+   0x9400000000a01fc5ULL,
+   0x9400000040a11fc5ULL,
+   0x9400000080a21f85ULL,
+   0x8000000000001de7ULL
+};
+
+#define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s }
+#define _Q(n, c) [NVC0_HW_SM_QUERY_##n] = c
+
+/* ==== Compute capability 2.0 (GF100/GF110) ==== */
+static const struct nvc0_hw_sm_query_cfg
+sm20_active_cycles =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_active_warps =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010),
+   .ctr[1]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020),
+   .ctr[2]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030),
+   .ctr[3]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000040),
+   .ctr[4]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000050),
+   .ctr[5]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000060),
+   .num_counters = 6,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_atom_count =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_branch =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000),
+   .ctr[1]       = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010),
+   .num_counters = 2,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_divergent_branch =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020),
+   .ctr[1]       = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030),
+   .num_counters = 2,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_gld_request =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_gred_count =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_gst_request =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_inst_executed =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000),
+   .ctr[1]       = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010),
+   .num_counters = 2,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_inst_issued =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007060),
+   .ctr[1]       = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070),
+   .num_counters = 2,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_local_ld =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_local_st =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_prof_trigger_0 =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_prof_trigger_1 =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_prof_trigger_2 =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_prof_trigger_3 =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_prof_trigger_4 =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_prof_trigger_5 =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_prof_trigger_6 =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_prof_trigger_7 =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_shared_ld =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_shared_st =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_threads_launched =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010),
+   .ctr[1]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020),
+   .ctr[2]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030),
+   .ctr[3]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000040),
+   .ctr[4]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000050),
+   .ctr[5]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000060),
+   .num_counters = 6,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_th_inst_executed_0 =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000),
+   .ctr[1]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000010),
+   .ctr[2]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020),
+   .ctr[3]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000030),
+   .ctr[4]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000040),
+   .ctr[5]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000050),
+   .num_counters = 6,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_th_inst_executed_1 =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000000),
+   .ctr[1]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010),
+   .ctr[2]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020),
+   .ctr[3]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000030),
+   .ctr[4]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000040),
+   .ctr[5]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000050),
+   .num_counters = 6,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm20_warps_launched =
+{
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000),
+   .num_counters = 1,
+   .op           = NVC0_COUNTER_OPn_SUM,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] =
+{ /* table returned by nvc0_hw_sm_get_queries() for chipsets 0xc0 and 0xc8 */
+   _Q(ACTIVE_CYCLES,       &sm20_active_cycles),
+   _Q(ACTIVE_WARPS,        &sm20_active_warps),
+   _Q(ATOM_COUNT,          &sm20_atom_count),
+   _Q(BRANCH,              &sm20_branch),
+   _Q(DIVERGENT_BRANCH,    &sm20_divergent_branch),
+   _Q(GLD_REQUEST,         &sm20_gld_request),
+   _Q(GRED_COUNT,          &sm20_gred_count),
+   _Q(GST_REQUEST,         &sm20_gst_request),
+   _Q(INST_EXECUTED,       &sm20_inst_executed),
+   _Q(INST_ISSUED,         &sm20_inst_issued),
+   _Q(INST_ISSUED1_0,      NULL), /* NULL: configured only in sm21_hw_sm_queries */
+   _Q(INST_ISSUED1_1,      NULL), /* NULL: configured only in sm21_hw_sm_queries */
+   _Q(INST_ISSUED2_0,      NULL), /* NULL: configured only in sm21_hw_sm_queries */
+   _Q(INST_ISSUED2_1,      NULL), /* NULL: configured only in sm21_hw_sm_queries */
+   _Q(LOCAL_LD,            &sm20_local_ld),
+   _Q(LOCAL_ST,            &sm20_local_st),
+   _Q(PROF_TRIGGER_0,      &sm20_prof_trigger_0),
+   _Q(PROF_TRIGGER_1,      &sm20_prof_trigger_1),
+   _Q(PROF_TRIGGER_2,      &sm20_prof_trigger_2),
+   _Q(PROF_TRIGGER_3,      &sm20_prof_trigger_3),
+   _Q(PROF_TRIGGER_4,      &sm20_prof_trigger_4),
+   _Q(PROF_TRIGGER_5,      &sm20_prof_trigger_5),
+   _Q(PROF_TRIGGER_6,      &sm20_prof_trigger_6),
+   _Q(PROF_TRIGGER_7,      &sm20_prof_trigger_7),
+   _Q(SHARED_LD,           &sm20_shared_ld),
+   _Q(SHARED_ST,           &sm20_shared_st),
+   _Q(THREADS_LAUNCHED,    &sm20_threads_launched),
+   _Q(TH_INST_EXECUTED_0,  &sm20_th_inst_executed_0),
+   _Q(TH_INST_EXECUTED_1,  &sm20_th_inst_executed_1),
+   _Q(TH_INST_EXECUTED_2,  NULL), /* NULL: configured only in sm21_hw_sm_queries */
+   _Q(TH_INST_EXECUTED_3,  NULL), /* NULL: configured only in sm21_hw_sm_queries */
+   _Q(WARPS_LAUNCHED,      &sm20_warps_launched),
+}; /* NULL entries are not counted by nvc0_hw_sm_get_driver_query_info() */
+
+/* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
+static const struct nvc0_hw_sm_query_cfg
+sm21_inst_executed = /* listed as INST_EXECUTED in sm21_hw_sm_queries */
+{ /* the three entries differ only in the last _C() argument (0x00..0x20) */
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000),
+   .ctr[1]       = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010),
+   .ctr[2]       = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020),
+   .num_counters = 3, /* entries of .ctr[] in use */
+   .op           = NVC0_COUNTER_OPn_SUM, /* result sums every counter over all MPs */
+   .norm         = { 1, 1 }, /* the sum is scaled by norm[0] / norm[1] */
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm21_inst_issued1_0 = /* listed as INST_ISSUED1_0 in sm21_hw_sm_queries */
+{ /* the four sm21_inst_issued* entries differ only in the last _C() argument */
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010),
+   .num_counters = 1, /* only .ctr[0] is used */
+   .op           = NVC0_COUNTER_OPn_SUM, /* result sums the counter over all MPs */
+   .norm         = { 1, 1 }, /* the sum is scaled by norm[0] / norm[1] */
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm21_inst_issued1_1 = /* listed as INST_ISSUED1_1 in sm21_hw_sm_queries */
+{ /* the four sm21_inst_issued* entries differ only in the last _C() argument */
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040),
+   .num_counters = 1, /* only .ctr[0] is used */
+   .op           = NVC0_COUNTER_OPn_SUM, /* result sums the counter over all MPs */
+   .norm         = { 1, 1 }, /* the sum is scaled by norm[0] / norm[1] */
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm21_inst_issued2_0 = /* listed as INST_ISSUED2_0 in sm21_hw_sm_queries */
+{ /* the four sm21_inst_issued* entries differ only in the last _C() argument */
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020),
+   .num_counters = 1, /* only .ctr[0] is used */
+   .op           = NVC0_COUNTER_OPn_SUM, /* result sums the counter over all MPs */
+   .norm         = { 1, 1 }, /* the sum is scaled by norm[0] / norm[1] */
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm21_inst_issued2_1 = /* listed as INST_ISSUED2_1 in sm21_hw_sm_queries */
+{ /* the four sm21_inst_issued* entries differ only in the last _C() argument */
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050),
+   .num_counters = 1, /* only .ctr[0] is used */
+   .op           = NVC0_COUNTER_OPn_SUM, /* result sums the counter over all MPs */
+   .norm         = { 1, 1 }, /* the sum is scaled by norm[0] / norm[1] */
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm21_th_inst_executed_0 = /* listed as TH_INST_EXECUTED_0 in sm21_hw_sm_queries */
+{ /* the six entries differ only in the last _C() argument (0x00..0x50) */
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000),
+   .ctr[1]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010),
+   .ctr[2]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020),
+   .ctr[3]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000030),
+   .ctr[4]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000040),
+   .ctr[5]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000050),
+   .num_counters = 6, /* entries of .ctr[] in use */
+   .op           = NVC0_COUNTER_OPn_SUM, /* result sums every counter over all MPs */
+   .norm         = { 1, 1 }, /* the sum is scaled by norm[0] / norm[1] */
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm21_th_inst_executed_1 = /* listed as TH_INST_EXECUTED_1 in sm21_hw_sm_queries */
+{ /* note: third _C() argument is 0xa5 here, 0xa4 belongs to ..._executed_2 */
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000),
+   .ctr[1]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010),
+   .ctr[2]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020),
+   .ctr[3]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000030),
+   .ctr[4]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000040),
+   .ctr[5]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000050),
+   .num_counters = 6, /* entries of .ctr[] in use */
+   .op           = NVC0_COUNTER_OPn_SUM, /* result sums every counter over all MPs */
+   .norm         = { 1, 1 }, /* the sum is scaled by norm[0] / norm[1] */
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm21_th_inst_executed_2 = /* listed as TH_INST_EXECUTED_2 in sm21_hw_sm_queries */
+{ /* note: third _C() argument is 0xa4 here, 0xa5 belongs to ..._executed_1 */
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000),
+   .ctr[1]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010),
+   .ctr[2]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020),
+   .ctr[3]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000030),
+   .ctr[4]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000040),
+   .ctr[5]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000050),
+   .num_counters = 6, /* entries of .ctr[] in use */
+   .op           = NVC0_COUNTER_OPn_SUM, /* result sums every counter over all MPs */
+   .norm         = { 1, 1 }, /* the sum is scaled by norm[0] / norm[1] */
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm21_th_inst_executed_3 = /* listed as TH_INST_EXECUTED_3 in sm21_hw_sm_queries */
+{ /* the six entries differ only in the last _C() argument (0x00..0x50) */
+   .ctr[0]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000),
+   .ctr[1]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010),
+   .ctr[2]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020),
+   .ctr[3]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000030),
+   .ctr[4]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000040),
+   .ctr[5]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000050),
+   .num_counters = 6, /* entries of .ctr[] in use */
+   .op           = NVC0_COUNTER_OPn_SUM, /* result sums every counter over all MPs */
+   .norm         = { 1, 1 }, /* the sum is scaled by norm[0] / norm[1] */
+};
+
+static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] =
+{ /* table returned by nvc0_hw_sm_get_queries() for every chipset but 0xc0/0xc8 */
+   _Q(ACTIVE_CYCLES,       &sm20_active_cycles),
+   _Q(ACTIVE_WARPS,        &sm20_active_warps),
+   _Q(ATOM_COUNT,          &sm20_atom_count),
+   _Q(BRANCH,              &sm20_branch),
+   _Q(DIVERGENT_BRANCH,    &sm20_divergent_branch),
+   _Q(GLD_REQUEST,         &sm20_gld_request),
+   _Q(GRED_COUNT,          &sm20_gred_count),
+   _Q(GST_REQUEST,         &sm20_gst_request),
+   _Q(INST_EXECUTED,       &sm21_inst_executed),
+   _Q(INST_ISSUED,         NULL), /* NULL: configured only in sm20_hw_sm_queries */
+   _Q(INST_ISSUED1_0,      &sm21_inst_issued1_0),
+   _Q(INST_ISSUED1_1,      &sm21_inst_issued1_1),
+   _Q(INST_ISSUED2_0,      &sm21_inst_issued2_0),
+   _Q(INST_ISSUED2_1,      &sm21_inst_issued2_1),
+   _Q(LOCAL_LD,            &sm20_local_ld),
+   _Q(LOCAL_ST,            &sm20_local_st),
+   _Q(PROF_TRIGGER_0,      &sm20_prof_trigger_0),
+   _Q(PROF_TRIGGER_1,      &sm20_prof_trigger_1),
+   _Q(PROF_TRIGGER_2,      &sm20_prof_trigger_2),
+   _Q(PROF_TRIGGER_3,      &sm20_prof_trigger_3),
+   _Q(PROF_TRIGGER_4,      &sm20_prof_trigger_4),
+   _Q(PROF_TRIGGER_5,      &sm20_prof_trigger_5),
+   _Q(PROF_TRIGGER_6,      &sm20_prof_trigger_6),
+   _Q(PROF_TRIGGER_7,      &sm20_prof_trigger_7),
+   _Q(SHARED_LD,           &sm20_shared_ld),
+   _Q(SHARED_ST,           &sm20_shared_st),
+   _Q(THREADS_LAUNCHED,    &sm20_threads_launched),
+   _Q(TH_INST_EXECUTED_0,  &sm21_th_inst_executed_0),
+   _Q(TH_INST_EXECUTED_1,  &sm21_th_inst_executed_1),
+   _Q(TH_INST_EXECUTED_2,  &sm21_th_inst_executed_2),
+   _Q(TH_INST_EXECUTED_3,  &sm21_th_inst_executed_3),
+   _Q(WARPS_LAUNCHED,      &sm20_warps_launched),
+}; /* entries named sm20_* are shared with sm20_hw_sm_queries */
+
+#undef _Q
+#undef _C
+
+static inline const struct nvc0_hw_sm_query_cfg **
+nvc0_hw_sm_get_queries(struct nvc0_screen *screen)
+{
+   /* Chipsets 0xc0 and 0xc8 get the sm20 table, all others the sm21 one. */
+   const struct nouveau_device *dev = screen->base.device;
+   const bool is_sm20 = dev->chipset == 0xc0 || dev->chipset == 0xc8;
+
+   return is_sm20 ? sm20_hw_sm_queries : sm21_hw_sm_queries;
+}
+
+static const struct nvc0_hw_sm_query_cfg *
+nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
+{
+   struct nvc0_screen *screen = nvc0->screen;
+   struct nvc0_query *q = &hq->base;
+
+   /* NVE4+ has one flat table indexed from PIPE_QUERY_DRIVER_SPECIFIC. */
+   if (screen->base.class_3d >= NVE4_3D_CLASS)
+      return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
+
+   if (q->type < NVC0_HW_SM_QUERY(0) || q->type > NVC0_HW_SM_QUERY_LAST) {
+      debug_printf("invalid query type: %d\n", q->type);
+      return NULL;
+   }
+   /* Older chips: per-chipset pointer table whose entries may be NULL. */
+   return nvc0_hw_sm_get_queries(screen)[q->type - NVC0_HW_SM_QUERY(0)];
+}
+
+static void
+nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
+{
+   /* Delegate to the destroy hook installed on the embedded base query. */
+   hq->base.funcs->destroy_query(nvc0, &hq->base);
+}
+
+static boolean
+nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
+{
+   struct nvc0_screen *screen = nvc0->screen;
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
+   const struct nvc0_hw_sm_query_cfg *cfg;
+   unsigned i, c;
+   unsigned num_ab[2] = { 0, 0 }; /* counters requested per signal domain */
+   /* Claims free MP counter slots for hq and programs them; false if full. */
+   cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
+
+   /* check if we have enough free counter slots (4 per domain) */
+   for (i = 0; i < cfg->num_counters; ++i)
+      num_ab[cfg->ctr[i].sig_dom]++;
+
+   if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
+       screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
+      NOUVEAU_ERR("Not enough free MP counter slots !\n");
+      return false;
+   }
+
+   assert(cfg->num_counters <= 4);
+   /* worst case: 4 counters * 8 words, plus 3 two-word setup methods */
+   PUSH_SPACE(push, 4 * 8 + 6);
+   if (!screen->pm.mp_counters_enabled) {
+      screen->pm.mp_counters_enabled = true;
+      BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
+      PUSH_DATA (push, 0x1fcb);
+   }
+
+   /* zero the 4 sequence words of each MP record (checked on read-back) */
+   for (i = 0; i < screen->mp_count * 4; ++i)
+      hq->data[(0x60 / 4) * (i / 4) + 20 + (i % 4)] = 0;
+   hq->sequence++;
+
+   for (i = 0; i < cfg->num_counters; ++i) {
+      const unsigned d = cfg->ctr[i].sig_dom;
+
+      if (!screen->pm.num_hw_sm_active[d]) {
+         uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
+         if (screen->pm.num_hw_sm_active[!d])
+            m |= 1 << (7 + (8 * d));
+         BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
+         PUSH_DATA (push, m);
+      }
+      screen->pm.num_hw_sm_active[d]++;
+
+      for (c = d * 4; c < (d * 4 + 4); ++c) {
+         if (!screen->pm.mp_counter[c]) {
+            hsq->ctr[i] = c;
+            screen->pm.mp_counter[c] = hsq;
+            break;
+         }
+      }
+      assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
+
+      /* configure and reset the counter(s) */
+      if (d == 0)
+         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
+      else
+         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
+      PUSH_DATA (push, cfg->ctr[i].sig_sel);
+      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
+      PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
+      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
+      PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
+      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
+      PUSH_DATA (push, 0);
+   }
+   return true;
+}
+
+static boolean
+nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
+{
+   struct nvc0_screen *screen = nvc0->screen;
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
+   const struct nvc0_hw_sm_query_cfg *cfg;
+   unsigned i, c;
+   /* Starts hq: NVE4+ is delegated, older chips share one pool of 8 slots. */
+   if (screen->base.class_3d >= NVE4_3D_CLASS)
+      return nve4_hw_sm_begin_query(nvc0, hq);
+
+   cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
+   /* NOTE(review): cfg is NULL for a type without a table entry; unchecked. */
+   /* check if we have enough free counter slots */
+   if (screen->pm.num_hw_sm_active[0] + cfg->num_counters > 8) {
+      NOUVEAU_ERR("Not enough free MP counter slots !\n");
+      return false;
+   }
+
+   assert(cfg->num_counters <= 8);
+   PUSH_SPACE(push, 8 * 8 + 2); /* 8 counters * 8 words + one 2-word method */
+
+   /* set sequence field to 0 (used to check if result is available) */
+   for (i = 0; i < screen->mp_count; ++i) {
+      const unsigned b = (0x30 / 4) * i; /* word offset of MP i's record */
+      hq->data[b + 8] = 0;
+   }
+   hq->sequence++;
+
+   for (i = 0; i < cfg->num_counters; ++i) {
+      uint32_t mask_sel = 0x00000000;
+
+      if (!screen->pm.num_hw_sm_active[0]) { /* first active counter overall */
+         BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
+         PUSH_DATA (push, 0x80000000);
+      }
+      screen->pm.num_hw_sm_active[0]++;
+
+      for (c = 0; c < 8; ++c) { /* claim the first free slot */
+         if (!screen->pm.mp_counter[c]) {
+            hsq->ctr[i] = c;
+            screen->pm.mp_counter[c] = hsq;
+            break;
+         }
+      }
+
+      /* Oddly enough, the signal id depends on the slot selected on Fermi but
+       * not on Kepler. Fortunately, the signal ids are just offset by the
+       * slot id! */
+      mask_sel |= c; /* replicate the slot id into all four bytes ... */
+      mask_sel |= (c << 8);
+      mask_sel |= (c << 16);
+      mask_sel |= (c << 24);
+      mask_sel &= cfg->ctr[i].src_mask; /* ... keeping only the masked bytes */
+
+      /* configure and reset the counter(s) */
+      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(c)), 1);
+      PUSH_DATA (push, cfg->ctr[i].sig_sel);
+      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(c)), 1);
+      PUSH_DATA (push, cfg->ctr[i].src_sel | mask_sel);
+      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 1);
+      PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
+      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(c)), 1);
+      PUSH_DATA (push, 0);
+   }
+   return true;
+}
+
+static void
+nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
+{
+   struct nvc0_screen *screen = nvc0->screen;
+   struct pipe_context *pipe = &nvc0->base.pipe;
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
+   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
+   uint32_t mask;
+   uint32_t input[3];
+   const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
+   const uint grid[3] = { screen->mp_count, screen->gpc_count, 1 };
+   unsigned c;
+   /* Ends hq: pause counting, free its slots, launch the read-back program. */
+   if (unlikely(!screen->pm.prog)) { /* build the read-back program once */
+      struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
+      prog->type = PIPE_SHADER_COMPUTE; /* NOTE(review): prog is not NULL-checked */
+      prog->translated = true;
+      prog->num_gprs = 14;
+      prog->parm_size = 12; /* matches the three 32-bit words of input[] */
+      if (is_nve4) {
+         prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
+         prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
+      } else {
+         prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
+         prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
+      }
+      screen->pm.prog = prog;
+   }
+
+   /* disable all counting */
+   PUSH_SPACE(push, 8);
+   for (c = 0; c < 8; ++c)
+      if (screen->pm.mp_counter[c]) {
+         if (is_nve4) {
+            IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0);
+         } else {
+            IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0);
+         }
+      }
+   /* release counters for this query */
+   for (c = 0; c < 8; ++c) {
+      if (screen->pm.mp_counter[c] == hsq) {
+         uint8_t d = is_nve4 ? c / 4 : 0; /* only one domain for NVC0:NVE4 */
+         screen->pm.num_hw_sm_active[d]--;
+         screen->pm.mp_counter[c] = NULL;
+      }
+   }
+
+   BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
+                hq->bo);
+
+   PUSH_SPACE(push, 1);
+   IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);
+
+   pipe->bind_compute_state(pipe, screen->pm.prog);
+   input[0] = (hq->bo->offset + hq->base_offset); /* low word of the address */
+   input[1] = (hq->bo->offset + hq->base_offset) >> 32; /* high word */
+   input[2] = hq->sequence; /* value the read functions compare against */
+   pipe->launch_grid(pipe, block, grid, 0, input);
+
+   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);
+
+   /* re-activate other counters */
+   PUSH_SPACE(push, 16);
+   mask = 0; /* bit n set once slot n has been re-enabled */
+   for (c = 0; c < 8; ++c) {
+      const struct nvc0_hw_sm_query_cfg *cfg;
+      unsigned i;
+
+      hsq = screen->pm.mp_counter[c]; /* hsq now names the slot's owner */
+      if (!hsq)
+         continue;
+
+      cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base);
+      for (i = 0; i < cfg->num_counters; ++i) {
+         if (mask & (1 << hsq->ctr[i])) /* this owner was already handled */
+            break;
+         mask |= 1 << hsq->ctr[i];
+         if (is_nve4) {
+            BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(hsq->ctr[i])), 1);
+         } else {
+            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(hsq->ctr[i])), 1);
+         }
+         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
+      }
+   }
+}
+
+static inline bool
+nvc0_hw_sm_query_read_data(uint32_t count[32][8],
+                           struct nvc0_context *nvc0, bool wait,
+                           struct nvc0_hw_query *hq,
+                           const struct nvc0_hw_sm_query_cfg *cfg,
+                           unsigned mp_count)
+{
+   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
+   unsigned p, c;
+   /* Fills count[p][c] for each MP p and counter c; false if data not ready. */
+   for (p = 0; p < mp_count; ++p) {
+      const unsigned b = (0x30 / 4) * p; /* word offset of MP p's record */
+
+      for (c = 0; c < cfg->num_counters; ++c) {
+         if (hq->data[b + 8] != hq->sequence) { /* word 8 holds the sequence */
+            if (!wait)
+               return false;
+            if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
+               return false;
+         }
+         count[p][c] = hq->data[b + hsq->ctr[c]] * (1 << c); /* NOTE(review): scaled by 2^c; reason not visible here */
+      }
+   }
+   return true;
+}
+
+static inline bool
+nve4_hw_sm_query_read_data(uint32_t count[32][8],
+                           struct nvc0_context *nvc0, bool wait,
+                           struct nvc0_hw_query *hq,
+                           const struct nvc0_hw_sm_query_cfg *cfg,
+                           unsigned mp_count)
+{
+   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
+   unsigned p, c, d;
+   /* Fills count[p][c] for each MP p and counter c; false if data not ready. */
+   for (p = 0; p < mp_count; ++p) {
+      const unsigned b = (0x60 / 4) * p; /* word offset of MP p's record */
+
+      for (c = 0; c < cfg->num_counters; ++c) {
+         count[p][c] = 0;
+         for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) { /* slots 4..7: one pass; 0..3: four */
+            if (hq->data[b + 20 + d] != hq->sequence) { /* words 20..23 hold the sequences */
+               if (!wait)
+                  return false;
+               if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
+                  return false;
+            }
+            if (hsq->ctr[c] & ~0x3)
+               count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)]; /* single word at 16 + (slot & 3) */
+            else
+               count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]]; /* sum of the four 4-word groups */
+         }
+      }
+   }
+   return true;
+}
+
+/* Metric calculations:
+ * sum(x) ... sum of x over all MPs
+ * avg(x) ... average of x over all MPs
+ *
+ * IPC              : sum(inst_executed) / clock
+ * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
+ * MP_OCCUPANCY     : avg((active_warps / 64) / active_cycles)
+ * MP_EFFICIENCY    : avg(active_cycles / clock)
+ *
+ * NOTE: Interpretation of IPC requires knowledge of MP count.
+ */
+/* Reads back the per-MP counters of hq and folds them into one 64-bit value
+ * according to cfg->op, scaled by cfg->norm.  Returns false if the data is
+ * not available yet and wait is false, or if waiting on the buffer fails.
+ */
+static boolean
+nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
+                            boolean wait, union pipe_query_result *result)
+{
+   uint32_t count[32][8];
+   uint64_t value = 0;
+   unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
+   unsigned p, c;
+   const struct nvc0_hw_sm_query_cfg *cfg;
+   bool ret;
+
+   cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
+
+   if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
+      ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
+   else
+      ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
+   if (!ret)
+      return false;
+
+   if (cfg->op == NVC0_COUNTER_OPn_SUM) {
+      for (c = 0; c < cfg->num_counters; ++c)
+         for (p = 0; p < mp_count; ++p)
+            value += count[p][c];
+      value = (value * cfg->norm[0]) / cfg->norm[1];
+   } else if (cfg->op == NVC0_COUNTER_OPn_OR) {
+      uint32_t v = 0;
+      for (c = 0; c < cfg->num_counters; ++c)
+         for (p = 0; p < mp_count; ++p)
+            v |= count[p][c];
+      value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
+   } else if (cfg->op == NVC0_COUNTER_OPn_AND) {
+      uint32_t v = ~0;
+      for (c = 0; c < cfg->num_counters; ++c)
+         for (p = 0; p < mp_count; ++p)
+            v &= count[p][c];
+      value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
+   } else if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) {
+      uint64_t v[2] = { 0, 0 };
+      for (p = 0; p < mp_count; ++p) {
+         v[0] += count[p][0];
+         v[1] += count[p][1];
+      }
+      if (v[0])
+         value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]);
+   } else if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) {
+      for (p = 0; p < mp_count; ++p)
+         value += count[p][0];
+      if (count[0][1])
+         value = (value * cfg->norm[0]) / ((uint64_t)count[0][1] * cfg->norm[1]);
+      else
+         value = 0;
+   } else if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) {
+      unsigned mp_used = 0;
+      /* tally MP p before advancing p; the reverse order read count[p + 1] */
+      for (p = 0; p < mp_count; mp_used += !!count[p][0], ++p)
+         if (count[p][1])
+            value += ((uint64_t)count[p][0] * cfg->norm[0]) / count[p][1];
+      if (mp_used)
+         value /= (uint64_t)mp_used * cfg->norm[1];
+   } else if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) {
+      unsigned mp_used = 0;
+      /* tally MP p before advancing p; the reverse order read count[p + 1] */
+      for (p = 0; p < mp_count; mp_used += !!count[p][0], ++p)
+         value += count[p][0];
+      if (count[0][1] && mp_used) {
+         value *= cfg->norm[0];
+         value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1];
+      } else {
+         value = 0;
+      }
+   }
+
+   *(uint64_t *)result = value;
+   return true;
+}
+
+static const struct nvc0_hw_query_funcs hw_sm_query_funcs = { /* set as hq->funcs by nvc0_hw_sm_create_query() */
+   .destroy_query = nvc0_hw_sm_destroy_query,
+   .begin_query = nvc0_hw_sm_begin_query, /* forwards to the NVE4 variant itself */
+   .end_query = nvc0_hw_sm_end_query,
+   .get_query_result = nvc0_hw_sm_get_query_result,
+};
+
+/* Creates an MP counter query for a driver-specific type.  Returns NULL if
+ * the DRM version is older than 0x01000101, if the type has no counter
+ * configuration on this chipset, or if an allocation fails. */
+struct nvc0_hw_query *
+nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
+{
+   struct nvc0_screen *screen = nvc0->screen;
+   struct nvc0_hw_sm_query *hsq;
+   struct nvc0_hw_query *hq;
+   unsigned space;
+
+   if (screen->base.device->drm_version < 0x01000101)
+      return NULL;
+
+   /* Only accept types that have a configuration on this chipset, so that
+    * nvc0_hw_sm_query_get_cfg() can neither index out of bounds nor return
+    * NULL for a query that was successfully created.
+    */
+   if (screen->base.class_3d >= NVE4_3D_CLASS) {
+      if (type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST)
+         return NULL;
+   } else {
+      if (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST)
+         return NULL;
+      if (!nvc0_hw_sm_get_queries(screen)[type - NVC0_HW_SM_QUERY(0)])
+         return NULL;
+   }
+
+   hsq = CALLOC_STRUCT(nvc0_hw_sm_query);
+   if (!hsq)
+      return NULL;
+
+   hq = &hsq->base;
+   hq->funcs = &hw_sm_query_funcs;
+   hq->base.type = type;
+
+   if (screen->base.class_3d >= NVE4_3D_CLASS) {
+      /* for each MP (0x60 bytes):
+       * [00..0c] = WS0.C0..C3
+       * [10..1c] = WS1.C0..C3
+       * [20..2c] = WS2.C0..C3
+       * [30..3c] = WS3.C0..C3
+       * [40..4c] = MP.C4..C7
+       * [50..5c] = WS0..WS3 sequence
+       */
+      space = (4 * 4 + 4 + 4) * screen->mp_count * sizeof(uint32_t);
+   } else {
+      /*
+       * Note that padding is used to align memory access to 128 bits.
+       *
+       * for each MP:
+       * [00] = MP.C0
+       * [04] = MP.C1
+       * [08] = MP.C2
+       * [0c] = MP.C3
+       * [10] = MP.C4
+       * [14] = MP.C5
+       * [18] = MP.C6
+       * [1c] = MP.C7
+       * [20] = MP.sequence
+       * [24] = padding
+       * [28] = padding
+       * [2c] = padding
+       */
+      space = (8 + 1 + 3) * screen->mp_count * sizeof(uint32_t);
+   }
+
+   if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) {
+      FREE(hq);
+      return NULL;
+   }
+
+   return hq;
+}
+
+static int
+nvc0_hw_sm_get_next_query_id(const struct nvc0_hw_sm_query_cfg **queries,
+                             unsigned id)
+{
+   unsigned i, next = 0; /* next: NULL entries passed so far */
+   /* Appears to map a dense id (NULL entries skipped) to a queries[] index. */
+   for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) {
+      if (!queries[i]) {
+         next++;
+      } else
+      if (i >= id && queries[id + next]) { /* NOTE(review): id + next is not bounds-checked */
+         break;
+      }
+   }
+   return id + next; /* the caller uses this as name-table and query-type index */
+}
+
+int
+nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
+                                 struct pipe_driver_query_info *info)
+{
+   int count = 0; /* number of MP counter queries exposed on this screen */
+   /* info == NULL: return count; otherwise fill *info for id, return 1 or 0. */
+   if (screen->base.device->drm_version >= 0x01000101) {
+      if (screen->compute) {
+         if (screen->base.class_3d == NVE4_3D_CLASS) { /* exactly NVE4, not newer */
+            count += NVE4_HW_SM_QUERY_COUNT;
+         } else
+         if (screen->base.class_3d < NVE4_3D_CLASS) {
+            const struct nvc0_hw_sm_query_cfg **queries =
+               nvc0_hw_sm_get_queries(screen);
+            unsigned i;
+
+            for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) {
+               if (queries[i]) /* NULL entries are not exposed */
+                  count++;
+            }
+         }
+      }
+   }
+
+   if (!info)
+      return count;
+
+   if (id < count) {
+      if (screen->compute) {
+         if (screen->base.class_3d == NVE4_3D_CLASS) {
+            info->name = nve4_hw_sm_query_names[id];
+            info->query_type = NVE4_HW_SM_QUERY(id);
+            info->max_value.u64 =
+               (id < NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY) ? 0 : 100; /* 100 from MP_OCCUPANCY on */
+            info->group_id = NVC0_HW_SM_QUERY_GROUP;
+            return 1;
+         } else
+         if (screen->base.class_3d < NVE4_3D_CLASS) {
+            const struct nvc0_hw_sm_query_cfg **queries =
+               nvc0_hw_sm_get_queries(screen);
+
+            id = nvc0_hw_sm_get_next_query_id(queries, id); /* dense id -> table index */
+            info->name = nvc0_hw_sm_query_names[id];
+            info->query_type = NVC0_HW_SM_QUERY(id);
+            info->group_id = NVC0_HW_SM_QUERY_GROUP; /* type and max_value are left untouched here */
+            return 1;
+         }
+      }
+   }
+   return 0;
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h
new file mode 100644
index 00000000000..26bde0c3e0d
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h
@@ -0,0 +1,120 @@
+#ifndef __NVC0_QUERY_HW_SM_H__
+#define __NVC0_QUERY_HW_SM_H__
+
+#include "nvc0_query_hw.h"
+
+struct nvc0_hw_sm_query { /* an MP performance counter query */
+   struct nvc0_hw_query base; /* first member; nvc0_hw_sm_query() casts from it */
+   uint8_t ctr[8]; /* ctr[i]: hardware counter slot claimed for cfg->ctr[i] */
+};
+
+static inline struct nvc0_hw_sm_query *
+nvc0_hw_sm_query(struct nvc0_hw_query *hq)
+{
+   return (struct nvc0_hw_sm_query *)hq; /* plain cast: hq must be the base member of an nvc0_hw_sm_query */
+}
+
+/*
+ * Performance counter queries:
+ */
+#define NVE4_HW_SM_QUERY(i)    (PIPE_QUERY_DRIVER_SPECIFIC + (i)) /* query type for enum value i */
+#define NVE4_HW_SM_QUERY_LAST   NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_COUNT - 1) /* highest valid type */
+enum nve4_hw_sm_queries
+{
+   NVE4_HW_SM_QUERY_ACTIVE_CYCLES = 0,
+   NVE4_HW_SM_QUERY_ACTIVE_WARPS,
+   NVE4_HW_SM_QUERY_ATOM_COUNT,
+   NVE4_HW_SM_QUERY_BRANCH,
+   NVE4_HW_SM_QUERY_DIVERGENT_BRANCH,
+   NVE4_HW_SM_QUERY_GLD_REQUEST,
+   NVE4_HW_SM_QUERY_GLD_MEM_DIV_REPLAY,
+   NVE4_HW_SM_QUERY_GST_TRANSACTIONS,
+   NVE4_HW_SM_QUERY_GST_MEM_DIV_REPLAY,
+   NVE4_HW_SM_QUERY_GRED_COUNT,
+   NVE4_HW_SM_QUERY_GST_REQUEST,
+   NVE4_HW_SM_QUERY_INST_EXECUTED,
+   NVE4_HW_SM_QUERY_INST_ISSUED,
+   NVE4_HW_SM_QUERY_INST_ISSUED1,
+   NVE4_HW_SM_QUERY_INST_ISSUED2,
+   NVE4_HW_SM_QUERY_L1_GLD_HIT,
+   NVE4_HW_SM_QUERY_L1_GLD_MISS,
+   NVE4_HW_SM_QUERY_L1_LOCAL_LD_HIT,
+   NVE4_HW_SM_QUERY_L1_LOCAL_LD_MISS,
+   NVE4_HW_SM_QUERY_L1_LOCAL_ST_HIT,
+   NVE4_HW_SM_QUERY_L1_LOCAL_ST_MISS,
+   NVE4_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS,
+   NVE4_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS,
+   NVE4_HW_SM_QUERY_LOCAL_LD,
+   NVE4_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS,
+   NVE4_HW_SM_QUERY_LOCAL_ST,
+   NVE4_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS,
+   NVE4_HW_SM_QUERY_PROF_TRIGGER_0,
+   NVE4_HW_SM_QUERY_PROF_TRIGGER_1,
+   NVE4_HW_SM_QUERY_PROF_TRIGGER_2,
+   NVE4_HW_SM_QUERY_PROF_TRIGGER_3,
+   NVE4_HW_SM_QUERY_PROF_TRIGGER_4,
+   NVE4_HW_SM_QUERY_PROF_TRIGGER_5,
+   NVE4_HW_SM_QUERY_PROF_TRIGGER_6,
+   NVE4_HW_SM_QUERY_PROF_TRIGGER_7,
+   NVE4_HW_SM_QUERY_SHARED_LD,
+   NVE4_HW_SM_QUERY_SHARED_LD_REPLAY,
+   NVE4_HW_SM_QUERY_SHARED_ST,
+   NVE4_HW_SM_QUERY_SHARED_ST_REPLAY,
+   NVE4_HW_SM_QUERY_SM_CTA_LAUNCHED,
+   NVE4_HW_SM_QUERY_THREADS_LAUNCHED,
+   NVE4_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS,
+   NVE4_HW_SM_QUERY_WARPS_LAUNCHED,
+   NVE4_HW_SM_QUERY_METRIC_IPC,
+   NVE4_HW_SM_QUERY_METRIC_IPAC,
+   NVE4_HW_SM_QUERY_METRIC_IPEC,
+   NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY, /* from here on, driver query info reports max_value 100 */
+   NVE4_HW_SM_QUERY_METRIC_MP_EFFICIENCY,
+   NVE4_HW_SM_QUERY_METRIC_INST_REPLAY_OHEAD,
+   NVE4_HW_SM_QUERY_COUNT /* number of entries above; not a query itself */
+};
+
+#define NVC0_HW_SM_QUERY(i)    (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i)) /* query type for enum value i */
+#define NVC0_HW_SM_QUERY_LAST   NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1) /* highest valid type */
+enum nvc0_hw_sm_queries
+{
+   NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0,
+   NVC0_HW_SM_QUERY_ACTIVE_WARPS,
+   NVC0_HW_SM_QUERY_ATOM_COUNT,
+   NVC0_HW_SM_QUERY_BRANCH,
+   NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
+   NVC0_HW_SM_QUERY_GLD_REQUEST,
+   NVC0_HW_SM_QUERY_GRED_COUNT,
+   NVC0_HW_SM_QUERY_GST_REQUEST,
+   NVC0_HW_SM_QUERY_INST_EXECUTED,
+   NVC0_HW_SM_QUERY_INST_ISSUED,
+   NVC0_HW_SM_QUERY_INST_ISSUED1_0,
+   NVC0_HW_SM_QUERY_INST_ISSUED1_1,
+   NVC0_HW_SM_QUERY_INST_ISSUED2_0,
+   NVC0_HW_SM_QUERY_INST_ISSUED2_1,
+   NVC0_HW_SM_QUERY_LOCAL_LD,
+   NVC0_HW_SM_QUERY_LOCAL_ST,
+   NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
+   NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
+   NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
+   NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
+   NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
+   NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
+   NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
+   NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
+   NVC0_HW_SM_QUERY_SHARED_LD,
+   NVC0_HW_SM_QUERY_SHARED_ST,
+   NVC0_HW_SM_QUERY_THREADS_LAUNCHED,
+   NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0,
+   NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1,
+   NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2,
+   NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3,
+   NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
+   NVC0_HW_SM_QUERY_COUNT /* number of entries above; also the loop bound over the sm20/sm21 tables */
+};
+
+struct nvc0_hw_query * /* NULL when the query cannot be created */
+nvc0_hw_sm_create_query(struct nvc0_context *, unsigned); /* (context, query type) */
+int /* query count when info is NULL, else 1 if *info was filled and 0 if not */
+nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *, unsigned,
+                                 struct pipe_driver_query_info *); /* (screen, id, info) */
+#endif
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.c
new file mode 100644
index 00000000000..cd24618d564
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.c
@@ -0,0 +1,162 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ * Copyright 2015 Samuel Pitoiset
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nvc0/nvc0_context.h"
+
+#include "nvc0_query_sw.h"
+
+/* === DRIVER STATISTICS === */
+
+#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
+
+static const char *nvc0_sw_query_drv_stat_names[] = /* indexed by id in nvc0_sw_get_driver_query_info() */
+{ /* names containing "bytes" are reported as PIPE_DRIVER_QUERY_TYPE_BYTES */
+   "drv-tex_obj_current_count",
+   "drv-tex_obj_current_bytes",
+   "drv-buf_obj_current_count",
+   "drv-buf_obj_current_bytes_vid",
+   "drv-buf_obj_current_bytes_sys",
+   "drv-tex_transfers_rd",
+   "drv-tex_transfers_wr",
+   "drv-tex_copy_count",
+   "drv-tex_blit_count",
+   "drv-tex_cache_flush_count",
+   "drv-buf_transfers_rd",
+   "drv-buf_transfers_wr",
+   "drv-buf_read_bytes_staging_vid",
+   "drv-buf_write_bytes_direct",
+   "drv-buf_write_bytes_staging_vid",
+   "drv-buf_write_bytes_staging_sys",
+   "drv-buf_copy_bytes",
+   "drv-buf_non_kernel_fence_sync_count",
+   "drv-any_non_kernel_fence_sync_count",
+   "drv-query_sync_count",
+   "drv-gpu_serialize_count",
+   "drv-draw_calls_array",
+   "drv-draw_calls_indexed",
+   "drv-draw_calls_fallback_count",
+   "drv-user_buffer_upload_bytes",
+   "drv-constbuf_upload_count",
+   "drv-constbuf_upload_bytes",
+   "drv-pushbuf_count",
+   "drv-resource_validate_count"
+}; /* NOTE(review): order presumably mirrors enum nvc0_sw_query_drv_stat - confirm */
+
+#endif /* NOUVEAU_ENABLE_DRIVER_STATISTICS */
+
+static void
+nvc0_sw_destroy_query(struct nvc0_context *nvc0, struct nvc0_query *q)
+{
+   /* q is the base member of the nvc0_sw_query allocated at creation. */
+   FREE(nvc0_sw_query(q));
+}
+
+static boolean
+nvc0_sw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q)
+{
+#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
+   /* Indices 0..4 start from zero, so end_query yields the raw statistic;
+    * every later index snapshots the statistic so end_query yields a delta.
+    */
+   struct nvc0_sw_query *baseline = nvc0_sw_query(q);
+   const bool is_delta = q->index >= 5;
+
+   baseline->value = is_delta ? nvc0->screen->base.stats.v[q->index] : 0;
+#endif
+   return true;
+}
+
+static void
+nvc0_sw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q)
+{
+#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
+   uint64_t *delta = &nvc0_sw_query(q)->value; /* holds the begin_query baseline */
+   *delta = nvc0->screen->base.stats.v[q->index] - *delta;
+#endif
+}
+
+static boolean
+nvc0_sw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
+                         boolean wait, union pipe_query_result *result)
+{
+#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
+   /* The stored value is copied out as-is; wait is not consulted. */
+   const struct nvc0_sw_query *sq = nvc0_sw_query(q);
+
+   *(uint64_t *)result = sq->value;
+#endif
+   return true;
+}
+
+static const struct nvc0_query_funcs sw_query_funcs = { /* set as q->funcs by nvc0_sw_create_query() */
+   .destroy_query = nvc0_sw_destroy_query,
+   .begin_query = nvc0_sw_begin_query, /* always returns true */
+   .end_query = nvc0_sw_end_query,
+   .get_query_result = nvc0_sw_get_query_result, /* always returns true */
+};
+
+struct nvc0_query *
+nvc0_sw_create_query(struct nvc0_context *nvc0, unsigned type, unsigned index)
+{
+   struct nvc0_sw_query *sq;
+   struct nvc0_query *q;
+   /* Creates a driver statistics query; nvc0 and index are not used here. */
+   if (type < NVC0_SW_QUERY_DRV_STAT(0) || type > NVC0_SW_QUERY_DRV_STAT_LAST)
+      return NULL; /* not a driver statistics type */
+
+   sq = CALLOC_STRUCT(nvc0_sw_query);
+   if (!sq)
+      return NULL;
+
+   q = &sq->base;
+   q->funcs = &sw_query_funcs;
+   q->type = type;
+   q->index = type - NVC0_SW_QUERY_DRV_STAT(0); /* derived from type, not from index */
+
+   return q;
+}
+
+int
+nvc0_sw_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
+                              struct pipe_driver_query_info *info)
+{
+   /* With info == NULL, report how many driver statistics queries exist;
+    * otherwise fill *info for entry id and return 1, or 0 if unavailable. */
+   const int count = NVC0_SW_QUERY_DRV_STAT_COUNT;
+   if (!info)
+      return count;
+
+#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
+   if (id < count) {
+      const char *name = nvc0_sw_query_drv_stat_names[id];
+      info->name = name;
+      info->query_type = NVC0_SW_QUERY_DRV_STAT(id);
+      info->max_value.u64 = 0;
+      info->type = strstr(name, "bytes") ? PIPE_DRIVER_QUERY_TYPE_BYTES
+                                         : PIPE_DRIVER_QUERY_TYPE_UINT64;
+      info->group_id = NVC0_SW_QUERY_DRV_STAT_GROUP;
+      return 1;
+   }
+#endif
+   return 0;
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.h
new file mode 100644
index 00000000000..eaa890e4fc0
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.h
@@ -0,0 +1,64 @@
+#ifndef __NVC0_QUERY_SW_H__
+#define __NVC0_QUERY_SW_H__
+
+#include "nvc0_query.h"
+
+struct nvc0_sw_query {
+   struct nvc0_query base; /* first member, so nvc0_sw_query() can cast a base pointer back */
+   uint64_t value; /* counter value copied out by nvc0_sw_get_query_result() */
+};
+
+static inline struct nvc0_sw_query *
+nvc0_sw_query(struct nvc0_query *q)
+{
+   return (struct nvc0_sw_query *)q; /* downcast: q must be the base of a struct nvc0_sw_query */
+}
+
+/* Driver statistics queries: query type ids are PIPE_QUERY_DRIVER_SPECIFIC + 1024 + i.
+ * Without NOUVEAU_ENABLE_DRIVER_STATISTICS the enum holds only COUNT, which is then 0.
+ */
+#define NVC0_SW_QUERY_DRV_STAT(i)    (PIPE_QUERY_DRIVER_SPECIFIC + 1024 + (i))
+#define NVC0_SW_QUERY_DRV_STAT_LAST   NVC0_SW_QUERY_DRV_STAT(NVC0_SW_QUERY_DRV_STAT_COUNT - 1)
+enum nvc0_sw_query_drv_stat
+{
+#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
+   NVC0_SW_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_COUNT = 0,
+   NVC0_SW_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_BYTES,
+   NVC0_SW_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_COUNT,
+   NVC0_SW_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_VID,
+   NVC0_SW_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_SYS,
+   NVC0_SW_QUERY_DRV_STAT_TEX_TRANSFERS_READ,
+   NVC0_SW_QUERY_DRV_STAT_TEX_TRANSFERS_WRITE,
+   NVC0_SW_QUERY_DRV_STAT_TEX_COPY_COUNT,
+   NVC0_SW_QUERY_DRV_STAT_TEX_BLIT_COUNT,
+   NVC0_SW_QUERY_DRV_STAT_TEX_CACHE_FLUSH_COUNT,
+   NVC0_SW_QUERY_DRV_STAT_BUF_TRANSFERS_READ,
+   NVC0_SW_QUERY_DRV_STAT_BUF_TRANSFERS_WRITE,
+   NVC0_SW_QUERY_DRV_STAT_BUF_READ_BYTES_STAGING_VID,
+   NVC0_SW_QUERY_DRV_STAT_BUF_WRITE_BYTES_DIRECT,
+   NVC0_SW_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_VID,
+   NVC0_SW_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_SYS,
+   NVC0_SW_QUERY_DRV_STAT_BUF_COPY_BYTES,
+   NVC0_SW_QUERY_DRV_STAT_BUF_NON_KERNEL_FENCE_SYNC_COUNT,
+   NVC0_SW_QUERY_DRV_STAT_ANY_NON_KERNEL_FENCE_SYNC_COUNT,
+   NVC0_SW_QUERY_DRV_STAT_QUERY_SYNC_COUNT,
+   NVC0_SW_QUERY_DRV_STAT_GPU_SERIALIZE_COUNT,
+   NVC0_SW_QUERY_DRV_STAT_DRAW_CALLS_ARRAY,
+   NVC0_SW_QUERY_DRV_STAT_DRAW_CALLS_INDEXED,
+   NVC0_SW_QUERY_DRV_STAT_DRAW_CALLS_FALLBACK_COUNT,
+   NVC0_SW_QUERY_DRV_STAT_USER_BUFFER_UPLOAD_BYTES,
+   NVC0_SW_QUERY_DRV_STAT_CONSTBUF_UPLOAD_COUNT,
+   NVC0_SW_QUERY_DRV_STAT_CONSTBUF_UPLOAD_BYTES,
+   NVC0_SW_QUERY_DRV_STAT_PUSHBUF_COUNT,
+   NVC0_SW_QUERY_DRV_STAT_RESOURCE_VALIDATE_COUNT,
+#endif
+   NVC0_SW_QUERY_DRV_STAT_COUNT
+};
+
+struct nvc0_query *
+nvc0_sw_create_query(struct nvc0_context *, unsigned, unsigned);
+int
+nvc0_sw_get_driver_query_info(struct nvc0_screen *, unsigned,
+                              struct pipe_driver_query_info *);
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index afd91e6feee..f34ad0ed5d1 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -561,12 +561,7 @@ nvc0_screen_init_compute(struct nvc0_screen *screen)
    switch (screen->base.device->chipset & ~0xf) {
    case 0xc0:
    case 0xd0:
-      /* Using COMPUTE has weird effects on 3D state, we need to
-       * investigate this further before enabling it by default.
-       */
-      if (debug_get_bool_option("NVC0_COMPUTE", false))
-         return nvc0_screen_compute_setup(screen, screen->base.pushbuf);
-      return 0;
+      return nvc0_screen_compute_setup(screen, screen->base.pushbuf);
    case 0xe0:
       return nve4_screen_compute_setup(screen, screen->base.pushbuf);
    case 0xf0:
@@ -914,6 +909,7 @@ nvc0_screen_create(struct nouveau_device *dev)
       else
          value = (16 << 8) | 4;
    }
+   screen->gpc_count = value & 0x000000ff;
    screen->mp_count = value >> 8;
    screen->mp_count_compute = screen->mp_count;
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index f57a316f01e..857eb0316c7 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -67,6 +67,7 @@ struct nvc0_screen {
    struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */
    struct nouveau_bo *poly_cache;
 
+   uint8_t gpc_count;
    uint16_t mp_count;
    uint16_t mp_count_compute; /* magic reg can make compute use fewer MPs */
 
@@ -94,7 +95,7 @@ struct nvc0_screen {
 
    struct {
       struct nvc0_program *prog; /* compute state object to read MP counters */
-      struct pipe_query *mp_counter[8]; /* counter to query allocation */
+      struct nvc0_hw_sm_query *mp_counter[8]; /* counter to query allocation */
       uint8_t num_hw_sm_active[2];
       bool mp_counters_enabled;
    } pm;
@@ -112,148 +113,6 @@ nvc0_screen(struct pipe_screen *screen)
    return (struct nvc0_screen *)screen;
 }
 
-/*
- * Performance counters groups:
- */
-#define NVC0_QUERY_MP_COUNTER_GROUP 0
-#define NVC0_QUERY_DRV_STAT_GROUP   1
-
-/* Performance counter queries:
- */
-#define NVE4_HW_SM_QUERY(i)    (PIPE_QUERY_DRIVER_SPECIFIC + (i))
-#define NVE4_HW_SM_QUERY_LAST   NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_COUNT - 1)
-enum nve4_pm_queries
-{
-    NVE4_HW_SM_QUERY_ACTIVE_CYCLES = 0,
-    NVE4_HW_SM_QUERY_ACTIVE_WARPS,
-    NVE4_HW_SM_QUERY_ATOM_COUNT,
-    NVE4_HW_SM_QUERY_BRANCH,
-    NVE4_HW_SM_QUERY_DIVERGENT_BRANCH,
-    NVE4_HW_SM_QUERY_GLD_REQUEST,
-    NVE4_HW_SM_QUERY_GLD_MEM_DIV_REPLAY,
-    NVE4_HW_SM_QUERY_GST_TRANSACTIONS,
-    NVE4_HW_SM_QUERY_GST_MEM_DIV_REPLAY,
-    NVE4_HW_SM_QUERY_GRED_COUNT,
-    NVE4_HW_SM_QUERY_GST_REQUEST,
-    NVE4_HW_SM_QUERY_INST_EXECUTED,
-    NVE4_HW_SM_QUERY_INST_ISSUED,
-    NVE4_HW_SM_QUERY_INST_ISSUED1,
-    NVE4_HW_SM_QUERY_INST_ISSUED2,
-    NVE4_HW_SM_QUERY_L1_GLD_HIT,
-    NVE4_HW_SM_QUERY_L1_GLD_MISS,
-    NVE4_HW_SM_QUERY_L1_LOCAL_LD_HIT,
-    NVE4_HW_SM_QUERY_L1_LOCAL_LD_MISS,
-    NVE4_HW_SM_QUERY_L1_LOCAL_ST_HIT,
-    NVE4_HW_SM_QUERY_L1_LOCAL_ST_MISS,
-    NVE4_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS,
-    NVE4_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS,
-    NVE4_HW_SM_QUERY_LOCAL_LD,
-    NVE4_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS,
-    NVE4_HW_SM_QUERY_LOCAL_ST,
-    NVE4_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS,
-    NVE4_HW_SM_QUERY_PROF_TRIGGER_0,
-    NVE4_HW_SM_QUERY_PROF_TRIGGER_1,
-    NVE4_HW_SM_QUERY_PROF_TRIGGER_2,
-    NVE4_HW_SM_QUERY_PROF_TRIGGER_3,
-    NVE4_HW_SM_QUERY_PROF_TRIGGER_4,
-    NVE4_HW_SM_QUERY_PROF_TRIGGER_5,
-    NVE4_HW_SM_QUERY_PROF_TRIGGER_6,
-    NVE4_HW_SM_QUERY_PROF_TRIGGER_7,
-    NVE4_HW_SM_QUERY_SHARED_LD,
-    NVE4_HW_SM_QUERY_SHARED_LD_REPLAY,
-    NVE4_HW_SM_QUERY_SHARED_ST,
-    NVE4_HW_SM_QUERY_SHARED_ST_REPLAY,
-    NVE4_HW_SM_QUERY_SM_CTA_LAUNCHED,
-    NVE4_HW_SM_QUERY_THREADS_LAUNCHED,
-    NVE4_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS,
-    NVE4_HW_SM_QUERY_WARPS_LAUNCHED,
-    NVE4_HW_SM_QUERY_METRIC_IPC,
-    NVE4_HW_SM_QUERY_METRIC_IPAC,
-    NVE4_HW_SM_QUERY_METRIC_IPEC,
-    NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY,
-    NVE4_HW_SM_QUERY_METRIC_MP_EFFICIENCY,
-    NVE4_HW_SM_QUERY_METRIC_INST_REPLAY_OHEAD,
-    NVE4_HW_SM_QUERY_COUNT
-};
-
-#define NVC0_HW_SM_QUERY(i)    (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i))
-#define NVC0_HW_SM_QUERY_LAST   NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1)
-enum nvc0_pm_queries
-{
-    NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0,
-    NVC0_HW_SM_QUERY_ACTIVE_WARPS,
-    NVC0_HW_SM_QUERY_ATOM_COUNT,
-    NVC0_HW_SM_QUERY_BRANCH,
-    NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
-    NVC0_HW_SM_QUERY_GLD_REQUEST,
-    NVC0_HW_SM_QUERY_GRED_COUNT,
-    NVC0_HW_SM_QUERY_GST_REQUEST,
-    NVC0_HW_SM_QUERY_INST_EXECUTED,
-    NVC0_HW_SM_QUERY_INST_ISSUED1_0,
-    NVC0_HW_SM_QUERY_INST_ISSUED1_1,
-    NVC0_HW_SM_QUERY_INST_ISSUED2_0,
-    NVC0_HW_SM_QUERY_INST_ISSUED2_1,
-    NVC0_HW_SM_QUERY_LOCAL_LD,
-    NVC0_HW_SM_QUERY_LOCAL_ST,
-    NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
-    NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
-    NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
-    NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
-    NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
-    NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
-    NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
-    NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
-    NVC0_HW_SM_QUERY_SHARED_LD,
-    NVC0_HW_SM_QUERY_SHARED_ST,
-    NVC0_HW_SM_QUERY_THREADS_LAUNCHED,
-    NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0,
-    NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1,
-    NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2,
-    NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3,
-    NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
-    NVC0_HW_SM_QUERY_COUNT
-};
-
-/* Driver statistics queries:
- */
-#define NVC0_QUERY_DRV_STAT(i)    (PIPE_QUERY_DRIVER_SPECIFIC + 1024 + (i))
-#define NVC0_QUERY_DRV_STAT_LAST   NVC0_QUERY_DRV_STAT(NVC0_QUERY_DRV_STAT_COUNT - 1)
-enum nvc0_drv_stats_queries
-{
-#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
-    NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_COUNT = 0,
-    NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_BYTES,
-    NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_COUNT,
-    NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_VID,
-    NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_SYS,
-    NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_READ,
-    NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_WRITE,
-    NVC0_QUERY_DRV_STAT_TEX_COPY_COUNT,
-    NVC0_QUERY_DRV_STAT_TEX_BLIT_COUNT,
-    NVC0_QUERY_DRV_STAT_TEX_CACHE_FLUSH_COUNT,
-    NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_READ,
-    NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_WRITE,
-    NVC0_QUERY_DRV_STAT_BUF_READ_BYTES_STAGING_VID,
-    NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_DIRECT,
-    NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_VID,
-    NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_SYS,
-    NVC0_QUERY_DRV_STAT_BUF_COPY_BYTES,
-    NVC0_QUERY_DRV_STAT_BUF_NON_KERNEL_FENCE_SYNC_COUNT,
-    NVC0_QUERY_DRV_STAT_ANY_NON_KERNEL_FENCE_SYNC_COUNT,
-    NVC0_QUERY_DRV_STAT_QUERY_SYNC_COUNT,
-    NVC0_QUERY_DRV_STAT_GPU_SERIALIZE_COUNT,
-    NVC0_QUERY_DRV_STAT_DRAW_CALLS_ARRAY,
-    NVC0_QUERY_DRV_STAT_DRAW_CALLS_INDEXED,
-    NVC0_QUERY_DRV_STAT_DRAW_CALLS_FALLBACK_COUNT,
-    NVC0_QUERY_DRV_STAT_USER_BUFFER_UPLOAD_BYTES,
-    NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_COUNT,
-    NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_BYTES,
-    NVC0_QUERY_DRV_STAT_PUSHBUF_COUNT,
-    NVC0_QUERY_DRV_STAT_RESOURCE_VALIDATE_COUNT,
-#endif
-    NVC0_QUERY_DRV_STAT_COUNT
-};
-
 int nvc0_screen_get_driver_query_info(struct pipe_screen *, unsigned,
                                       struct pipe_driver_query_info *);
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index 8f8ac2d34b9..af837fc4a33 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -26,6 +26,7 @@
 #include "util/u_inlines.h"
 
 #include "nvc0/nvc0_context.h"
+#include "nvc0/nvc0_query_hw.h"
 
 static inline void
 nvc0_program_update_context_state(struct nvc0_context *nvc0,
@@ -272,14 +273,14 @@ nvc0_tfb_validate(struct nvc0_context *nvc0)
          continue;
 
       if (!targ->clean)
-         nvc0_query_fifo_wait(push, targ->pq);
+         nvc0_hw_query_fifo_wait(push, nvc0_query(targ->pq));
       BEGIN_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 5);
       PUSH_DATA (push, 1);
       PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset);
       PUSH_DATA (push, buf->address + targ->pipe.buffer_offset);
       PUSH_DATA (push, targ->pipe.buffer_size);
       if (!targ->clean) {
-         nvc0_query_pushbuf_submit(push, targ->pq, 0x4);
+         nvc0_hw_query_pushbuf_submit(push, nvc0_query(targ->pq), 0x4);
       } else {
          PUSH_DATA(push, 0); /* TFB_BUFFER_OFFSET */
          targ->clean = false;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index c5bfd03956d..742bef39247 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -29,6 +29,7 @@
 
 #include "nvc0/nvc0_stateobj.h"
 #include "nvc0/nvc0_context.h"
+#include "nvc0/nvc0_query_hw.h"
 
 #include "nvc0/nvc0_3d.xml.h"
 #include "nv50/nv50_texture.xml.h"
@@ -1070,7 +1071,7 @@ nvc0_so_target_create(struct pipe_context *pipe,
    if (!targ)
       return NULL;
 
-   targ->pq = pipe->create_query(pipe, NVC0_QUERY_TFB_BUFFER_OFFSET, 0);
+   targ->pq = pipe->create_query(pipe, NVC0_HW_QUERY_TFB_BUFFER_OFFSET, 0);
    if (!targ->pq) {
       FREE(targ);
       return NULL;
@@ -1091,6 +1092,25 @@ nvc0_so_target_create(struct pipe_context *pipe,
 }
 
 static void
+nvc0_so_target_save_offset(struct pipe_context *pipe,
+                           struct pipe_stream_output_target *ptarg,
+                           unsigned index, bool *serialize)
+{
+   struct nvc0_so_target *targ = nvc0_so_target(ptarg);
+   /* Emit SERIALIZE only while *serialize is set, and clear the flag so it is emitted once per flag. */
+   if (*serialize) {
+      *serialize = false;
+      PUSH_SPACE(nvc0_context(pipe)->base.pushbuf, 1);
+      IMMED_NVC0(nvc0_context(pipe)->base.pushbuf, NVC0_3D(SERIALIZE), 0);
+
+      NOUVEAU_DRV_STAT(nouveau_screen(pipe->screen), gpu_serialize_count, 1);
+   }
+   /* Tag the target's query with the given index, then end that query. */
+   nvc0_query(targ->pq)->index = index;
+   pipe->end_query(pipe, targ->pq);
+}
+
+static void
 nvc0_so_target_destroy(struct pipe_context *pipe,
                        struct pipe_stream_output_target *ptarg)
 {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
index aaec60a5ac2..d459dd61c19 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
@@ -188,14 +188,10 @@ nvc0_m2mf_push_linear(struct nouveau_context *nv,
    nouveau_pushbuf_validate(push);
 
    while (count) {
-      unsigned nr;
+      unsigned nr = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN);
 
-      if (!PUSH_SPACE(push, 16))
+      if (!PUSH_SPACE(push, nr + 9))
          break;
-      nr = PUSH_AVAIL(push);
-      assert(nr >= 16);
-      nr = MIN2(count, nr - 9);
-      nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN);
 
       BEGIN_NVC0(push, NVC0_M2MF(OFFSET_OUT_HIGH), 2);
       PUSH_DATAh(push, dst->offset + offset);
@@ -234,14 +230,10 @@ nve4_p2mf_push_linear(struct nouveau_context *nv,
    nouveau_pushbuf_validate(push);
 
    while (count) {
-      unsigned nr;
+      unsigned nr = MIN2(count, (NV04_PFIFO_MAX_PACKET_LEN - 1));
 
-      if (!PUSH_SPACE(push, 16))
+      if (!PUSH_SPACE(push, nr + 10))
          break;
-      nr = PUSH_AVAIL(push);
-      assert(nr >= 16);
-      nr = MIN2(count, nr - 8);
-      nr = MIN2(nr, (NV04_PFIFO_MAX_PACKET_LEN - 1));
 
       BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_DST_ADDRESS_HIGH), 2);
       PUSH_DATAh(push, dst->offset + offset);
@@ -571,9 +563,7 @@ nvc0_cb_bo_push(struct nouveau_context *nv,
    PUSH_DATA (push, bo->offset + base);
 
    while (words) {
-      unsigned nr = PUSH_AVAIL(push);
-      nr = MIN2(nr, words);
-      nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN - 1);
+      unsigned nr = MIN2(words, NV04_PFIFO_MAX_PACKET_LEN - 1);
 
       PUSH_SPACE(push, nr + 2);
       PUSH_REFN (push, bo, NOUVEAU_BO_WR | domain);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index 188c7d7cdc8..c464904d6d4 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -29,6 +29,7 @@
 #include "translate/translate.h"
 
 #include "nvc0/nvc0_context.h"
+#include "nvc0/nvc0_query_hw.h"
 #include "nvc0/nvc0_resource.h"
 
 #include "nvc0/nvc0_3d.xml.h"
@@ -775,7 +776,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0,
       res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
       PUSH_SPACE(push, 2);
       IMMED_NVC0(push, NVC0_3D(SERIALIZE), 0);
-      nvc0_query_fifo_wait(push, so->pq);
+      nvc0_hw_query_fifo_wait(push, nvc0_query(so->pq));
       if (nvc0->screen->eng3d->oclass < GM107_3D_CLASS)
          IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FLUSH), 0);
 
@@ -791,7 +792,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0,
       BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_STRIDE), 1);
       PUSH_DATA (push, so->stride);
       BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_BYTES), 1);
-      nvc0_query_pushbuf_submit(push, so->pq, 0x4);
+      nvc0_hw_query_pushbuf_submit(push, nvc0_query(so->pq), 0x4);
       IMMED_NVC0(push, NVC0_3D(VERTEX_END_GL), 0);
 
       mode |= NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT;
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index efb4889e562..32ce76a9e07 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -305,7 +305,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 
 	case PIPE_CAP_GLSL_FEATURE_LEVEL:
 		if (family >= CHIP_CEDAR)
-		   return 330;
+		   return 410;
 		/* pre-evergreen geom shaders need newer kernel */
 		if (rscreen->b.info.drm_minor >= 37)
 		   return 330;
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 1d905822cde..8efe902a329 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -166,8 +166,6 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
     if (rctx->b.chip_class <= R700) {
 	    use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_GEOMETRY);
     }
-	/* disable SB for shaders using CF_INDEX_0/1 (sampler/ubo array indexing) as it doesn't handle those currently */
-	use_sb &= !shader->shader.uses_index_registers;
 	/* disable SB for shaders using doubles */
 	use_sb &= !shader->shader.uses_doubles;
 
@@ -1250,9 +1248,6 @@ static int tgsi_split_constant(struct r600_shader_ctx *ctx)
 			continue;
 		}
 
-		if (ctx->src[i].kc_rel)
-			ctx->shader->uses_index_registers = true;
-
 		if (ctx->src[i].rel) {
 			int chan = inst->Src[i].Indirect.Swizzle;
 			int treg = r600_get_temp(ctx);
@@ -1912,7 +1907,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 
 	shader->uses_doubles = ctx.info.uses_doubles;
 
-	indirect_gprs = ctx.info.indirect_files & ~(1 << TGSI_FILE_CONSTANT);
+	indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
 	tgsi_parse_init(&ctx.parse, tokens);
 	ctx.type = ctx.info.processor;
 	shader->processor_type = ctx.type;
@@ -1936,7 +1931,6 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	ctx.gs_next_vertex = 0;
 	ctx.gs_stream_output_info = &so;
 
-	shader->uses_index_registers = false;
 	ctx.face_gpr = -1;
 	ctx.fixed_pt_position_gpr = -1;
 	ctx.fragcoord_input = -1;
@@ -5703,8 +5697,6 @@ static int tgsi_tex(struct r600_shader_ctx *ctx)
 		sampler_src_reg = 3;
 
 	sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
-	if (sampler_index_mode)
-		ctx->shader->uses_index_registers = true;
 
 	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
 
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index 48de9cdb156..c240e7110c1 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -75,8 +75,6 @@ struct r600_shader {
 	boolean			has_txq_cube_array_z_comp;
 	boolean			uses_tex_buffers;
 	boolean                 gs_prim_id_input;
-	/* Temporarily workaround SB not handling CF_INDEX_[01] index registers */
-	boolean			uses_index_registers;
 
 	/* Size in bytes of a data item in the ring(s) (single vertex data).
 	   Stages with only one ring items 123 will be set to 0. */
diff --git a/src/gallium/drivers/r600/r600_uvd.c b/src/gallium/drivers/r600/r600_uvd.c
index 357e9017a65..e2e9033ea2c 100644
--- a/src/gallium/drivers/r600/r600_uvd.c
+++ b/src/gallium/drivers/r600/r600_uvd.c
@@ -47,8 +47,11 @@
 #include "r600_pipe.h"
 #include "radeon/radeon_video.h"
 #include "radeon/radeon_uvd.h"
+#include "radeon/radeon_vce.h"
 #include "r600d.h"
 
+#define R600_UVD_ENABLE_TILING 0
+
 /**
  * creates an video buffer with an UVD compatible memory layout
  */
@@ -77,7 +80,7 @@ struct pipe_video_buffer *r600_video_buffer_create(struct pipe_context *pipe,
 	template.height = align(tmpl->height / array_size, VL_MACROBLOCK_HEIGHT);
 
 	vl_video_buffer_template(&templ, &template, resource_formats[0], 1, array_size, PIPE_USAGE_DEFAULT, 0);
-	if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced)
+	if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced || !R600_UVD_ENABLE_TILING)
 		templ.bind = PIPE_BIND_LINEAR;
 	resources[0] = (struct r600_texture *)
 		pipe->screen->resource_create(pipe->screen, &templ);
@@ -86,7 +89,7 @@ struct pipe_video_buffer *r600_video_buffer_create(struct pipe_context *pipe,
 
 	if (resource_formats[1] != PIPE_FORMAT_NONE) {
 		vl_video_buffer_template(&templ, &template, resource_formats[1], 1, array_size, PIPE_USAGE_DEFAULT, 1);
-		if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced)
+		if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced || !R600_UVD_ENABLE_TILING)
 			templ.bind = PIPE_BIND_LINEAR;
 		resources[1] = (struct r600_texture *)
 			pipe->screen->resource_create(pipe->screen, &templ);
@@ -96,7 +99,7 @@ struct pipe_video_buffer *r600_video_buffer_create(struct pipe_context *pipe,
 
 	if (resource_formats[2] != PIPE_FORMAT_NONE) {
 		vl_video_buffer_template(&templ, &template, resource_formats[2], 1, array_size, PIPE_USAGE_DEFAULT, 2);
-		if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced)
+		if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced || !R600_UVD_ENABLE_TILING)
 			templ.bind = PIPE_BIND_LINEAR;
 		resources[2] = (struct r600_texture *)
 			pipe->screen->resource_create(pipe->screen, &templ);
@@ -166,9 +169,28 @@ static struct radeon_winsys_cs_handle* r600_uvd_set_dtb(struct ruvd_msg *msg, st
 	return luma->resource.cs_buf;
 }
 
+/* get the radeon resources for VCE; handle and surface are optional outputs (a NULL pointer is skipped) */
+static void r600_vce_get_buffer(struct pipe_resource *resource,
+				struct radeon_winsys_cs_handle **handle,
+				struct radeon_surf **surface)
+{
+	struct r600_texture *res = (struct r600_texture *)resource;
+
+	if (handle)
+		*handle = res->resource.cs_buf;
+
+	if (surface)
+		*surface = &res->surface;
+}
+
 /* create decoder */
 struct pipe_video_codec *r600_uvd_create_decoder(struct pipe_context *context,
-						   const struct pipe_video_codec *templat)
+						 const struct pipe_video_codec *templat)
 {
+	struct r600_context *ctx = (struct r600_context *)context;
+	/* encode entrypoints get a VCE encoder; every other entrypoint falls through to the UVD decoder */
+	if (templat->entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE)
+		return rvce_create_encoder(context, templat, ctx->b.ws, r600_vce_get_buffer);
+
 	return ruvd_create_decoder(context, templat, r600_uvd_set_dtb);
 }
diff --git a/src/gallium/drivers/r600/sb/sb_bc.h b/src/gallium/drivers/r600/sb/sb_bc.h
index ab988f8716d..9c2a9170436 100644
--- a/src/gallium/drivers/r600/sb/sb_bc.h
+++ b/src/gallium/drivers/r600/sb/sb_bc.h
@@ -48,6 +48,7 @@ class fetch_node;
 class alu_group_node;
 class region_node;
 class shader;
+class value;
 
 class sb_ostream {
 public:
@@ -477,7 +478,9 @@ struct bc_cf {
 
 	bool is_alu_extended() {
 		assert(op_ptr->flags & CF_ALU);
-		return kc[2].mode != KC_LOCK_NONE || kc[3].mode != KC_LOCK_NONE;
+		return kc[2].mode != KC_LOCK_NONE || kc[3].mode != KC_LOCK_NONE ||
+			kc[0].index_mode != KC_INDEX_NONE || kc[1].index_mode != KC_INDEX_NONE ||
+			kc[2].index_mode != KC_INDEX_NONE || kc[3].index_mode != KC_INDEX_NONE;
 	}
 
 };
@@ -818,13 +821,16 @@ class bc_parser {
 
 	bool gpr_reladdr;
 
+	// Note: currently relies on input emitting SET_CF in same basic block as uses
+	value *cf_index_value[2];
+	alu_node *mova;
 public:
 
 	bc_parser(sb_context &sctx, r600_bytecode *bc, r600_shader* pshader) :
 		ctx(sctx), dec(), bc(bc), pshader(pshader),
 		dw(), bc_ndw(), max_cf(),
 		sh(), error(), slots(), cgroup(),
-		cf_map(), loop_stack(), gpr_reladdr() { }
+		cf_map(), loop_stack(), gpr_reladdr(), cf_index_value(), mova() { }
 
 	int decode();
 	int prepare();
@@ -852,6 +858,10 @@ private:
 	int prepare_loop(cf_node *c);
 	int prepare_if(cf_node *c);
 
+	void save_set_cf_index(value *val, unsigned idx);
+	value *get_cf_index_value(unsigned idx);
+	void save_mova(alu_node *mova);
+	alu_node *get_mova();
 };
 
 
diff --git a/src/gallium/drivers/r600/sb/sb_bc_dump.cpp b/src/gallium/drivers/r600/sb/sb_bc_dump.cpp
index 0fc73c419a6..3c70ea7cd3d 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_dump.cpp
+++ b/src/gallium/drivers/r600/sb/sb_bc_dump.cpp
@@ -27,6 +27,7 @@
 #include "sb_bc.h"
 #include "sb_shader.h"
 #include "sb_pass.h"
+#include "eg_sq.h" // V_SQ_CF_INDEX_0/1
 
 namespace r600_sb {
 
@@ -354,6 +355,14 @@ void bc_dump::dump(alu_node& n) {
 			s << "  " << vec_bs[n.bc.bank_swizzle];
 	}
 
+	if (ctx.is_cayman()) {
+		if (n.bc.op == ALU_OP1_MOVA_INT) {
+			static const char *mova_str[] = { " AR_X", " PC", " CF_IDX0", " CF_IDX1",
+				" Unknown MOVA_INT dest" };
+			s << mova_str[std::min(n.bc.dst_gpr, 4u)];  // CM_V_SQ_MOVA_DST_AR_*
+		}
+	}
+
 	sblog << s.str() << "\n";
 }
 
@@ -450,9 +459,9 @@ void bc_dump::dump(fetch_node& n) {
 		if (n.bc.fetch_whole_quad)
 			s << " FWQ";
 		if (ctx.is_egcm() && n.bc.resource_index_mode)
-			s << " RIM:SQ_CF_INDEX_" << n.bc.resource_index_mode;
+			s << " RIM:SQ_CF_INDEX_" << (n.bc.resource_index_mode - V_SQ_CF_INDEX_0);
 		if (ctx.is_egcm() && n.bc.sampler_index_mode)
-			s << " SID:SQ_CF_INDEX_" << n.bc.sampler_index_mode;
+			s << " SID:SQ_CF_INDEX_" << (n.bc.sampler_index_mode - V_SQ_CF_INDEX_0);
 
 		s << " UCF:" << n.bc.use_const_fields
 				<< " FMT(DTA:" << n.bc.data_format
@@ -470,9 +479,9 @@ void bc_dump::dump(fetch_node& n) {
 			if (n.bc.offset[k])
 				s << " O" << chans[k] << ":" << n.bc.offset[k];
 		if (ctx.is_egcm() && n.bc.resource_index_mode)
-			s << " RIM:SQ_CF_INDEX_" << n.bc.resource_index_mode;
+			s << " RIM:SQ_CF_INDEX_" << (n.bc.resource_index_mode - V_SQ_CF_INDEX_0);
 		if (ctx.is_egcm() && n.bc.sampler_index_mode)
-			s << " SID:SQ_CF_INDEX_" << n.bc.sampler_index_mode;
+			s << " SID:SQ_CF_INDEX_" << (n.bc.sampler_index_mode - V_SQ_CF_INDEX_0);
 	}
 
 	sblog << s.str() << "\n";
diff --git a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
index 522ff9d956e..82826a90921 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
+++ b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
@@ -303,7 +303,8 @@ void bc_finalizer::finalize_alu_group(alu_group_node* g, node *prev_node) {
 			assert(fdst.chan() == slot || slot == SLOT_TRANS);
 		}
 
-		n->bc.dst_gpr = fdst.sel();
+		if (!(n->bc.op_ptr->flags & AF_MOVA && ctx.is_cayman()))
+			n->bc.dst_gpr = fdst.sel();
 		n->bc.dst_chan = d ? fdst.chan() : slot < SLOT_TRANS ? slot : 0;
 
 
@@ -514,7 +515,7 @@ void bc_finalizer::copy_fetch_src(fetch_node &dst, fetch_node &src, unsigned arg
 
 void bc_finalizer::emit_set_grad(fetch_node* f) {
 
-	assert(f->src.size() == 12);
+	assert(f->src.size() == 12 || f->src.size() == 13);
 	unsigned ops[2] = { FETCH_OP_SET_GRADIENTS_V, FETCH_OP_SET_GRADIENTS_H };
 
 	unsigned arg_start = 0;
@@ -809,8 +810,8 @@ void bc_finalizer::finalize_cf(cf_node* c) {
 }
 
 sel_chan bc_finalizer::translate_kcache(cf_node* alu, value* v) {
-	unsigned sel = v->select.sel();
-	unsigned bank = sel >> 12;
+	unsigned sel = v->select.kcache_sel();
+	unsigned bank = v->select.kcache_bank();
 	unsigned chan = v->select.chan();
 	static const unsigned kc_base[] = {128, 160, 256, 288};
 
diff --git a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
index 19bd0784a61..28ebfa2ce62 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
+++ b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
@@ -34,6 +34,7 @@
 
 #include "r600_pipe.h"
 #include "r600_shader.h"
+#include "eg_sq.h" // CM_V_SQ_MOVA_DST_CF_IDX0/1
 
 #include <stack>
 
@@ -121,7 +122,7 @@ int bc_parser::parse_decls() {
 		return 0;
 	}
 
-	if (pshader->indirect_files & ~(1 << TGSI_FILE_CONSTANT)) {
+	if (pshader->indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER))) {
 
 		assert(pshader->num_arrays);
 
@@ -328,6 +329,29 @@ int bc_parser::prepare_alu_clause(cf_node* cf) {
 	return 0;
 }
 
+void bc_parser::save_set_cf_index(value *val, unsigned idx) // idx: 0 = CF_IDX0, 1 = CF_IDX1
+{
+	assert(idx <= 1);
+	assert(val);
+	cf_index_value[idx] = val; // kept until the next save for the same idx
+}
+value *bc_parser::get_cf_index_value(unsigned idx) // returns the value last saved for CF_IDX<idx>
+{
+	assert(idx <= 1);
+	assert(cf_index_value[idx]); // a value must have been saved first; unchecked when asserts are compiled out
+	return cf_index_value[idx];
+}
+void bc_parser::save_mova(alu_node *mova) // remember the most recent MOVA node for get_mova()
+{
+	assert(mova);
+	this->mova = mova;
+}
+alu_node *bc_parser::get_mova() // returns the node stored by the last save_mova()
+{
+	assert(mova); // a MOVA must have been saved first; unchecked when asserts are compiled out
+	return mova;
+}
+
 int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) {
 
 	alu_node *n;
@@ -338,6 +362,7 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) {
 	for (node_iterator I = g->begin(), E = g->end();
 			I != E; ++I) {
 		n = static_cast<alu_node*>(*I);
+		bool ubo_indexing[2] = {};
 
 		if (!sh->assign_slot(n, slots[cgroup])) {
 			assert(!"alu slot assignment failed");
@@ -375,9 +400,14 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) {
 			n->dst.resize(1);
 		}
 
-		if (flags & AF_MOVA) {
+		if (n->bc.op == ALU_OP0_SET_CF_IDX0 || n->bc.op == ALU_OP0_SET_CF_IDX1) {
+			// Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX
+			// DCE will kill this op
+			save_set_cf_index(get_mova()->src[0], n->bc.op == ALU_OP0_SET_CF_IDX1);
+		} else if (flags & AF_MOVA) {
 
 			n->dst[0] = sh->get_special_value(SV_AR_INDEX);
+			save_mova(n);
 
 			n->flags |= NF_DONT_HOIST;
 
@@ -432,7 +462,12 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) {
 
 				bc_kcache &kc = cf->bc.kc[kc_set];
 				kc_addr = (kc.addr << 4) + (sel & 0x1F);
-				n->src[s] = sh->get_kcache_value(kc.bank, kc_addr, src.chan);
+				n->src[s] = sh->get_kcache_value(kc.bank, kc_addr, src.chan, (alu_kcache_index_mode)kc.index_mode);
+
+				if (kc.index_mode != KC_INDEX_NONE) {
+					assert(kc.index_mode != KC_LOCK_LOOP);
+					ubo_indexing[kc.index_mode - KC_INDEX_0] = true;
+				}
 			} else if (src.sel < MAX_GPR) {
 				value *v = sh->get_gpr_value(true, src.sel, src.chan, src.rel);
 
@@ -469,6 +504,19 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) {
 				}
 			}
 		}
+
+		// add UBO index values if any as dependencies
+		if (ubo_indexing[0]) {
+			n->src.push_back(get_cf_index_value(0));
+		}
+		if (ubo_indexing[1]) {
+			n->src.push_back(get_cf_index_value(1));
+		}
+
+		if ((n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX0 || n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1) &&
+		    ctx.is_cayman())
+			// Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX
+			save_set_cf_index(n->src[0], n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1);
 	}
 
 	// pack multislot instructions into alu_packed_node
@@ -608,6 +656,13 @@ int bc_parser::prepare_fetch_clause(cf_node *cf) {
 					                              n->bc.src_sel[s], false);
 			}
 
+			// Scheduler will emit the appropriate instructions to set CF_IDX0/1
+			if (n->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) {
+				n->src.push_back(get_cf_index_value(n->bc.sampler_index_mode == V_SQ_CF_INDEX_1));
+			}
+			if (n->bc.resource_index_mode != V_SQ_CF_INDEX_NONE) {
+				n->src.push_back(get_cf_index_value(n->bc.resource_index_mode == V_SQ_CF_INDEX_1));
+			}
 		}
 	}
 
diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp b/src/gallium/drivers/r600/sb/sb_expr.cpp
index 9c2274e65a3..556a05da395 100644
--- a/src/gallium/drivers/r600/sb/sb_expr.cpp
+++ b/src/gallium/drivers/r600/sb/sb_expr.cpp
@@ -403,7 +403,8 @@ bool expr_handler::fold_alu_op1(alu_node& n) {
 		if ((n.bc.op == ALU_OP1_MOV || n.bc.op == ALU_OP1_MOVA_INT ||
 				n.bc.op == ALU_OP1_MOVA_GPR_INT)
 				&& n.bc.clamp == 0 && n.bc.omod == 0
-				&& n.bc.src[0].abs == 0 && n.bc.src[0].neg == 0) {
+				&& n.bc.src[0].abs == 0 && n.bc.src[0].neg == 0 &&
+				n.src.size() == 1 /* RIM/SIM can be appended as additional values */) {
 			assign_source(n.dst[0], v0);
 			return true;
 		}
diff --git a/src/gallium/drivers/r600/sb/sb_gcm.cpp b/src/gallium/drivers/r600/sb/sb_gcm.cpp
index bccb6713967..236b2ea0031 100644
--- a/src/gallium/drivers/r600/sb/sb_gcm.cpp
+++ b/src/gallium/drivers/r600/sb/sb_gcm.cpp
@@ -37,6 +37,7 @@
 #include "sb_bc.h"
 #include "sb_shader.h"
 #include "sb_pass.h"
+#include "eg_sq.h" // V_SQ_CF_INDEX_NONE
 
 namespace r600_sb {
 
@@ -406,6 +407,14 @@ void gcm::bu_sched_bb(bb_node* bb) {
 					ncnt = 3;
 				}
 
+				bool sampler_indexing = false;
+				if (n->is_fetch_inst() &&
+					static_cast<fetch_node *>(n)->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE)
+				{
+					sampler_indexing = true; // Give sampler indexed ops get their own clause
+					ncnt = sh.get_ctx().is_cayman() ? 2 : 3; // MOVA + SET_CF_IDX0/1
+				}
+
 				if ((sq == SQ_TEX || sq == SQ_VTX) &&
 						((last_count >= ctx.max_fetch/2 &&
 						check_alu_ready_count(24)) ||
@@ -418,7 +427,7 @@ void gcm::bu_sched_bb(bb_node* bb) {
 				bu_ready[sq].pop_front();
 
 				if (sq != SQ_CF) {
-					if (!clause) {
+					if (!clause || sampler_indexing) {
 						clause = sh.create_clause(sq == SQ_ALU ?
 								NST_ALU_CLAUSE :
 									sq == SQ_TEX ? NST_TEX_CLAUSE :
diff --git a/src/gallium/drivers/r600/sb/sb_ir.h b/src/gallium/drivers/r600/sb/sb_ir.h
index 560a4a9b284..c612e6c4ec6 100644
--- a/src/gallium/drivers/r600/sb/sb_ir.h
+++ b/src/gallium/drivers/r600/sb/sb_ir.h
@@ -62,6 +62,13 @@ struct sel_chan
 
 	static unsigned sel(unsigned idx) { return (idx-1) >> 2; }
 	static unsigned chan(unsigned idx) { return (idx-1) & 3; }
+
+	sel_chan(unsigned bank, unsigned index,
+			 unsigned chan, alu_kcache_index_mode index_mode)
+		: id(sel_chan((bank << 12) | index | ((unsigned)index_mode << 28), chan).id) {}
+	unsigned kcache_index_mode() const { return sel() >> 28; }
+	unsigned kcache_sel() const { return sel() & 0x0fffffffu; }
+	unsigned kcache_bank() const { return kcache_sel() >> 12; }
 };
 
 inline sb_ostream& operator <<(sb_ostream& o, sel_chan r) {
diff --git a/src/gallium/drivers/r600/sb/sb_sched.cpp b/src/gallium/drivers/r600/sb/sb_sched.cpp
index c98b8fff764..5113b756847 100644
--- a/src/gallium/drivers/r600/sb/sb_sched.cpp
+++ b/src/gallium/drivers/r600/sb/sb_sched.cpp
@@ -36,6 +36,7 @@
 #include "sb_shader.h"
 #include "sb_pass.h"
 #include "sb_sched.h"
+#include "eg_sq.h" // V_SQ_CF_INDEX_NONE/0/1
 
 namespace r600_sb {
 
@@ -781,7 +782,14 @@ void post_scheduler::schedule_bb(bb_node* bb) {
 			sblog << "\n";
 		);
 
-		if (n->subtype == NST_ALU_CLAUSE) {
+		// May require emitting ALU ops to load index registers
+		if (n->is_fetch_clause()) {
+			n->remove();
+			process_fetch(static_cast<container_node *>(n));
+			continue;
+		}
+
+		if (n->is_alu_clause()) {
 			n->remove();
 			process_alu(static_cast<container_node*>(n));
 			continue;
@@ -823,6 +831,108 @@ void post_scheduler::init_regmap() {
 	}
 }
 
+static alu_node *create_set_idx(shader &sh, unsigned ar_idx) {
+	alu_node *a = sh.create_alu();
+
+	assert(ar_idx == V_SQ_CF_INDEX_0 || ar_idx == V_SQ_CF_INDEX_1);
+	if (ar_idx == V_SQ_CF_INDEX_0)
+		a->bc.set_op(ALU_OP0_SET_CF_IDX0);
+	else
+		a->bc.set_op(ALU_OP0_SET_CF_IDX1);
+	a->bc.slot = SLOT_X;
+	a->dst.resize(1); // Dummy needed for recolor
+
+	PSC_DUMP(
+		sblog << "created IDX load: ";
+		dump::dump_op(a);
+		sblog << "\n";
+	);
+
+	return a;
+}
+
+void post_scheduler::load_index_register(value *v, unsigned ar_idx)
+{
+	alu.reset();
+
+	if (!sh.get_ctx().is_cayman()) {
+		// Evergreen has to first load address register, then use SET_CF_IDX0/1
+		alu_group_tracker &rt = alu.grp();
+		alu_node *set_idx = create_set_idx(sh, ar_idx);
+		if (!rt.try_reserve(set_idx)) {
+			sblog << "can't emit SET_CF_IDX";
+			dump::dump_op(set_idx);
+			sblog << "\n";
+		}
+		process_group();
+
+		if (!alu.check_clause_limits()) {
+			// Can't happen since clause only contains MOVA/SET_CF_IDX0/1
+		}
+		alu.emit_group();
+	}
+
+	alu_group_tracker &rt = alu.grp();
+	alu_node *a = alu.create_ar_load(v, ar_idx == V_SQ_CF_INDEX_1 ? SEL_Z : SEL_Y);
+
+	if (!rt.try_reserve(a)) {
+		sblog << "can't emit AR load : ";
+		dump::dump_op(a);
+		sblog << "\n";
+	}
+
+	process_group();
+
+	if (!alu.check_clause_limits()) {
+		// Can't happen since clause only contains MOVA/SET_CF_IDX0/1
+	}
+
+	alu.emit_group();
+	alu.emit_clause(cur_bb);
+}
+
+void post_scheduler::process_fetch(container_node *c) {
+	if (c->empty())
+		return;
+
+	for (node_iterator N, I = c->begin(), E = c->end(); I != E; I = N) {
+		N = I;
+		++N;
+
+		node *n = *I;
+
+		fetch_node *f = static_cast<fetch_node*>(n);
+
+		PSC_DUMP(
+			sblog << "process_tex ";
+			dump::dump_op(n);
+			sblog << "  ";
+		);
+
+		// TODO: If same values used can avoid reloading index register
+		if (f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE ||
+			f->bc.resource_index_mode != V_SQ_CF_INDEX_NONE) {
+			unsigned index_mode = f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE ?
+				f->bc.sampler_index_mode : f->bc.resource_index_mode;
+
+			// Currently require prior opt passes to use one TEX per indexed op
+			assert(f->parent->count() == 1);
+
+			value *v = f->src.back(); // Last src is index offset
+			assert(v);
+
+			cur_bb->push_front(c);
+
+			load_index_register(v, index_mode);
+			f->src.pop_back(); // Don't need index value any more
+
+			return;
+		}
+	}
+
+	cur_bb->push_front(c);
+}
+
 void post_scheduler::process_alu(container_node *c) {
 
 	if (c->empty())
@@ -855,6 +965,7 @@ void post_scheduler::process_alu(container_node *c) {
 
 		if (uc) {
 			n->remove();
+
 			pending.push_back(n);
 			PSC_DUMP( sblog << "pending\n"; );
 		} else {
@@ -997,6 +1108,18 @@ void post_scheduler::init_globals(val_set &s, bool prealloc) {
 	}
 }
 
+void post_scheduler::emit_index_registers() {
+	for (unsigned i = 0; i < 2; i++) {
+		if (alu.current_idx[i]) {
+			regmap = prev_regmap;
+			alu.discard_current_group();
+
+			load_index_register(alu.current_idx[i], KC_INDEX_0 + i);
+			alu.current_idx[i] = NULL;
+		}
+	}
+}
+
 void post_scheduler::emit_clause() {
 
 	if (alu.current_ar) {
@@ -1005,7 +1128,11 @@ void post_scheduler::emit_clause() {
 		alu.emit_group();
 	}
 
-	alu.emit_clause(cur_bb);
+	if (!alu.is_empty()) {
+		alu.emit_clause(cur_bb);
+	}
+
+	emit_index_registers();
 }
 
 void post_scheduler::schedule_alu(container_node *c) {
@@ -1017,6 +1144,14 @@ void post_scheduler::schedule_alu(container_node *c) {
 		prev_regmap = regmap;
 
 		if (!prepare_alu_group()) {
+			if (alu.current_idx[0] || alu.current_idx[1]) {
+				regmap = prev_regmap;
+				emit_clause();
+				init_globals(live, false);
+
+				continue;
+			}
+
 			if (alu.current_ar) {
 				emit_load_ar();
 				continue;
@@ -1028,6 +1163,7 @@ void post_scheduler::schedule_alu(container_node *c) {
 			regmap = prev_regmap;
 			emit_clause();
 			init_globals(live, false);
+
 			continue;
 		}
 
@@ -1180,7 +1316,7 @@ void post_scheduler::emit_load_ar() {
 	alu.discard_current_group();
 
 	alu_group_tracker &rt = alu.grp();
-	alu_node *a = alu.create_ar_load();
+	alu_node *a = alu.create_ar_load(alu.current_ar, SEL_X);
 
 	if (!rt.try_reserve(a)) {
 		sblog << "can't emit AR load : ";
@@ -1287,6 +1423,42 @@ bool post_scheduler::map_src_val(value *v) {
 }
 
 bool post_scheduler::map_src_vec(vvec &vv, bool src) {
+	if (src) {
+		// Handle possible UBO indexing
+		bool ubo_indexing[2] = { false, false };
+		for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
+			value *v = *I;
+			if (!v)
+				continue;
+
+			if (v->is_kcache()) {
+				unsigned index_mode = v->select.kcache_index_mode();
+				if (index_mode == KC_INDEX_0 || index_mode == KC_INDEX_1) {
+					ubo_indexing[index_mode - KC_INDEX_0] = true;
+				}
+			}
+		}
+
+		// idx values stored at end of src vec, see bc_parser::prepare_alu_group
+		for (unsigned i = 2; i != 0; i--) {
+			if (ubo_indexing[i-1]) {
+				// TODO: skip adding value to kcache reservation somehow, causes
+				// unnecessary group breaks and cache line locks
+				value *v = vv.back();
+				if (alu.current_idx[i-1] && alu.current_idx[i-1] != v) {
+					PSC_DUMP(
+						sblog << "IDX" << i-1 << " already set to " <<
+						*alu.current_idx[i-1] << ", trying to set " << *v << "\n";
+					);
+					return false;
+				}
+
+				alu.current_idx[i-1] = v;
+				PSC_DUMP(sblog << "IDX" << i-1 << " set to " << *v << "\n";);
+			}
+		}
+	}
+
 	for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
 		value *v = *I;
 		if (!v)
@@ -1352,6 +1524,10 @@ void post_scheduler::dump_regmap() {
 		sblog << "    current_AR: " << *alu.current_ar << "\n";
 	if (alu.current_pr)
 		sblog << "    current_PR: " << *alu.current_pr << "\n";
+	if (alu.current_idx[0])
+		sblog << "    current IDX0: " << *alu.current_idx[0] << "\n";
+	if (alu.current_idx[1])
+		sblog << "    current IDX1: " << *alu.current_idx[1] << "\n";
 }
 
 void post_scheduler::recolor_locals() {
@@ -1441,6 +1617,13 @@ unsigned post_scheduler::try_add_instruction(node *n) {
 
 	unsigned avail_slots = rt.avail_slots();
 
+	// Cannot schedule in same clause as instructions using this index value
+	if (!n->dst.empty() && n->dst[0] &&
+		(n->dst[0] == alu.current_idx[0] || n->dst[0] == alu.current_idx[1])) {
+		PSC_DUMP(sblog << "   CF_IDX source: " << *n->dst[0] << "\n";);
+		return 0;
+	}
+
 	if (n->is_alu_packed()) {
 		alu_packed_node *p = static_cast<alu_packed_node*>(n);
 		unsigned slots = p->get_slot_mask();
@@ -1770,7 +1953,7 @@ alu_clause_tracker::alu_clause_tracker(shader &sh)
 	  grp0(sh), grp1(sh),
 	  group(), clause(),
 	  push_exec_mask(),
-	  current_ar(), current_pr() {}
+	  current_ar(), current_pr(), current_idx() {}
 
 void alu_clause_tracker::emit_group() {
 
@@ -1827,6 +2010,8 @@ bool alu_clause_tracker::check_clause_limits() {
 
 	// reserving slots to load AR and PR values
 	unsigned reserve_slots = (current_ar ? 1 : 0) + (current_pr ? 1 : 0);
+	// ...and index registers
+	reserve_slots += (current_idx[0] != NULL) + (current_idx[1] != NULL);
 
 	if (slot_count + slots > MAX_ALU_SLOTS - reserve_slots)
 		return false;
@@ -1892,13 +2077,15 @@ unsigned rp_kcache_tracker::get_lines(kc_lines& lines) {
 	unsigned cnt = 0;
 
 	for (unsigned i = 0; i < sel_count; ++i) {
-		unsigned line = rp[i];
+		unsigned line = rp[i] & 0x1fffffffu;
+		unsigned index_mode = rp[i] >> 29;
 
 		if (!line)
 			return cnt;
 
 		--line;
 		line = (sel_count == 2) ? line >> 5 : line >> 6;
+		line |= index_mode << 29;
 
 		if (lines.insert(line).second)
 			++cnt;
@@ -1913,14 +2100,18 @@ bool alu_kcache_tracker::update_kc() {
 	memcpy(old_kc, kc, sizeof(kc));
 
 	for (kc_lines::iterator I = lines.begin(), E = lines.end(); I != E; ++I) {
-		unsigned line = *I;
+		unsigned index_mode = *I >> 29;
+		unsigned line = *I & 0x1fffffffu;
 		unsigned bank = line >> 8;
 
+		assert(index_mode <= KC_INDEX_INVALID);
 		line &= 0xFF;
 
-		if (c && (bank == kc[c-1].bank) && (kc[c-1].addr + 1 == line))
-			++kc[c-1].mode;
-		else {
+		if (c && (bank == kc[c-1].bank) && (kc[c-1].addr + 1 == line) &&
+			kc[c-1].index_mode == index_mode)
+		{
+			kc[c-1].mode = KC_LOCK_2;
+		} else {
 			if (c == max_kcs) {
 				memcpy(kc, old_kc, sizeof(kc));
 				return false;
@@ -1930,17 +2121,16 @@ bool alu_kcache_tracker::update_kc() {
 
 			kc[c].bank = bank;
 			kc[c].addr = line;
+			kc[c].index_mode = index_mode;
 			++c;
 		}
 	}
 	return true;
 }
 
-alu_node* alu_clause_tracker::create_ar_load() {
+alu_node* alu_clause_tracker::create_ar_load(value *v, chan_select ar_channel) {
 	alu_node *a = sh.create_alu();
 
-	// FIXME use MOVA_GPR on R6xx
-
 	if (sh.get_ctx().uses_mova_gpr) {
 		a->bc.set_op(ALU_OP1_MOVA_GPR_INT);
 		a->bc.slot = SLOT_TRANS;
@@ -1948,9 +2138,13 @@ alu_node* alu_clause_tracker::create_ar_load() {
 		a->bc.set_op(ALU_OP1_MOVA_INT);
 		a->bc.slot = SLOT_X;
 	}
+	a->bc.dst_chan = ar_channel;
+	if (ar_channel != SEL_X && sh.get_ctx().is_cayman()) {
+		a->bc.dst_gpr = ar_channel == SEL_Y ? CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1;
+	}
 
 	a->dst.resize(1);
-	a->src.push_back(current_ar);
+	a->src.push_back(v);
 
 	PSC_DUMP(
 		sblog << "created AR load: ";
diff --git a/src/gallium/drivers/r600/sb/sb_sched.h b/src/gallium/drivers/r600/sb/sb_sched.h
index 87c45867e16..05b428ca884 100644
--- a/src/gallium/drivers/r600/sb/sb_sched.h
+++ b/src/gallium/drivers/r600/sb/sb_sched.h
@@ -66,6 +66,7 @@ public:
 class literal_tracker {
 	literal lt[4];
 	unsigned uc[4];
+
 public:
 	literal_tracker() : lt(), uc() {}
 
@@ -219,6 +220,8 @@ public:
 	// bottom-up)
 	value *current_ar;
 	value *current_pr;
+	// current values of CF_IDX registers that need preloading
+	value *current_idx[2];
 
 	alu_clause_tracker(shader &sh);
 
@@ -235,7 +238,7 @@ public:
 	void new_group();
 	bool is_empty();
 
-	alu_node* create_ar_load();
+	alu_node* create_ar_load(value *v, chan_select ar_channel);
 
 	void discard_current_group();
 
@@ -256,6 +259,7 @@ class post_scheduler : public pass {
 
 	val_set cleared_interf;
 
+	void emit_index_registers();
 public:
 
 	post_scheduler(shader &sh) : pass(sh),
@@ -266,6 +270,9 @@ public:
 	void run_on(container_node *n);
 	void schedule_bb(bb_node *bb);
 
+	void load_index_register(value *v, unsigned idx);
+	void process_fetch(container_node *c);
+
 	void process_alu(container_node *c);
 	void schedule_alu(container_node *c);
 	bool prepare_alu_group();
diff --git a/src/gallium/drivers/r600/sb/sb_shader.cpp b/src/gallium/drivers/r600/sb/sb_shader.cpp
index f996c0786d1..87e28e98157 100644
--- a/src/gallium/drivers/r600/sb/sb_shader.cpp
+++ b/src/gallium/drivers/r600/sb/sb_shader.cpp
@@ -188,9 +188,9 @@ value* shader::create_temp_value() {
 	return get_value(VLK_TEMP, id, 0);
 }
 
-value* shader::get_kcache_value(unsigned bank, unsigned index, unsigned chan) {
+value* shader::get_kcache_value(unsigned bank, unsigned index, unsigned chan, alu_kcache_index_mode index_mode) {
 	return get_ro_value(kcache_values, VLK_KCACHE,
-			sel_chan((bank << 12) | index, chan));
+			sel_chan(bank, index, chan, index_mode));
 }
 
 void shader::add_input(unsigned gpr, bool preloaded, unsigned comp_mask) {
diff --git a/src/gallium/drivers/r600/sb/sb_shader.h b/src/gallium/drivers/r600/sb/sb_shader.h
index 7955bba9b67..70bea891b76 100644
--- a/src/gallium/drivers/r600/sb/sb_shader.h
+++ b/src/gallium/drivers/r600/sb/sb_shader.h
@@ -323,7 +323,7 @@ public:
 
 
 	value* get_special_ro_value(unsigned sel);
-	value* get_kcache_value(unsigned bank, unsigned index, unsigned chan);
+	value* get_kcache_value(unsigned bank, unsigned index, unsigned chan, alu_kcache_index_mode index_mode);
 
 	value* get_value_version(value* v, unsigned ver);
 
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 2e9a0135647..ac99e732c94 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -272,6 +272,15 @@ static LLVMValueRef fetch_system_value(
 	return bitcast(bld_base, type, cval);
 }
 
+static LLVMValueRef si_build_alloca_undef(struct gallivm_state *gallivm,
+					  LLVMTypeRef type,
+					  const char *name)
+{
+	LLVMValueRef ptr = lp_build_alloca(gallivm, type, name);
+	LLVMBuildStore(gallivm->builder, LLVMGetUndef(type), ptr);
+	return ptr;
+}
+
 static void emit_declaration(
 	struct lp_build_tgsi_context * bld_base,
 	const struct tgsi_full_declaration *decl)
@@ -285,7 +294,7 @@ static void emit_declaration(
 		for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) {
 			unsigned chan;
 			for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-				 ctx->soa.addr[idx][chan] = lp_build_alloca(
+				 ctx->soa.addr[idx][chan] = si_build_alloca_undef(
 					&ctx->gallivm,
 					ctx->soa.bld_base.uint_bld.elem_type, "");
 			}
@@ -315,8 +324,9 @@ static void emit_declaration(
 		for (idx = first; idx <= last; idx++) {
 			for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
 				ctx->temps[idx * TGSI_NUM_CHANNELS + i] =
-					lp_build_alloca(bld_base->base.gallivm, bld_base->base.vec_type,
-						"temp");
+					si_build_alloca_undef(bld_base->base.gallivm,
+							      bld_base->base.vec_type,
+							      "temp");
 			}
 		}
 		break;
@@ -347,7 +357,8 @@ static void emit_declaration(
 			unsigned chan;
 			assert(idx < RADEON_LLVM_MAX_OUTPUTS);
 			for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-				ctx->soa.outputs[idx][chan] = lp_build_alloca(&ctx->gallivm,
+				ctx->soa.outputs[idx][chan] = si_build_alloca_undef(
+					&ctx->gallivm,
 					ctx->soa.bld_base.base.elem_type, "");
 			}
 		}
@@ -908,7 +919,21 @@ static void emit_ucmp(
 		LLVMBuildSelect(builder, v, emit_data->args[1], emit_data->args[2], "");
 }
 
-static void emit_cmp(
+static void emit_cmp(const struct lp_build_tgsi_action *action,
+		     struct lp_build_tgsi_context *bld_base,
+		     struct lp_build_emit_data *emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	LLVMValueRef cond, *args = emit_data->args;
+
+	cond = LLVMBuildFCmp(builder, LLVMRealOLT, args[0],
+			     bld_base->base.zero, "");
+
+	emit_data->output[emit_data->chan] =
+		LLVMBuildSelect(builder, cond, args[1], args[2], "");
+}
+
+static void emit_set_cond(
 		const struct lp_build_tgsi_action *action,
 		struct lp_build_tgsi_context * bld_base,
 		struct lp_build_emit_data * emit_data)
@@ -1382,6 +1407,51 @@ static void emit_imsb(const struct lp_build_tgsi_action * action,
 		LLVMBuildSelect(builder, cond, all_ones, msb, "");
 }
 
+static void emit_iabs(const struct lp_build_tgsi_action *action,
+		      struct lp_build_tgsi_context *bld_base,
+		      struct lp_build_emit_data *emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+
+	emit_data->output[emit_data->chan] =
+		lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_IMAX,
+					  emit_data->args[0],
+					  LLVMBuildNeg(builder,
+						       emit_data->args[0], ""));
+}
+
+static void emit_minmax_int(const struct lp_build_tgsi_action *action,
+			    struct lp_build_tgsi_context *bld_base,
+			    struct lp_build_emit_data *emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	LLVMIntPredicate op;
+
+	switch (emit_data->info->opcode) {
+	default:
+		assert(0);
+	case TGSI_OPCODE_IMAX:
+		op = LLVMIntSGT;
+		break;
+	case TGSI_OPCODE_IMIN:
+		op = LLVMIntSLT;
+		break;
+	case TGSI_OPCODE_UMAX:
+		op = LLVMIntUGT;
+		break;
+	case TGSI_OPCODE_UMIN:
+		op = LLVMIntULT;
+		break;
+	}
+
+	emit_data->output[emit_data->chan] =
+		LLVMBuildSelect(builder,
+				LLVMBuildICmp(builder, op, emit_data->args[0],
+					      emit_data->args[1], ""),
+				emit_data->args[0],
+				emit_data->args[1], "");
+}
+
 void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 {
 	struct lp_type type;
@@ -1447,8 +1517,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_CEIL].intr_name = "llvm.ceil.f32";
 	bld_base->op_actions[TGSI_OPCODE_CLAMP].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_CLAMP].intr_name = "llvm.AMDIL.clamp.";
-	bld_base->op_actions[TGSI_OPCODE_CMP].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_CMP].intr_name = "llvm.AMDGPU.cndlt";
+	bld_base->op_actions[TGSI_OPCODE_CMP].emit = emit_cmp;
 	bld_base->op_actions[TGSI_OPCODE_CONT].emit = cont_emit;
 	bld_base->op_actions[TGSI_OPCODE_COS].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_COS].intr_name = "llvm.cos.f32";
@@ -1470,7 +1539,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_ENDIF].emit = endif_emit;
 	bld_base->op_actions[TGSI_OPCODE_ENDLOOP].emit = endloop_emit;
 	bld_base->op_actions[TGSI_OPCODE_EX2].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_EX2].intr_name = "llvm.AMDIL.exp.";
+	bld_base->op_actions[TGSI_OPCODE_EX2].intr_name = "llvm.exp2.f32";
 	bld_base->op_actions[TGSI_OPCODE_FLR].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_FLR].intr_name = "llvm.floor.f32";
 	bld_base->op_actions[TGSI_OPCODE_FMA].emit = build_tgsi_intrinsic_nomem;
@@ -1482,17 +1551,14 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_FSGE].emit = emit_fcmp;
 	bld_base->op_actions[TGSI_OPCODE_FSLT].emit = emit_fcmp;
 	bld_base->op_actions[TGSI_OPCODE_FSNE].emit = emit_fcmp;
-	bld_base->op_actions[TGSI_OPCODE_IABS].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_IABS].intr_name = "llvm.AMDIL.abs.";
+	bld_base->op_actions[TGSI_OPCODE_IABS].emit = emit_iabs;
 	bld_base->op_actions[TGSI_OPCODE_IBFE].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_IBFE].intr_name = "llvm.AMDGPU.bfe.i32";
 	bld_base->op_actions[TGSI_OPCODE_IDIV].emit = emit_idiv;
 	bld_base->op_actions[TGSI_OPCODE_IF].emit = if_emit;
 	bld_base->op_actions[TGSI_OPCODE_UIF].emit = uif_emit;
-	bld_base->op_actions[TGSI_OPCODE_IMAX].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_IMAX].intr_name = "llvm.AMDGPU.imax";
-	bld_base->op_actions[TGSI_OPCODE_IMIN].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_IMIN].intr_name = "llvm.AMDGPU.imin";
+	bld_base->op_actions[TGSI_OPCODE_IMAX].emit = emit_minmax_int;
+	bld_base->op_actions[TGSI_OPCODE_IMIN].emit = emit_minmax_int;
 	bld_base->op_actions[TGSI_OPCODE_IMSB].emit = emit_imsb;
 	bld_base->op_actions[TGSI_OPCODE_INEG].emit = emit_ineg;
 	bld_base->op_actions[TGSI_OPCODE_ISHR].emit = emit_ishr;
@@ -1508,8 +1574,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_LSB].emit = emit_lsb;
 	bld_base->op_actions[TGSI_OPCODE_LG2].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_LG2].intr_name = "llvm.log2.f32";
-	bld_base->op_actions[TGSI_OPCODE_LRP].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_LRP].intr_name = "llvm.AMDGPU.lrp";
 	bld_base->op_actions[TGSI_OPCODE_MOD].emit = emit_mod;
 	bld_base->op_actions[TGSI_OPCODE_UMSB].emit = emit_umsb;
 	bld_base->op_actions[TGSI_OPCODE_NOT].emit = emit_not;
@@ -1519,31 +1583,29 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_POW].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_POW].intr_name = "llvm.pow.f32";
 	bld_base->op_actions[TGSI_OPCODE_ROUND].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_ROUND].intr_name = "llvm.AMDIL.round.nearest.";
+	bld_base->op_actions[TGSI_OPCODE_ROUND].intr_name = "llvm.rint.f32";
 	bld_base->op_actions[TGSI_OPCODE_RSQ].intr_name = "llvm.AMDGPU.rsq.clamped.f32";
 	bld_base->op_actions[TGSI_OPCODE_RSQ].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_SGE].emit = emit_cmp;
-	bld_base->op_actions[TGSI_OPCODE_SEQ].emit = emit_cmp;
+	bld_base->op_actions[TGSI_OPCODE_SGE].emit = emit_set_cond;
+	bld_base->op_actions[TGSI_OPCODE_SEQ].emit = emit_set_cond;
 	bld_base->op_actions[TGSI_OPCODE_SHL].emit = emit_shl;
-	bld_base->op_actions[TGSI_OPCODE_SLE].emit = emit_cmp;
-	bld_base->op_actions[TGSI_OPCODE_SLT].emit = emit_cmp;
-	bld_base->op_actions[TGSI_OPCODE_SNE].emit = emit_cmp;
-	bld_base->op_actions[TGSI_OPCODE_SGT].emit = emit_cmp;
+	bld_base->op_actions[TGSI_OPCODE_SLE].emit = emit_set_cond;
+	bld_base->op_actions[TGSI_OPCODE_SLT].emit = emit_set_cond;
+	bld_base->op_actions[TGSI_OPCODE_SNE].emit = emit_set_cond;
+	bld_base->op_actions[TGSI_OPCODE_SGT].emit = emit_set_cond;
 	bld_base->op_actions[TGSI_OPCODE_SIN].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_SIN].intr_name = "llvm.sin.f32";
 	bld_base->op_actions[TGSI_OPCODE_SQRT].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_SQRT].intr_name = "llvm.sqrt.f32";
 	bld_base->op_actions[TGSI_OPCODE_SSG].emit = emit_ssg;
 	bld_base->op_actions[TGSI_OPCODE_TRUNC].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_TRUNC].intr_name = "llvm.AMDGPU.trunc";
+	bld_base->op_actions[TGSI_OPCODE_TRUNC].intr_name = "llvm.trunc.f32";
 	bld_base->op_actions[TGSI_OPCODE_UADD].emit = emit_uadd;
 	bld_base->op_actions[TGSI_OPCODE_UBFE].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_UBFE].intr_name = "llvm.AMDGPU.bfe.u32";
 	bld_base->op_actions[TGSI_OPCODE_UDIV].emit = emit_udiv;
-	bld_base->op_actions[TGSI_OPCODE_UMAX].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_UMAX].intr_name = "llvm.AMDGPU.umax";
-	bld_base->op_actions[TGSI_OPCODE_UMIN].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_UMIN].intr_name = "llvm.AMDGPU.umin";
+	bld_base->op_actions[TGSI_OPCODE_UMAX].emit = emit_minmax_int;
+	bld_base->op_actions[TGSI_OPCODE_UMIN].emit = emit_minmax_int;
 	bld_base->op_actions[TGSI_OPCODE_UMOD].emit = emit_umod;
 	bld_base->op_actions[TGSI_OPCODE_USEQ].emit = emit_icmp;
 	bld_base->op_actions[TGSI_OPCODE_USGE].emit = emit_icmp;
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index c6605346771..697e60a50d9 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -469,7 +469,7 @@ static void si_delete_compute_state(struct pipe_context *ctx, void* state){
 	if (program->kernels) {
 		for (int i = 0; i < program->num_kernels; i++){
 			if (program->kernels[i].bo){
-				si_shader_destroy(ctx, &program->kernels[i]);
+				si_shader_destroy(&program->kernels[i]);
 			}
 		}
 		FREE(program->kernels);
@@ -482,7 +482,7 @@ static void si_delete_compute_state(struct pipe_context *ctx, void* state){
 	FREE(program->shader.binary.config);
 	FREE(program->shader.binary.rodata);
 	FREE(program->shader.binary.global_symbol_offsets);
-	si_shader_destroy(ctx, &program->shader);
+	si_shader_destroy(&program->shader);
 #endif
 
 	pipe_resource_reference(
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index a0283b7c966..53c80dba602 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -271,6 +271,8 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_START_INSTANCE:
 	case PIPE_CAP_NPOT_TEXTURES:
 	case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES:
+	case PIPE_CAP_VERTEX_COLOR_CLAMPED:
+	case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
         case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
 	case PIPE_CAP_TGSI_INSTANCEID:
 	case PIPE_CAP_COMPUTE:
@@ -330,8 +332,6 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	/* Unsupported features. */
 	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
 	case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
-	case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
-	case PIPE_CAP_VERTEX_COLOR_CLAMPED:
 	case PIPE_CAP_USER_VERTEX_BUFFERS:
 	case PIPE_CAP_FAKE_SW_MSAA:
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 32a702fcdf5..a119cbdc16c 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1306,6 +1306,23 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 	unsigned compressed = 0;
 	unsigned chan;
 
+	/* XXX: This controls which components of the output
+	 * registers actually get exported. (e.g. bit 0 means export
+	 * X component, bit 1 means export Y component, etc.)  I'm
+	 * hard coding this to 0xf for now.  In the future, we might
+	 * want to do something else.
+	 */
+	args[0] = lp_build_const_int32(base->gallivm, 0xf);
+
+	/* Specify whether the EXEC mask represents the valid mask */
+	args[1] = uint->zero;
+
+	/* Specify whether this is the last export */
+	args[2] = uint->zero;
+
+	/* Specify the target we are exporting */
+	args[3] = lp_build_const_int32(base->gallivm, target);
+
 	if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT) {
 		int cbuf = target - V_008DFC_SQ_EXP_MRT;
 
@@ -1323,55 +1340,31 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 		}
 	}
 
+	/* Set COMPR flag */
+	args[4] = compressed ? uint->one : uint->zero;
+
 	if (compressed) {
 		/* Pixel shader needs to pack output values before export */
-		for (chan = 0; chan < 2; chan++ ) {
-			args[0] = values[2 * chan];
-			args[1] = values[2 * chan + 1];
-			args[chan + 5] =
-				lp_build_intrinsic(base->gallivm->builder,
-						"llvm.SI.packf16",
-						LLVMInt32TypeInContext(base->gallivm->context),
-						args, 2,
-						LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
+		for (chan = 0; chan < 2; chan++) {
+			LLVMValueRef pack_args[2] = {
+				values[2 * chan],
+				values[2 * chan + 1]
+			};
+			LLVMValueRef packed;
+
+			packed = lp_build_intrinsic(base->gallivm->builder,
+						    "llvm.SI.packf16",
+						    LLVMInt32TypeInContext(base->gallivm->context),
+						    pack_args, 2,
+						    LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 			args[chan + 7] = args[chan + 5] =
 				LLVMBuildBitCast(base->gallivm->builder,
-						 args[chan + 5],
+						 packed,
 						 LLVMFloatTypeInContext(base->gallivm->context),
 						 "");
 		}
-
-		/* Set COMPR flag */
-		args[4] = uint->one;
-	} else {
-		for (chan = 0; chan < 4; chan++ )
-			/* +5 because the first output value will be
-			 * the 6th argument to the intrinsic. */
-			args[chan + 5] = values[chan];
-
-		/* Clear COMPR flag */
-		args[4] = uint->zero;
-	}
-
-	/* XXX: This controls which components of the output
-	 * registers actually get exported. (e.g bit 0 means export
-	 * X component, bit 1 means export Y component, etc.)  I'm
-	 * hard coding this to 0xf for now.  In the future, we might
-	 * want to do something else. */
-	args[0] = lp_build_const_int32(base->gallivm, 0xf);
-
-	/* Specify whether the EXEC mask represents the valid mask */
-	args[1] = uint->zero;
-
-	/* Specify whether this is the last export */
-	args[2] = uint->zero;
-
-	/* Specify the target we are exporting */
-	args[3] = lp_build_const_int32(base->gallivm, target);
-
-	/* XXX: We probably need to keep track of the output
-	 * values, so we know what we are passing to the next
-	 * stage. */
+	} else
+		memcpy(&args[5], values, sizeof(values[0]) * 4);
 }
 
 /* Load from output pointers and initialize arguments for the shader export intrinsic */
@@ -2083,6 +2076,45 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
 
 	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
 
+	/* Vertex color clamping.
+	 *
+	 * This uses a state constant loaded in a user data SGPR and
+	 * an IF statement is added that clamps all colors if the constant
+	 * is true.
+	 */
+	if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX &&
+	    !si_shader_ctx->shader->is_gs_copy_shader) {
+		struct lp_build_if_state if_ctx;
+		LLVMValueRef cond = NULL;
+		LLVMValueRef addr, val;
+
+		for (i = 0; i < info->num_outputs; i++) {
+			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
+			    info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
+				continue;
+
+			/* We've found a color. */
+			if (!cond) {
+				/* The state is in the first bit of the user SGPR. */
+				cond = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+						    SI_PARAM_VS_STATE_BITS);
+				cond = LLVMBuildTrunc(gallivm->builder, cond,
+						      LLVMInt1TypeInContext(gallivm->context), "");
+				lp_build_if(&if_ctx, gallivm, cond);
+			}
+
+			for (j = 0; j < 4; j++) {
+				addr = si_shader_ctx->radeon_bld.soa.outputs[i][j];
+				val = LLVMBuildLoad(gallivm->builder, addr, "");
+				val = radeon_llvm_saturate(bld_base, val);
+				LLVMBuildStore(gallivm->builder, val, addr);
+			}
+		}
+
+		if (cond)
+			lp_build_endif(&if_ctx);
+	}
+
 	for (i = 0; i < info->num_outputs; i++) {
 		outputs[i].name = info->output_semantic_name[i];
 		outputs[i].sid = info->output_semantic_index[i];
@@ -2117,6 +2149,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 	struct lp_build_context * base = &bld_base->base;
 	struct lp_build_context * uint = &bld_base->uint_bld;
 	struct tgsi_shader_info *info = &shader->selector->info;
+	LLVMBuilderRef builder = base->gallivm->builder;
 	LLVMValueRef args[9];
 	LLVMValueRef last_args[9] = { 0 };
 	int depth_index = -1, stencil_index = -1, samplemask_index = -1;
@@ -2143,6 +2176,16 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 			target = V_008DFC_SQ_EXP_MRT + semantic_index;
 			alpha_ptr = si_shader_ctx->radeon_bld.soa.outputs[i][3];
 
+			if (si_shader_ctx->shader->key.ps.clamp_color) {
+				for (int j = 0; j < 4; j++) {
+					LLVMValueRef ptr = si_shader_ctx->radeon_bld.soa.outputs[i][j];
+					LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
+
+					result = radeon_llvm_saturate(bld_base, result);
+					LLVMBuildStore(builder, result, ptr);
+				}
+			}
+
 			if (si_shader_ctx->shader->key.ps.alpha_to_one)
 				LLVMBuildStore(base->gallivm->builder,
 					       base->one, alpha_ptr);
@@ -2153,6 +2196,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 
 			if (si_shader_ctx->shader->key.ps.poly_line_smoothing)
 				si_scale_alpha_by_sample_mask(bld_base, alpha_ptr);
+
 			break;
 		default:
 			target = 0;
@@ -3440,6 +3484,9 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 			if (shader->is_gs_copy_shader) {
 				last_array_pointer = SI_PARAM_CONST;
 				num_params = SI_PARAM_CONST+1;
+			} else {
+				params[SI_PARAM_VS_STATE_BITS] = i32;
+				num_params = SI_PARAM_VS_STATE_BITS+1;
 			}
 
 			/* The locations of the other parameters are assigned dynamically. */
@@ -3982,6 +4029,7 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 				key->vs.es_enabled_outputs);
 		fprintf(f, "  as_es = %u\n", key->vs.as_es);
 		fprintf(f, "  as_ls = %u\n", key->vs.as_ls);
+		fprintf(f, "  export_prim_id = %u\n", key->vs.export_prim_id);
 		break;
 
 	case PIPE_SHADER_TESS_CTRL:
@@ -3993,6 +4041,7 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 			fprintf(f, "  es_enabled_outputs = 0x%"PRIx64"\n",
 				key->tes.es_enabled_outputs);
 		fprintf(f, "  as_es = %u\n", key->tes.as_es);
+		fprintf(f, "  export_prim_id = %u\n", key->tes.export_prim_id);
 		break;
 
 	case PIPE_SHADER_GEOMETRY:
@@ -4005,6 +4054,7 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 		fprintf(f, "  alpha_func = %u\n", key->ps.alpha_func);
 		fprintf(f, "  alpha_to_one = %u\n", key->ps.alpha_to_one);
 		fprintf(f, "  poly_stipple = %u\n", key->ps.poly_stipple);
+		fprintf(f, "  clamp_color = %u\n", key->ps.clamp_color);
 		break;
 
 	default:
@@ -4196,10 +4246,12 @@ out:
 	return r;
 }
 
-void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader)
+void si_shader_destroy(struct si_shader *shader)
 {
-	if (shader->gs_copy_shader)
-		si_shader_destroy(ctx, shader->gs_copy_shader);
+	if (shader->gs_copy_shader) {
+		si_shader_destroy(shader->gs_copy_shader);
+		FREE(shader->gs_copy_shader);
+	}
 
 	if (shader->scratch_bo)
 		r600_resource_reference(&shader->scratch_bo, NULL);
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index b92fa02a171..54dad726d01 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -83,6 +83,7 @@ struct radeon_shader_reloc;
 #define SI_SGPR_VERTEX_BUFFER	8  /* VS only */
 #define SI_SGPR_BASE_VERTEX	10 /* VS only */
 #define SI_SGPR_START_INSTANCE	11 /* VS only */
+#define SI_SGPR_VS_STATE_BITS	12 /* VS(VS) only */
 #define SI_SGPR_LS_OUT_LAYOUT	12 /* VS(LS) only */
 #define SI_SGPR_TCS_OUT_OFFSETS	8  /* TCS & TES only */
 #define SI_SGPR_TCS_OUT_LAYOUT	9  /* TCS & TES only */
@@ -90,8 +91,9 @@ struct radeon_shader_reloc;
 #define SI_SGPR_ALPHA_REF	8  /* PS only */
 #define SI_SGPR_PS_STATE_BITS	9  /* PS only */
 
-#define SI_VS_NUM_USER_SGPR	12
-#define SI_LS_NUM_USER_SGPR	13
+#define SI_VS_NUM_USER_SGPR	13 /* API VS */
+#define SI_ES_NUM_USER_SGPR	12 /* API VS */
+#define SI_LS_NUM_USER_SGPR	13 /* API VS */
 #define SI_TCS_NUM_USER_SGPR	11
 #define SI_TES_NUM_USER_SGPR	10
 #define SI_GS_NUM_USER_SGPR	8
@@ -108,6 +110,8 @@ struct radeon_shader_reloc;
 #define SI_PARAM_VERTEX_BUFFER	4
 #define SI_PARAM_BASE_VERTEX	5
 #define SI_PARAM_START_INSTANCE	6
+/* [0] = clamp vertex color */
+#define SI_PARAM_VS_STATE_BITS	7
 /* the other VS parameters are assigned dynamically */
 
 /* Offsets where TCS outputs and TCS patch outputs live in LDS:
@@ -227,6 +231,7 @@ union si_shader_key {
 		unsigned	alpha_to_one:1;
 		unsigned	poly_stipple:1;
 		unsigned	poly_line_smoothing:1;
+		unsigned	clamp_color:1;
 	} ps;
 	struct {
 		unsigned	instance_divisors[SI_NUM_VERTEX_BUFFERS];
@@ -324,7 +329,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f);
 int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 		    LLVMTargetMachineRef tm, LLVMModuleRef mod);
-void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader);
+void si_shader_destroy(struct si_shader *shader);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
 int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader);
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 00d4bc1fbc2..e6475364f98 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -694,7 +694,7 @@ static void *si_create_rs_state(struct pipe_context *ctx,
 	rs->poly_smooth = state->poly_smooth;
 	rs->uses_poly_offset = state->offset_point || state->offset_line ||
 			       state->offset_tri;
-
+	rs->clamp_fragment_color = state->clamp_fragment_color;
 	rs->flatshade = state->flatshade;
 	rs->sprite_coord_enable = state->sprite_coord_enable;
 	rs->pa_sc_line_stipple = state->line_stipple_enable ?
@@ -760,6 +760,8 @@ static void *si_create_rs_state(struct pipe_context *ctx,
 				   state->fill_back != PIPE_POLYGON_MODE_FILL) |
 		S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) |
 		S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)));
+	si_pm4_set_reg(pm4, R_00B130_SPI_SHADER_USER_DATA_VS_0 +
+		       SI_SGPR_VS_STATE_BITS * 4, state->clamp_vertex_color);
 
 	/* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */
 	for (i = 0; i < 3; i++) {
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 6a567688ee4..fba6619d2fd 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -60,6 +60,7 @@ struct si_state_rasterizer {
 	bool			line_smooth;
 	bool			poly_smooth;
 	bool			uses_poly_offset;
+	bool			clamp_fragment_color;
 };
 
 struct si_dsa_stencil_ref_part {
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index f673388b121..c98509bb0b9 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -122,7 +122,8 @@ static void si_shader_ls(struct si_shader *shader)
 
 	shader->ls_rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) |
 			   S_00B528_SGPRS((num_sgprs - 1) / 8) |
-		           S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt);
+		           S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt) |
+			   S_00B528_DX10_CLAMP(shader->dx10_clamp_mode);
 	shader->ls_rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
 			   S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0);
 }
@@ -154,7 +155,8 @@ static void si_shader_hs(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 40);
 	si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
 		       S_00B428_VGPRS((shader->num_vgprs - 1) / 4) |
-		       S_00B428_SGPRS((num_sgprs - 1) / 8));
+		       S_00B428_SGPRS((num_sgprs - 1) / 8) |
+		       S_00B428_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
 		       S_00B42C_USER_SGPR(num_user_sgprs) |
 		       S_00B42C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
@@ -177,7 +179,7 @@ static void si_shader_es(struct si_shader *shader)
 
 	if (shader->selector->type == PIPE_SHADER_VERTEX) {
 		vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0;
-		num_user_sgprs = SI_VS_NUM_USER_SGPR;
+		num_user_sgprs = SI_ES_NUM_USER_SGPR;
 	} else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
 		vgpr_comp_cnt = 3; /* all components are needed for TES */
 		num_user_sgprs = SI_TES_NUM_USER_SGPR;
@@ -570,6 +572,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 			key->ps.poly_line_smoothing = ((is_poly && rs->poly_smooth) ||
 						       (is_line && rs->line_smooth)) &&
 						      sctx->framebuffer.nr_samples <= 1;
+			key->ps.clamp_color = rs->clamp_fragment_color;
 		}
 
 		key->ps.alpha_func = PIPE_FUNC_ALWAYS;
@@ -645,9 +648,8 @@ static int si_shader_select(struct pipe_context *ctx,
 	return 0;
 }
 
-static void *si_create_shader_state(struct pipe_context *ctx,
-				    const struct pipe_shader_state *state,
-				    unsigned pipe_shader_type)
+static void *si_create_shader_selector(struct pipe_context *ctx,
+				       const struct pipe_shader_state *state)
 {
 	struct si_screen *sscreen = (struct si_screen *)ctx->screen;
 	struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector);
@@ -656,7 +658,6 @@ static void *si_create_shader_state(struct pipe_context *ctx,
 	if (!sel)
 		return NULL;
 
-	sel->type = pipe_shader_type;
 	sel->tokens = tgsi_dup_tokens(state->tokens);
 	if (!sel->tokens) {
 		FREE(sel);
@@ -665,6 +666,7 @@ static void *si_create_shader_state(struct pipe_context *ctx,
 
 	sel->so = state->stream_output;
 	tgsi_scan_shader(state->tokens, &sel->info);
+	sel->type = util_pipe_shader_from_tgsi_processor(sel->info.processor);
 	p_atomic_inc(&sscreen->b.num_shaders_created);
 
 	/* First set which opcode uses which (i,j) pair. */
@@ -695,7 +697,7 @@ static void *si_create_shader_state(struct pipe_context *ctx,
 		sel->info.uses_linear_centroid +
 		sel->info.uses_linear_sample >= 2;
 
-	switch (pipe_shader_type) {
+	switch (sel->type) {
 	case PIPE_SHADER_GEOMETRY:
 		sel->gs_output_prim =
 			sel->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM];
@@ -761,36 +763,6 @@ static void *si_create_shader_state(struct pipe_context *ctx,
 	return sel;
 }
 
-static void *si_create_fs_state(struct pipe_context *ctx,
-				const struct pipe_shader_state *state)
-{
-	return si_create_shader_state(ctx, state, PIPE_SHADER_FRAGMENT);
-}
-
-static void *si_create_gs_state(struct pipe_context *ctx,
-				const struct pipe_shader_state *state)
-{
-	return si_create_shader_state(ctx, state, PIPE_SHADER_GEOMETRY);
-}
-
-static void *si_create_vs_state(struct pipe_context *ctx,
-				const struct pipe_shader_state *state)
-{
-	return si_create_shader_state(ctx, state, PIPE_SHADER_VERTEX);
-}
-
-static void *si_create_tcs_state(struct pipe_context *ctx,
-				 const struct pipe_shader_state *state)
-{
-	return si_create_shader_state(ctx, state, PIPE_SHADER_TESS_CTRL);
-}
-
-static void *si_create_tes_state(struct pipe_context *ctx,
-				 const struct pipe_shader_state *state)
-{
-	return si_create_shader_state(ctx, state, PIPE_SHADER_TESS_EVAL);
-}
-
 /**
  * Normally, we only emit 1 viewport and 1 scissor if no shader is using
  * the VIEWPORT_INDEX output, and emitting the other viewports and scissors
@@ -905,11 +877,21 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 	si_mark_atom_dirty(sctx, &sctx->cb_target_mask);
 }
 
-static void si_delete_shader_selector(struct pipe_context *ctx,
-				      struct si_shader_selector *sel)
+static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_shader_selector *sel = (struct si_shader_selector *)state;
 	struct si_shader *p = sel->current, *c;
+	struct si_shader_selector **current_shader[SI_NUM_SHADERS] = {
+		[PIPE_SHADER_VERTEX] = &sctx->vs_shader,
+		[PIPE_SHADER_TESS_CTRL] = &sctx->tcs_shader,
+		[PIPE_SHADER_TESS_EVAL] = &sctx->tes_shader,
+		[PIPE_SHADER_GEOMETRY] = &sctx->gs_shader,
+		[PIPE_SHADER_FRAGMENT] = &sctx->ps_shader,
+	};
+
+	if (*current_shader[sel->type] == sel)
+		*current_shader[sel->type] = NULL;
 
 	while (p) {
 		c = p->next_variant;
@@ -940,7 +922,7 @@ static void si_delete_shader_selector(struct pipe_context *ctx,
 			break;
 		}
 
-		si_shader_destroy(ctx, p);
+		si_shader_destroy(p);
 		free(p);
 		p = c;
 	}
@@ -949,66 +931,6 @@ static void si_delete_shader_selector(struct pipe_context *ctx,
 	free(sel);
 }
 
-static void si_delete_vs_shader(struct pipe_context *ctx, void *state)
-{
-	struct si_context *sctx = (struct si_context *)ctx;
-	struct si_shader_selector *sel = (struct si_shader_selector *)state;
-
-	if (sctx->vs_shader == sel) {
-		sctx->vs_shader = NULL;
-	}
-
-	si_delete_shader_selector(ctx, sel);
-}
-
-static void si_delete_gs_shader(struct pipe_context *ctx, void *state)
-{
-	struct si_context *sctx = (struct si_context *)ctx;
-	struct si_shader_selector *sel = (struct si_shader_selector *)state;
-
-	if (sctx->gs_shader == sel) {
-		sctx->gs_shader = NULL;
-	}
-
-	si_delete_shader_selector(ctx, sel);
-}
-
-static void si_delete_ps_shader(struct pipe_context *ctx, void *state)
-{
-	struct si_context *sctx = (struct si_context *)ctx;
-	struct si_shader_selector *sel = (struct si_shader_selector *)state;
-
-	if (sctx->ps_shader == sel) {
-		sctx->ps_shader = NULL;
-	}
-
-	si_delete_shader_selector(ctx, sel);
-}
-
-static void si_delete_tcs_shader(struct pipe_context *ctx, void *state)
-{
-	struct si_context *sctx = (struct si_context *)ctx;
-	struct si_shader_selector *sel = (struct si_shader_selector *)state;
-
-	if (sctx->tcs_shader == sel) {
-		sctx->tcs_shader = NULL;
-	}
-
-	si_delete_shader_selector(ctx, sel);
-}
-
-static void si_delete_tes_shader(struct pipe_context *ctx, void *state)
-{
-	struct si_context *sctx = (struct si_context *)ctx;
-	struct si_shader_selector *sel = (struct si_shader_selector *)state;
-
-	if (sctx->tes_shader == sel) {
-		sctx->tes_shader = NULL;
-	}
-
-	si_delete_shader_selector(ctx, sel);
-}
-
 static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom)
 {
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
@@ -1284,30 +1206,23 @@ static int si_update_scratch_buffer(struct si_context *sctx,
 
 static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx)
 {
-	if (!sctx->scratch_buffer)
-		return 0;
-
-	return sctx->scratch_buffer->b.b.width0;
+	return sctx->scratch_buffer ? sctx->scratch_buffer->b.b.width0 : 0;
 }
 
-static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_context *sctx,
-					struct si_shader_selector *sel)
+static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader_selector *sel)
 {
-	if (!sel)
-		return 0;
-
-	return sel->current->scratch_bytes_per_wave;
+	return sel ? sel->current->scratch_bytes_per_wave : 0;
 }
 
 static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)
 {
 	unsigned bytes = 0;
 
-	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->ps_shader));
-	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->gs_shader));
-	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->vs_shader));
-	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->tcs_shader));
-	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->tes_shader));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tcs_shader));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader));
 	return bytes;
 }
 
@@ -1322,7 +1237,6 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
 	int r;
 
 	if (scratch_needed_size > 0) {
-
 		if (scratch_needed_size > current_scratch_buffer_size) {
 			/* Create a bigger scratch buffer */
 			pipe_resource_reference(
@@ -1361,38 +1275,26 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
 			si_pm4_bind_state(sctx, hs, sctx->tcs_shader->current->pm4);
 
 		/* VS can be bound as LS, ES, or VS. */
-		if (sctx->tes_shader) {
-			r = si_update_scratch_buffer(sctx, sctx->vs_shader);
-			if (r < 0)
-				return false;
-			if (r == 1)
+		r = si_update_scratch_buffer(sctx, sctx->vs_shader);
+		if (r < 0)
+			return false;
+		if (r == 1) {
+			if (sctx->tes_shader)
 				si_pm4_bind_state(sctx, ls, sctx->vs_shader->current->pm4);
-		} else if (sctx->gs_shader) {
-			r = si_update_scratch_buffer(sctx, sctx->vs_shader);
-			if (r < 0)
-				return false;
-			if (r == 1)
+			else if (sctx->gs_shader)
 				si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4);
-		} else {
-			r = si_update_scratch_buffer(sctx, sctx->vs_shader);
-			if (r < 0)
-				return false;
-			if (r == 1)
+			else
 				si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
 		}
 
 		/* TES can be bound as ES or VS. */
-		if (sctx->gs_shader) {
-			r = si_update_scratch_buffer(sctx, sctx->tes_shader);
-			if (r < 0)
-				return false;
-			if (r == 1)
+		r = si_update_scratch_buffer(sctx, sctx->tes_shader);
+		if (r < 0)
+			return false;
+		if (r == 1) {
+			if (sctx->gs_shader)
 				si_pm4_bind_state(sctx, es, sctx->tes_shader->current->pm4);
-		} else {
-			r = si_update_scratch_buffer(sctx, sctx->tes_shader);
-			if (r < 0)
-				return false;
-			if (r == 1)
+			else
 				si_pm4_bind_state(sctx, vs, sctx->tes_shader->current->pm4);
 		}
 	}
@@ -1661,11 +1563,11 @@ void si_init_shader_functions(struct si_context *sctx)
 	si_init_atom(sctx, &sctx->spi_map, &sctx->atoms.s.spi_map, si_emit_spi_map);
 	si_init_atom(sctx, &sctx->spi_ps_input, &sctx->atoms.s.spi_ps_input, si_emit_spi_ps_input);
 
-	sctx->b.b.create_vs_state = si_create_vs_state;
-	sctx->b.b.create_tcs_state = si_create_tcs_state;
-	sctx->b.b.create_tes_state = si_create_tes_state;
-	sctx->b.b.create_gs_state = si_create_gs_state;
-	sctx->b.b.create_fs_state = si_create_fs_state;
+	sctx->b.b.create_vs_state = si_create_shader_selector;
+	sctx->b.b.create_tcs_state = si_create_shader_selector;
+	sctx->b.b.create_tes_state = si_create_shader_selector;
+	sctx->b.b.create_gs_state = si_create_shader_selector;
+	sctx->b.b.create_fs_state = si_create_shader_selector;
 
 	sctx->b.b.bind_vs_state = si_bind_vs_shader;
 	sctx->b.b.bind_tcs_state = si_bind_tcs_shader;
@@ -1673,9 +1575,9 @@ void si_init_shader_functions(struct si_context *sctx)
 	sctx->b.b.bind_gs_state = si_bind_gs_shader;
 	sctx->b.b.bind_fs_state = si_bind_ps_shader;
 
-	sctx->b.b.delete_vs_state = si_delete_vs_shader;
-	sctx->b.b.delete_tcs_state = si_delete_tcs_shader;
-	sctx->b.b.delete_tes_state = si_delete_tes_shader;
-	sctx->b.b.delete_gs_state = si_delete_gs_shader;
-	sctx->b.b.delete_fs_state = si_delete_ps_shader;
+	sctx->b.b.delete_vs_state = si_delete_shader_selector;
+	sctx->b.b.delete_tcs_state = si_delete_shader_selector;
+	sctx->b.b.delete_tes_state = si_delete_shader_selector;
+	sctx->b.b.delete_gs_state = si_delete_shader_selector;
+	sctx->b.b.delete_fs_state = si_delete_shader_selector;
 }
diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c
index 2bf795de22d..f8622b96f45 100644
--- a/src/gallium/drivers/svga/svga_context.c
+++ b/src/gallium/drivers/svga/svga_context.c
@@ -312,6 +312,8 @@ void svga_context_flush( struct svga_context *svga,
     */
    svga->swc->flush(svga->swc, &fence);
 
+   svga->hud.num_flushes++;
+
    svga_screen_cache_flush(svgascreen, fence);
 
    /* To force the re-emission of rendertargets and texture sampler bindings on
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index e8575f36c3b..bcce18a3502 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -44,10 +44,21 @@
 
 
 /** Non-GPU queries for gallium HUD */
-#define SVGA_QUERY_DRAW_CALLS   (PIPE_QUERY_DRIVER_SPECIFIC + 0)
-#define SVGA_QUERY_FALLBACKS    (PIPE_QUERY_DRIVER_SPECIFIC + 1)
-#define SVGA_QUERY_MEMORY_USED  (PIPE_QUERY_DRIVER_SPECIFIC + 2)
-#define SVGA_QUERY_MAX          (PIPE_QUERY_DRIVER_SPECIFIC + 3)
+/* per-frame counters */
+#define SVGA_QUERY_NUM_DRAW_CALLS          (PIPE_QUERY_DRIVER_SPECIFIC + 0)
+#define SVGA_QUERY_NUM_FALLBACKS           (PIPE_QUERY_DRIVER_SPECIFIC + 1)
+#define SVGA_QUERY_NUM_FLUSHES             (PIPE_QUERY_DRIVER_SPECIFIC + 2)
+#define SVGA_QUERY_NUM_VALIDATIONS         (PIPE_QUERY_DRIVER_SPECIFIC + 3)
+#define SVGA_QUERY_MAP_BUFFER_TIME         (PIPE_QUERY_DRIVER_SPECIFIC + 4)
+#define SVGA_QUERY_NUM_RESOURCES_MAPPED    (PIPE_QUERY_DRIVER_SPECIFIC + 5)
+/* running total counters */
+#define SVGA_QUERY_MEMORY_USED             (PIPE_QUERY_DRIVER_SPECIFIC + 6)
+#define SVGA_QUERY_NUM_SHADERS             (PIPE_QUERY_DRIVER_SPECIFIC + 7)
+#define SVGA_QUERY_NUM_RESOURCES           (PIPE_QUERY_DRIVER_SPECIFIC + 8)
+#define SVGA_QUERY_NUM_STATE_OBJECTS       (PIPE_QUERY_DRIVER_SPECIFIC + 9)
+#define SVGA_QUERY_NUM_SURFACE_VIEWS       (PIPE_QUERY_DRIVER_SPECIFIC + 10)
+/* SVGA_QUERY_MAX has to be last because it is the size of an array */
+#define SVGA_QUERY_MAX                     (PIPE_QUERY_DRIVER_SPECIFIC + 11)
 
 /**
  * Maximum supported number of constant buffers per shader
@@ -463,9 +474,18 @@ struct svga_context
    /** List of buffers with queued transfers */
    struct list_head dirty_buffers;
 
-   /** performance / info queries */
-   uint64_t num_draw_calls;  /**< SVGA_QUERY_DRAW_CALLS */
-   uint64_t num_fallbacks;   /**< SVGA_QUERY_FALLBACKS */
+   /** performance / info queries for HUD */
+   struct {
+      uint64_t num_draw_calls;       /**< SVGA_QUERY_NUM_DRAW_CALLS */
+      uint64_t num_fallbacks;        /**< SVGA_QUERY_NUM_FALLBACKS */
+      uint64_t num_flushes;          /**< SVGA_QUERY_NUM_FLUSHES */
+      uint64_t num_validations;      /**< SVGA_QUERY_NUM_VALIDATIONS */
+      uint64_t map_buffer_time;      /**< SVGA_QUERY_MAP_BUFFER_TIME */
+      uint64_t num_resources_mapped; /**< SVGA_QUERY_NUM_RESOURCES_MAPPED */
+      uint64_t num_shaders;          /**< SVGA_QUERY_NUM_SHADERS */
+      uint64_t num_state_objects;    /**< SVGA_QUERY_NUM_STATE_OBJECTS */
+      uint64_t num_surface_views;    /**< SVGA_QUERY_NUM_SURFACE_VIEWS */
+   } hud;
 
    /** The currently bound stream output targets */
    unsigned num_so_targets;
diff --git a/src/gallium/drivers/svga/svga_pipe_blend.c b/src/gallium/drivers/svga/svga_pipe_blend.c
index 06bb3e3bd7e..0c9d6129b53 100644
--- a/src/gallium/drivers/svga/svga_pipe_blend.c
+++ b/src/gallium/drivers/svga/svga_pipe_blend.c
@@ -321,6 +321,8 @@ svga_create_blend_state(struct pipe_context *pipe,
       define_blend_state_object(svga, blend);
    }
 
+   svga->hud.num_state_objects++;
+
    return blend;
 }
 
@@ -359,6 +361,7 @@ static void svga_delete_blend_state(struct pipe_context *pipe,
    }
 
    FREE(blend);
+   svga->hud.num_state_objects--;
 }
 
 static void svga_set_blend_color( struct pipe_context *pipe,
diff --git a/src/gallium/drivers/svga/svga_pipe_depthstencil.c b/src/gallium/drivers/svga/svga_pipe_depthstencil.c
index 5ea623be4d9..d84ed1df48e 100644
--- a/src/gallium/drivers/svga/svga_pipe_depthstencil.c
+++ b/src/gallium/drivers/svga/svga_pipe_depthstencil.c
@@ -202,6 +202,8 @@ svga_create_depth_stencil_state(struct pipe_context *pipe,
       define_depth_stencil_state_object(svga, ds);
    }
 
+   svga->hud.num_state_objects++;
+
    return ds;
 }
 
@@ -248,6 +250,7 @@ static void svga_delete_depth_stencil_state(struct pipe_context *pipe,
    }
 
    FREE(depth_stencil);
+   svga->hud.num_state_objects--;
 }
 
 
diff --git a/src/gallium/drivers/svga/svga_pipe_draw.c b/src/gallium/drivers/svga/svga_pipe_draw.c
index 303d4565cdb..50ebb53df90 100644
--- a/src/gallium/drivers/svga/svga_pipe_draw.c
+++ b/src/gallium/drivers/svga/svga_pipe_draw.c
@@ -177,7 +177,7 @@ svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    enum pipe_error ret = 0;
    boolean needed_swtnl;
 
-   svga->num_draw_calls++;  /* for SVGA_QUERY_DRAW_CALLS */
+   svga->hud.num_draw_calls++;  /* for SVGA_QUERY_NUM_DRAW_CALLS */
 
    if (u_reduced_prim(info->mode) == PIPE_PRIM_TRIANGLES &&
        svga->curr.rast->templ.cull_face == PIPE_FACE_FRONT_AND_BACK)
@@ -219,7 +219,7 @@ svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 #endif
 
    if (svga->state.sw.need_swtnl) {
-      svga->num_fallbacks++;  /* for SVGA_QUERY_FALLBACKS */
+      svga->hud.num_fallbacks++;  /* for SVGA_QUERY_NUM_FALLBACKS */
       if (!needed_swtnl) {
          /*
           * We're switching from HW to SW TNL.  SW TNL will require mapping all
diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c
index 7081e5a1c43..8b9818334ca 100644
--- a/src/gallium/drivers/svga/svga_pipe_query.c
+++ b/src/gallium/drivers/svga/svga_pipe_query.c
@@ -720,9 +720,17 @@ svga_create_query(struct pipe_context *pipe,
       define_query_vgpu10(svga, sq,
                           sizeof(SVGADXTimestampQueryResult));
       break;
-   case SVGA_QUERY_DRAW_CALLS:
-   case SVGA_QUERY_FALLBACKS:
+   case SVGA_QUERY_NUM_DRAW_CALLS:
+   case SVGA_QUERY_NUM_FALLBACKS:
+   case SVGA_QUERY_NUM_FLUSHES:
    case SVGA_QUERY_MEMORY_USED:
+   case SVGA_QUERY_NUM_SHADERS:
+   case SVGA_QUERY_NUM_RESOURCES:
+   case SVGA_QUERY_NUM_STATE_OBJECTS:
+   case SVGA_QUERY_NUM_VALIDATIONS:
+   case SVGA_QUERY_MAP_BUFFER_TIME:
+   case SVGA_QUERY_NUM_SURFACE_VIEWS:
+   case SVGA_QUERY_NUM_RESOURCES_MAPPED:
       break;
    default:
       assert(!"unexpected query type in svga_create_query()");
@@ -778,9 +786,17 @@ svga_destroy_query(struct pipe_context *pipe, struct pipe_query *q)
       destroy_query_vgpu10(svga, sq);
       sws->fence_reference(sws, &sq->fence, NULL);
       break;
-   case SVGA_QUERY_DRAW_CALLS:
-   case SVGA_QUERY_FALLBACKS:
+   case SVGA_QUERY_NUM_DRAW_CALLS:
+   case SVGA_QUERY_NUM_FALLBACKS:
+   case SVGA_QUERY_NUM_FLUSHES:
    case SVGA_QUERY_MEMORY_USED:
+   case SVGA_QUERY_NUM_SHADERS:
+   case SVGA_QUERY_NUM_RESOURCES:
+   case SVGA_QUERY_NUM_STATE_OBJECTS:
+   case SVGA_QUERY_NUM_VALIDATIONS:
+   case SVGA_QUERY_MAP_BUFFER_TIME:
+   case SVGA_QUERY_NUM_SURFACE_VIEWS:
+   case SVGA_QUERY_NUM_RESOURCES_MAPPED:
       /* nothing */
       break;
    default:
@@ -842,13 +858,29 @@ svga_begin_query(struct pipe_context *pipe, struct pipe_query *q)
       ret = begin_query_vgpu10(svga, sq);
       assert(ret == PIPE_OK);
       break;
-   case SVGA_QUERY_DRAW_CALLS:
-      sq->begin_count = svga->num_draw_calls;
+   case SVGA_QUERY_NUM_DRAW_CALLS:
+      sq->begin_count = svga->hud.num_draw_calls;
       break;
-   case SVGA_QUERY_FALLBACKS:
-      sq->begin_count = svga->num_fallbacks;
+   case SVGA_QUERY_NUM_FALLBACKS:
+      sq->begin_count = svga->hud.num_fallbacks;
+      break;
+   case SVGA_QUERY_NUM_FLUSHES:
+      sq->begin_count = svga->hud.num_flushes;
+      break;
+   case SVGA_QUERY_NUM_VALIDATIONS:
+      sq->begin_count = svga->hud.num_validations;
+      break;
+   case SVGA_QUERY_MAP_BUFFER_TIME:
+      sq->begin_count = svga->hud.map_buffer_time;
+      break;
+   case SVGA_QUERY_NUM_RESOURCES_MAPPED:
+      sq->begin_count = svga->hud.num_resources_mapped;
       break;
    case SVGA_QUERY_MEMORY_USED:
+   case SVGA_QUERY_NUM_SHADERS:
+   case SVGA_QUERY_NUM_RESOURCES:
+   case SVGA_QUERY_NUM_STATE_OBJECTS:
+   case SVGA_QUERY_NUM_SURFACE_VIEWS:
       /* nothing */
       break;
    default:
@@ -916,13 +948,29 @@ svga_end_query(struct pipe_context *pipe, struct pipe_query *q)
       ret = end_query_vgpu10(svga, sq);
       assert(ret == PIPE_OK);
       break;
-   case SVGA_QUERY_DRAW_CALLS:
-      sq->end_count = svga->num_draw_calls;
+   case SVGA_QUERY_NUM_DRAW_CALLS:
+      sq->end_count = svga->hud.num_draw_calls;
+      break;
+   case SVGA_QUERY_NUM_FALLBACKS:
+      sq->end_count = svga->hud.num_fallbacks;
+      break;
+   case SVGA_QUERY_NUM_FLUSHES:
+      sq->end_count = svga->hud.num_flushes;
       break;
-   case SVGA_QUERY_FALLBACKS:
-      sq->end_count = svga->num_fallbacks;
+   case SVGA_QUERY_NUM_VALIDATIONS:
+      sq->end_count = svga->hud.num_validations;
+      break;
+   case SVGA_QUERY_MAP_BUFFER_TIME:
+      sq->end_count = svga->hud.map_buffer_time;
+      break;
+   case SVGA_QUERY_NUM_RESOURCES_MAPPED:
+      sq->end_count = svga->hud.num_resources_mapped;
       break;
    case SVGA_QUERY_MEMORY_USED:
+   case SVGA_QUERY_NUM_SHADERS:
+   case SVGA_QUERY_NUM_RESOURCES:
+   case SVGA_QUERY_NUM_STATE_OBJECTS:
+   case SVGA_QUERY_NUM_SURFACE_VIEWS:
       /* nothing */
       break;
    default:
@@ -1007,13 +1055,30 @@ svga_get_query_result(struct pipe_context *pipe,
       *result = (uint64_t)sResult.numPrimitivesWritten;
       break;
    }
-   case SVGA_QUERY_DRAW_CALLS:
-      /* fall-through */
-   case SVGA_QUERY_FALLBACKS:
+   /* These are per-frame counters */
+   case SVGA_QUERY_NUM_DRAW_CALLS:
+   case SVGA_QUERY_NUM_FALLBACKS:
+   case SVGA_QUERY_NUM_FLUSHES:
+   case SVGA_QUERY_NUM_VALIDATIONS:
+   case SVGA_QUERY_NUM_RESOURCES_MAPPED:
+   case SVGA_QUERY_MAP_BUFFER_TIME:
       vresult->u64 = sq->end_count - sq->begin_count;
       break;
+   /* These are running total counters */
    case SVGA_QUERY_MEMORY_USED:
-      vresult->u64 = svgascreen->total_resource_bytes;
+      vresult->u64 = svgascreen->hud.total_resource_bytes;
+      break;
+   case SVGA_QUERY_NUM_SHADERS:
+      vresult->u64 = svga->hud.num_shaders;
+      break;
+   case SVGA_QUERY_NUM_RESOURCES:
+      vresult->u64 = svgascreen->hud.num_resources;
+      break;
+   case SVGA_QUERY_NUM_STATE_OBJECTS:
+      vresult->u64 = svga->hud.num_state_objects;
+      break;
+   case SVGA_QUERY_NUM_SURFACE_VIEWS:
+      vresult->u64 = svga->hud.num_surface_views;
       break;
    default:
       assert(!"unexpected query type in svga_get_query_result");
diff --git a/src/gallium/drivers/svga/svga_pipe_rasterizer.c b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
index a7aadac0111..6310b7a5e86 100644
--- a/src/gallium/drivers/svga/svga_pipe_rasterizer.c
+++ b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
@@ -352,6 +352,8 @@ svga_create_rasterizer_state(struct pipe_context *pipe,
       define_rasterizer_object(svga, rast);
    }
 
+   svga->hud.num_state_objects++;
+
    return rast;
 }
 
@@ -392,6 +394,7 @@ svga_delete_rasterizer_state(struct pipe_context *pipe, void *state)
    }
 
    FREE(state);
+   svga->hud.num_state_objects--;
 }
 
 
diff --git a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c
index 60e2d44ace4..95241176510 100644
--- a/src/gallium/drivers/svga/svga_pipe_sampler.c
+++ b/src/gallium/drivers/svga/svga_pipe_sampler.c
@@ -273,6 +273,8 @@ svga_create_sampler_state(struct pipe_context *pipe,
             cso->min_lod, cso->view_min_lod, cso->view_max_lod,
             cso->mipfilter == SVGA3D_TEX_FILTER_NONE ? "SVGA3D_TEX_FILTER_NONE" : "SOMETHING");
 
+   svga->hud.num_state_objects++;
+
    return cso;
 }
 
@@ -328,6 +330,7 @@ static void svga_delete_sampler_state(struct pipe_context *pipe,
    }
 
    FREE(sampler);
+   svga->hud.num_state_objects--;
 }
 
 
diff --git a/src/gallium/drivers/svga/svga_pipe_vertex.c b/src/gallium/drivers/svga/svga_pipe_vertex.c
index e0932a9dbc1..b932c568f53 100644
--- a/src/gallium/drivers/svga/svga_pipe_vertex.c
+++ b/src/gallium/drivers/svga/svga_pipe_vertex.c
@@ -274,6 +274,9 @@ svga_create_vertex_elements_state(struct pipe_context *pipe,
          translate_vertex_decls(svga, velems);
       }
    }
+
+   svga->hud.num_state_objects++;
+
    return velems;
 }
 
@@ -315,6 +318,7 @@ svga_delete_vertex_elements_state(struct pipe_context *pipe, void *state)
    }
 
    FREE(velems);
+   svga->hud.num_state_objects--;
 }
 
 void svga_cleanup_vertex_state( struct svga_context *svga )
diff --git a/src/gallium/drivers/svga/svga_resource_buffer.c b/src/gallium/drivers/svga/svga_resource_buffer.c
index 57e37fcfe14..71f2f4f2779 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer.c
@@ -29,6 +29,7 @@
 #include "pipe/p_defines.h"
 #include "util/u_inlines.h"
 #include "os/os_thread.h"
+#include "os/os_time.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_resource.h"
@@ -77,6 +78,7 @@ svga_buffer_transfer_map(struct pipe_context *pipe,
    struct svga_buffer *sbuf = svga_buffer(resource);
    struct pipe_transfer *transfer;
    uint8_t *map;
+   int64_t begin = os_time_get();
 
    transfer = CALLOC_STRUCT(pipe_transfer);
    if (transfer == NULL) {
@@ -244,6 +246,8 @@ svga_buffer_transfer_map(struct pipe_context *pipe,
       FREE(transfer);
    }
 
+   svga->hud.map_buffer_time += (os_time_get() - begin);
+
    return map;
 }
 
@@ -331,7 +335,10 @@ svga_buffer_destroy( struct pipe_screen *screen,
    if (sbuf->swbuf && !sbuf->user)
       align_free(sbuf->swbuf);
 
-   ss->total_resource_bytes -= sbuf->size;
+   ss->hud.total_resource_bytes -= sbuf->size;
+   assert(ss->hud.num_resources > 0);
+   if (ss->hud.num_resources > 0)
+      ss->hud.num_resources--;
 
    FREE(sbuf);
 }
@@ -409,7 +416,9 @@ svga_buffer_create(struct pipe_screen *screen,
                    (debug_reference_descriptor)debug_describe_resource, 0);
 
    sbuf->size = util_resource_size(&sbuf->b.b);
-   ss->total_resource_bytes += sbuf->size;
+   ss->hud.total_resource_bytes += sbuf->size;
+
+   ss->hud.num_resources++;
 
    return &sbuf->b.b;
 
@@ -427,6 +436,7 @@ svga_user_buffer_create(struct pipe_screen *screen,
 			unsigned bind)
 {
    struct svga_buffer *sbuf;
+   struct svga_screen *ss = svga_screen(screen);
 
    sbuf = CALLOC_STRUCT(svga_buffer);
    if (!sbuf)
@@ -450,6 +460,8 @@ svga_user_buffer_create(struct pipe_screen *screen,
    debug_reference(&sbuf->b.b.reference,
                    (debug_reference_descriptor)debug_describe_resource, 0);
 
+   ss->hud.num_resources++;
+
    return &sbuf->b.b;
 
 no_sbuf:
diff --git a/src/gallium/drivers/svga/svga_resource_buffer.h b/src/gallium/drivers/svga/svga_resource_buffer.h
index 75e12c3220c..0591f8960b9 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer.h
+++ b/src/gallium/drivers/svga/svga_resource_buffer.h
@@ -253,6 +253,9 @@ svga_buffer_hw_storage_map(struct svga_context *svga,
                            unsigned flags, boolean *retry)
 {
    struct svga_winsys_screen *sws = svga_buffer_winsys_screen(sbuf);
+
+   svga->hud.num_resources_mapped++;
+
    if (sws->have_gb_objects) {
       return svga->swc->surface_map(svga->swc, sbuf->handle, flags, retry);
    } else {
diff --git a/src/gallium/drivers/svga/svga_resource_texture.c b/src/gallium/drivers/svga/svga_resource_texture.c
index 90787be8073..a02d1e495ff 100644
--- a/src/gallium/drivers/svga/svga_resource_texture.c
+++ b/src/gallium/drivers/svga/svga_resource_texture.c
@@ -29,6 +29,7 @@
 #include "pipe/p_state.h"
 #include "pipe/p_defines.h"
 #include "os/os_thread.h"
+#include "os/os_time.h"
 #include "util/u_format.h"
 #include "util/u_inlines.h"
 #include "util/u_math.h"
@@ -229,11 +230,15 @@ svga_texture_destroy(struct pipe_screen *screen,
    SVGA_DBG(DEBUG_DMA, "unref sid %p (texture)\n", tex->handle);
    svga_screen_surface_destroy(ss, &tex->key, &tex->handle);
 
-   ss->total_resource_bytes -= tex->size;
+   ss->hud.total_resource_bytes -= tex->size;
 
    FREE(tex->defined);
    FREE(tex->rendered_to);
    FREE(tex);
+
+   assert(ss->hud.num_resources > 0);
+   if (ss->hud.num_resources > 0)
+      ss->hud.num_resources--;
 }
 
 
@@ -322,6 +327,8 @@ svga_texture_transfer_map(struct pipe_context *pipe,
    boolean use_direct_map = svga_have_gb_objects(svga) &&
       !svga_have_gb_dma(svga);
    unsigned d;
+   void *returnVal;
+   int64_t begin = os_time_get();
 
    /* We can't map texture storage directly unless we have GB objects */
    if (usage & PIPE_TRANSFER_MAP_DIRECTLY) {
@@ -464,10 +471,10 @@ svga_texture_transfer_map(struct pipe_context *pipe,
     * Begin mapping code
     */
    if (st->swbuf) {
-      return st->swbuf;
+      returnVal = st->swbuf;
    }
    else if (!st->use_direct_map) {
-      return sws->buffer_map(sws, st->hwbuf, usage);
+      returnVal = sws->buffer_map(sws, st->hwbuf, usage);
    }
    else {
       SVGA3dSize baseLevelSize;
@@ -518,9 +525,13 @@ svga_texture_transfer_map(struct pipe_context *pipe,
       offset += svga3dsurface_get_pixel_offset(tex->key.format,
                                                mip_width, mip_height,
                                                xoffset, yoffset, zoffset);
-
-      return (void *) (map + offset);
+      returnVal = (void *) (map + offset);
    }
+
+   svga->hud.map_buffer_time += (os_time_get() - begin);
+   svga->hud.num_resources_mapped++;
+
+   return returnVal;
 }
 
 
@@ -889,7 +900,8 @@ svga_texture_create(struct pipe_screen *screen,
                    (debug_reference_descriptor)debug_describe_resource, 0);
 
    tex->size = util_resource_size(template);
-   svgascreen->total_resource_bytes += tex->size;
+   svgascreen->hud.total_resource_bytes += tex->size;
+   svgascreen->hud.num_resources++;
 
    return &tex->b.b;
 }
@@ -901,6 +913,7 @@ svga_texture_from_handle(struct pipe_screen *screen,
 			 struct winsys_handle *whandle)
 {
    struct svga_winsys_screen *sws = svga_winsys_screen(screen);
+   struct svga_screen *ss = svga_screen(screen);
    struct svga_winsys_surface *srf;
    struct svga_texture *tex;
    enum SVGA3dSurfaceFormat format = 0;
@@ -970,5 +983,7 @@ svga_texture_from_handle(struct pipe_screen *screen,
    tex->rendered_to = CALLOC(1, sizeof(tex->rendered_to[0]));
    tex->imported = TRUE;
 
+   ss->hud.num_resources++;
+
    return &tex->b.b;
 }
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index e0a28788238..dab89814334 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -772,9 +772,22 @@ svga_get_driver_query_info(struct pipe_screen *screen,
                            struct pipe_driver_query_info *info)
 {
    static const struct pipe_driver_query_info queries[] = {
-      {"draw-calls", SVGA_QUERY_DRAW_CALLS, {0}},
-      {"fallbacks", SVGA_QUERY_FALLBACKS, {0}},
-      {"memory-used", SVGA_QUERY_MEMORY_USED, {0}, PIPE_DRIVER_QUERY_TYPE_BYTES}
+      /* per-frame counters */
+      {"num-draw-calls", SVGA_QUERY_NUM_DRAW_CALLS, {0}},
+      {"num-fallbacks", SVGA_QUERY_NUM_FALLBACKS, {0}},
+      {"num-flushes", SVGA_QUERY_NUM_FLUSHES, {0}},
+      {"num-validations", SVGA_QUERY_NUM_VALIDATIONS, {0}},
+      {"map-buffer-time", SVGA_QUERY_MAP_BUFFER_TIME, {0},
+       PIPE_DRIVER_QUERY_TYPE_MICROSECONDS},
+      {"num-resources-mapped", SVGA_QUERY_NUM_RESOURCES_MAPPED, {0}},
+
+      /* running total counters */
+      {"memory-used", SVGA_QUERY_MEMORY_USED, {0},
+       PIPE_DRIVER_QUERY_TYPE_BYTES},
+      {"num-shaders", SVGA_QUERY_NUM_SHADERS, {0}},
+      {"num-resources", SVGA_QUERY_NUM_RESOURCES, {0}},
+      {"num-state-objects", SVGA_QUERY_NUM_STATE_OBJECTS, {0}},
+      {"num-surface-views", SVGA_QUERY_NUM_SURFACE_VIEWS, {0}},
    };
 
    if (!info)
diff --git a/src/gallium/drivers/svga/svga_screen.h b/src/gallium/drivers/svga/svga_screen.h
index 5581d2e1ffd..98b56b2a6d1 100644
--- a/src/gallium/drivers/svga/svga_screen.h
+++ b/src/gallium/drivers/svga/svga_screen.h
@@ -80,8 +80,12 @@ struct svga_screen
 
    struct svga_host_surface_cache cache;
 
-   /** Memory used by all resources (buffers and surfaces) */
-   uint64_t total_resource_bytes;
+   /** HUD counters */
+   struct {
+      /** Memory used by all resources (buffers and surfaces) */
+      uint64_t total_resource_bytes;
+      uint64_t num_resources;
+   } hud;
 };
 
 #ifndef DEBUG
diff --git a/src/gallium/drivers/svga/svga_shader.c b/src/gallium/drivers/svga/svga_shader.c
index d46e7ebbc38..5c99e16d976 100644
--- a/src/gallium/drivers/svga/svga_shader.c
+++ b/src/gallium/drivers/svga/svga_shader.c
@@ -414,6 +414,14 @@ svga_set_shader(struct svga_context *svga,
 }
 
 
+struct svga_shader_variant *
+svga_new_shader_variant(struct svga_context *svga)
+{
+   struct svga_shader_variant *variant = CALLOC_STRUCT(svga_shader_variant);
+   svga->hud.num_shaders += (variant != NULL); /* count real variants only */
+   return variant;
+}
+
 enum pipe_error
 svga_destroy_shader_variant(struct svga_context *svga,
                             SVGA3dShaderType type,
@@ -455,6 +463,8 @@ svga_destroy_shader_variant(struct svga_context *svga,
    FREE((unsigned *)variant->tokens);
    FREE(variant);
 
+   svga->hud.num_shaders--;
+
    return ret;
 }
 
diff --git a/src/gallium/drivers/svga/svga_shader.h b/src/gallium/drivers/svga/svga_shader.h
index b0800c1ecad..efcac408626 100644
--- a/src/gallium/drivers/svga/svga_shader.h
+++ b/src/gallium/drivers/svga/svga_shader.h
@@ -273,6 +273,9 @@ svga_set_shader(struct svga_context *svga,
                 SVGA3dShaderType type,
                 struct svga_shader_variant *variant);
 
+struct svga_shader_variant *
+svga_new_shader_variant(struct svga_context *svga);
+
 enum pipe_error
 svga_destroy_shader_variant(struct svga_context *svga,
                             SVGA3dShaderType type,
diff --git a/src/gallium/drivers/svga/svga_state.c b/src/gallium/drivers/svga/svga_state.c
index 37d16dc9afe..722b369fd4b 100644
--- a/src/gallium/drivers/svga/svga_state.c
+++ b/src/gallium/drivers/svga/svga_state.c
@@ -225,6 +225,9 @@ svga_update_state(struct svga_context *svga, unsigned max_level)
       svga->state.dirty[i] |= svga->dirty;
 
    svga->dirty = 0;
+
+   svga->hud.num_validations++;
+
    return PIPE_OK;
 }
 
diff --git a/src/gallium/drivers/svga/svga_state_constants.c b/src/gallium/drivers/svga/svga_state_constants.c
index 75592d3bf8b..c93d2a5e565 100644
--- a/src/gallium/drivers/svga/svga_state_constants.c
+++ b/src/gallium/drivers/svga/svga_state_constants.c
@@ -718,7 +718,7 @@ emit_consts_vgpu10(struct svga_context *svga, unsigned shader)
             /* round down to mulitple of 16 (this may cause rendering problems
              * but should avoid a device error).
              */
-            size &= ~16;
+            size &= ~15;
          }
       }
 
diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c
index c244d5352d9..e392778c2fb 100644
--- a/src/gallium/drivers/svga/svga_state_fs.c
+++ b/src/gallium/drivers/svga/svga_state_fs.c
@@ -90,7 +90,8 @@ translate_fragment_program(struct svga_context *svga,
                                         PIPE_SHADER_FRAGMENT);
    }
    else {
-      return svga_tgsi_vgpu9_translate(&fs->base, key, PIPE_SHADER_FRAGMENT);
+      return svga_tgsi_vgpu9_translate(svga, &fs->base, key,
+                                       PIPE_SHADER_FRAGMENT);
    }
 }
 
diff --git a/src/gallium/drivers/svga/svga_state_gs.c b/src/gallium/drivers/svga/svga_state_gs.c
index 7f75410fb57..0b336baee86 100644
--- a/src/gallium/drivers/svga/svga_state_gs.c
+++ b/src/gallium/drivers/svga/svga_state_gs.c
@@ -53,13 +53,9 @@ translate_geometry_program(struct svga_context *svga,
                            const struct svga_geometry_shader *gs,
                            const struct svga_compile_key *key)
 {
-   if (svga_have_vgpu10(svga)) {
-      return svga_tgsi_vgpu10_translate(svga, &gs->base, key,
-                                        PIPE_SHADER_GEOMETRY);
-   }
-   else {
-      return svga_tgsi_vgpu9_translate(&gs->base, key, PIPE_SHADER_GEOMETRY);
-   }
+   assert(svga_have_vgpu10(svga));
+   return svga_tgsi_vgpu10_translate(svga, &gs->base, key,
+                                     PIPE_SHADER_GEOMETRY);
 }
 
 
diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c
index a846b779e70..24574c1bf85 100644
--- a/src/gallium/drivers/svga/svga_state_vs.c
+++ b/src/gallium/drivers/svga/svga_state_vs.c
@@ -81,7 +81,8 @@ translate_vertex_program(struct svga_context *svga,
                                         PIPE_SHADER_VERTEX);
    }
    else {
-      return svga_tgsi_vgpu9_translate(&vs->base, key, PIPE_SHADER_VERTEX);
+      return svga_tgsi_vgpu9_translate(svga, &vs->base, key,
+                                       PIPE_SHADER_VERTEX);
    }
 }
 
diff --git a/src/gallium/drivers/svga/svga_surface.c b/src/gallium/drivers/svga/svga_surface.c
index aca5abcdfce..9f09311116e 100644
--- a/src/gallium/drivers/svga/svga_surface.c
+++ b/src/gallium/drivers/svga/svga_surface.c
@@ -317,6 +317,8 @@ svga_create_surface_view(struct pipe_context *pipe,
       s->real_level = surf_tmpl->u.tex.level;
    }
 
+   svga->hud.num_surface_views++;
+
    return &s->base;
 }
 
@@ -509,6 +511,8 @@ svga_surface_destroy(struct pipe_context *pipe,
 
    pipe_resource_reference(&surf->texture, NULL);
    FREE(surf);
+
+   svga->hud.num_surface_views--;
 }
 
 
diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c
index 9a6fb465ccb..202eee276b7 100644
--- a/src/gallium/drivers/svga/svga_tgsi.c
+++ b/src/gallium/drivers/svga/svga_tgsi.c
@@ -175,7 +175,8 @@ svga_shader_emit_header(struct svga_shader_emitter *emit)
  * it is, it will be copied to a hardware buffer for upload.
  */
 struct svga_shader_variant *
-svga_tgsi_vgpu9_translate(const struct svga_shader *shader,
+svga_tgsi_vgpu9_translate(struct svga_context *svga,
+                          const struct svga_shader *shader,
                           const struct svga_compile_key *key, unsigned unit)
 {
    struct svga_shader_variant *variant = NULL;
@@ -227,7 +228,7 @@ svga_tgsi_vgpu9_translate(const struct svga_shader *shader,
       goto fail;
    }
 
-   variant = CALLOC_STRUCT(svga_shader_variant);
+   variant = svga_new_shader_variant(svga);
    if (variant == NULL)
       goto fail;
 
diff --git a/src/gallium/drivers/svga/svga_tgsi.h b/src/gallium/drivers/svga/svga_tgsi.h
index 207a3f0a845..2581135701f 100644
--- a/src/gallium/drivers/svga/svga_tgsi.h
+++ b/src/gallium/drivers/svga/svga_tgsi.h
@@ -63,7 +63,8 @@ static inline void svga_generate_vdecl_semantics( unsigned idx,
 
 
 struct svga_shader_variant *
-svga_tgsi_vgpu9_translate(const struct svga_shader *shader,
+svga_tgsi_vgpu9_translate(struct svga_context *svga,
+                          const struct svga_shader *shader,
                           const struct svga_compile_key *key, unsigned unit);
 
 struct svga_shader_variant *
diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
index e4f027b9567..d62f2bbcc96 100644
--- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
@@ -6735,7 +6735,7 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga,
    /*
     * Create, initialize the 'variant' object.
     */
-   variant = CALLOC_STRUCT(svga_shader_variant);
+   variant = svga_new_shader_variant(svga);
    if (!variant)
       goto cleanup;
 
diff --git a/src/gallium/state_trackers/va/image.c b/src/gallium/state_trackers/va/image.c
index 022240df84f..b37a9714437 100644
--- a/src/gallium/state_trackers/va/image.c
+++ b/src/gallium/state_trackers/va/image.c
@@ -116,7 +116,7 @@ vlVaCreateImage(VADriverContextP ctx, VAImageFormat *format, int width, int heig
    img->width = width;
    img->height = height;
    w = align(width, 2);
-   h = align(width, 2);
+   h = align(height, 2);
 
    switch (format->fourcc) {
    case VA_FOURCC('N','V','1','2'):
@@ -240,9 +240,11 @@ vlVaGetImage(VADriverContextP ctx, VASurfaceID surface, int x, int y,
       return VA_STATUS_ERROR_OPERATION_FAILED;
 
    if (format != surf->buffer->buffer_format) {
-      /* support NV12 to YV12 conversion now only */
-      if (format == PIPE_FORMAT_YV12 &&
-          surf->buffer->buffer_format == PIPE_FORMAT_NV12)
+      /* support NV12 to YV12 and IYUV conversion now only */
+      if ((format == PIPE_FORMAT_YV12 &&
+          surf->buffer->buffer_format == PIPE_FORMAT_NV12) ||
+          (format == PIPE_FORMAT_IYUV &&
+          surf->buffer->buffer_format == PIPE_FORMAT_NV12))
          convert = true;
       else
          return VA_STATUS_ERROR_OPERATION_FAILED;
diff --git a/src/gallium/targets/d3dadapter9/Makefile.am b/src/gallium/targets/d3dadapter9/Makefile.am
index e26ca33a521..b5221472ef0 100644
--- a/src/gallium/targets/d3dadapter9/Makefile.am
+++ b/src/gallium/targets/d3dadapter9/Makefile.am
@@ -76,7 +76,6 @@ d3dadapter9_la_LIBADD = \
 	$(top_builddir)/src/gallium/auxiliary/libgalliumvl_stub.la \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
 	$(top_builddir)/src/glsl/libnir.la \
-	$(top_builddir)/src/libglsl_util.la \
 	$(top_builddir)/src/gallium/state_trackers/nine/libninetracker.la \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(top_builddir)/src/gallium/winsys/sw/wrapper/libwsw.la \
diff --git a/src/gallium/targets/pipe-loader/Makefile.am b/src/gallium/targets/pipe-loader/Makefile.am
index 4d9f7be2ec9..4f25b4f6073 100644
--- a/src/gallium/targets/pipe-loader/Makefile.am
+++ b/src/gallium/targets/pipe-loader/Makefile.am
@@ -53,7 +53,6 @@ endif
 PIPE_LIBS += \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
 	$(top_builddir)/src/glsl/libnir.la \
-	$(top_builddir)/src/libglsl_util.la \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(top_builddir)/src/gallium/drivers/rbug/librbug.la \
 	$(top_builddir)/src/gallium/drivers/trace/libtrace.la \
diff --git a/src/gallium/targets/xa/Makefile.am b/src/gallium/targets/xa/Makefile.am
index 92173dedce3..02c42c665ed 100644
--- a/src/gallium/targets/xa/Makefile.am
+++ b/src/gallium/targets/xa/Makefile.am
@@ -38,7 +38,6 @@ libxatracker_la_LIBADD = \
 	$(top_builddir)/src/gallium/auxiliary/libgalliumvl_stub.la \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
 	$(top_builddir)/src/glsl/libnir.la \
-	$(top_builddir)/src/libglsl_util.la \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(LIBDRM_LIBS) \
 	$(GALLIUM_COMMON_LIB_DEPS)
diff --git a/src/glsl/Makefile.am b/src/glsl/Makefile.am
index 08368311b8a..8b0a73b250a 100644
--- a/src/glsl/Makefile.am
+++ b/src/glsl/Makefile.am
@@ -148,9 +148,6 @@ libglsl_la_SOURCES =					\
 
 
 libnir_la_SOURCES =					\
-	glsl_types.cpp					\
-	builtin_types.cpp				\
-	glsl_symbol_table.cpp				\
 	$(NIR_FILES)					\
 	$(NIR_GENERATED_FILES)
 
@@ -160,6 +157,7 @@ glsl_compiler_SOURCES = \
 glsl_compiler_LDADD =					\
 	libglsl.la					\
 	$(top_builddir)/src/libglsl_util.la		\
+	$(top_builddir)/src/util/libmesautil.la		\
 	$(PTHREAD_LIBS)
 
 spirv2nir_SOURCES = \
@@ -284,6 +282,5 @@ nir_tests_control_flow_tests_CFLAGS =			\
 nir_tests_control_flow_tests_LDADD =			\
 	$(top_builddir)/src/gtest/libgtest.la		\
 	$(top_builddir)/src/glsl/libnir.la		\
-	$(top_builddir)/src/libglsl_util.la		\
 	$(top_builddir)/src/util/libmesautil.la		\
 	$(PTHREAD_LIBS)
diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index 65a26268c2e..47dc628101d 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -20,6 +20,8 @@ NIR_GENERATED_FILES = \
 NIR_FILES = \
 	nir/glsl_to_nir.cpp \
 	nir/glsl_to_nir.h \
+	nir/glsl_types.cpp \
+	nir/glsl_types.h \
 	nir/nir.c \
 	nir/nir.h \
 	nir/nir_array.h \
@@ -33,6 +35,8 @@ NIR_FILES = \
 	nir/nir_gs_count_vertices.c \
 	nir/nir_intrinsics.c \
 	nir/nir_intrinsics.h \
+	nir/nir_instr_set.c \
+	nir/nir_instr_set.h \
 	nir/nir_live_variables.c \
 	nir/nir_lower_alu_to_scalar.c \
 	nir/nir_lower_atomics.c \
@@ -81,6 +85,8 @@ NIR_FILES = \
 	nir/nir_worklist.c \
 	nir/nir_worklist.h \
 	nir/nir_types.cpp \
+	nir/shader_enums.h \
+	nir/shader_enums.c \
 	nir/spirv_to_nir.c \
 	nir/spirv_glsl450_to_nir.c
 
@@ -103,8 +109,6 @@ LIBGLSL_FILES = \
 	glsl_parser_extras.h \
 	glsl_symbol_table.cpp \
 	glsl_symbol_table.h \
-	glsl_types.cpp \
-	glsl_types.h \
 	hir_field_selection.cpp \
 	ir_basic_block.cpp \
 	ir_basic_block.h \
@@ -206,8 +210,7 @@ LIBGLSL_FILES = \
 	opt_vectorize.cpp \
 	program.h \
 	s_expression.cpp \
-	s_expression.h \
-	shader_enums.h
+	s_expression.h
 
 # glsl_compiler
 
diff --git a/src/glsl/SConscript b/src/glsl/SConscript
index 89c603580a5..70bf5b09c3c 100644
--- a/src/glsl/SConscript
+++ b/src/glsl/SConscript
@@ -16,6 +16,7 @@ env.Prepend(CPPPATH = [
     '#src/gallium/include',
     '#src/gallium/auxiliary',
     '#src/glsl',
+    '#src/glsl/nir',
     '#src/glsl/glcpp',
 ])
 
@@ -60,6 +61,12 @@ source_lists = env.ParseSourceList('Makefile.sources')
 for l in ('LIBGLCPP_FILES', 'LIBGLSL_FILES'):
     glsl_sources += source_lists[l]
 
+# add nir/glsl_types.cpp manually, because SCons still doesn't know about NIR.
+# XXX: Remove this once we build NIR and NIR_FILES.
+glsl_sources += [
+    'nir/glsl_types.cpp',
+]
+
 if env['msvc']:
     env.Prepend(CPPPATH = ['#/src/getopt'])
     env.PrependUnique(LIBS = [getopt])
diff --git a/src/glsl/ast.h b/src/glsl/ast.h
index 4c314366133..e803e6d7675 100644
--- a/src/glsl/ast.h
+++ b/src/glsl/ast.h
@@ -62,6 +62,8 @@ public:
    virtual ir_rvalue *hir(exec_list *instructions,
 			  struct _mesa_glsl_parse_state *state);
 
+   virtual bool has_sequence_subexpression() const;
+
    /**
     * Retrieve the source location of an AST node
     *
@@ -181,6 +183,7 @@ enum ast_operators {
    ast_post_dec,
    ast_field_selection,
    ast_array_index,
+   ast_unsized_array_dim,
 
    ast_function_call,
 
@@ -221,6 +224,8 @@ public:
    virtual void hir_no_rvalue(exec_list *instructions,
                               struct _mesa_glsl_parse_state *state);
 
+   virtual bool has_sequence_subexpression() const;
+
    ir_rvalue *do_hir(exec_list *instructions,
                      struct _mesa_glsl_parse_state *state,
                      bool needs_rvalue);
@@ -299,6 +304,8 @@ public:
    virtual void hir_no_rvalue(exec_list *instructions,
                               struct _mesa_glsl_parse_state *state);
 
+   virtual bool has_sequence_subexpression() const;
+
 private:
    /**
     * Is this function call actually a constructor?
@@ -318,16 +325,7 @@ public:
 
 class ast_array_specifier : public ast_node {
 public:
-   /** Unsized array specifier ([]) */
-   explicit ast_array_specifier(const struct YYLTYPE &locp)
-     : is_unsized_array(true)
-   {
-      set_location(locp);
-   }
-
-   /** Sized array specifier ([dim]) */
    ast_array_specifier(const struct YYLTYPE &locp, ast_expression *dim)
-     : is_unsized_array(false)
    {
       set_location(locp);
       array_dimensions.push_tail(&dim->link);
@@ -338,13 +336,16 @@ public:
       array_dimensions.push_tail(&dim->link);
    }
 
-   virtual void print(void) const;
+   bool is_single_dimension() const   /* read-only query on array_dimensions */
+   {  /* true when the node before tail_pred is the list's head sentinel */
+      return this->array_dimensions.tail_pred->prev != NULL &&
+             this->array_dimensions.tail_pred->prev->is_head_sentinel();
+   }
 
-   /* If true, this means that the array has an unsized outermost dimension. */
-   bool is_unsized_array;
+   virtual void print(void) const;
 
    /* This list contains objects of type ast_node containing the
-    * sized dimensions only, in outermost-to-innermost order.
+    * array dimensions in outermost-to-innermost order.
     */
    exec_list array_dimensions;
 };
diff --git a/src/glsl/ast_array_index.cpp b/src/glsl/ast_array_index.cpp
index 5e8f49d70b0..74d403fdb65 100644
--- a/src/glsl/ast_array_index.cpp
+++ b/src/glsl/ast_array_index.cpp
@@ -28,13 +28,10 @@
 void
 ast_array_specifier::print(void) const
 {
-   if (this->is_unsized_array) {
-      printf("[ ] ");
-   }
-
    foreach_list_typed (ast_node, array_dimension, link, &this->array_dimensions) {
       printf("[ ");
-      array_dimension->print();
+      if (((ast_expression*)array_dimension)->oper != ast_unsized_array_dim)
+         array_dimension->print();
       printf("] ");
    }
 }
@@ -64,21 +61,29 @@ update_max_array_access(ir_rvalue *ir, int idx, YYLTYPE *loc,
       }
    } else if (ir_dereference_record *deref_record =
               ir->as_dereference_record()) {
-      /* There are two possibilities we need to consider:
+      /* There are three possibilities we need to consider:
        *
        * - Accessing an element of an array that is a member of a named
        *   interface block (e.g. ifc.foo[i])
        *
        * - Accessing an element of an array that is a member of a named
        *   interface block array (e.g. ifc[j].foo[i]).
+       *
+       * - Accessing an element of an array that is a member of a named
+       *   interface block array of arrays (e.g. ifc[j][k].foo[i]).
        */
       ir_dereference_variable *deref_var =
          deref_record->record->as_dereference_variable();
       if (deref_var == NULL) {
-         if (ir_dereference_array *deref_array =
-             deref_record->record->as_dereference_array()) {
-            deref_var = deref_array->array->as_dereference_variable();
+         ir_dereference_array *deref_array =
+            deref_record->record->as_dereference_array();
+         ir_dereference_array *deref_array_prev = NULL;
+         while (deref_array != NULL) {
+            deref_array_prev = deref_array;
+            deref_array = deref_array->array->as_dereference_array();
          }
+         if (deref_array_prev != NULL)
+            deref_var = deref_array_prev->array->as_dereference_variable();
       }
 
       if (deref_var != NULL) {
@@ -230,7 +235,7 @@ _mesa_ast_array_index_to_hir(void *mem_ctx,
                   ir_var_shader_storage) {
             _mesa_glsl_error(&loc, state, "unsized array index must be constant");
          }
-      } else if (array->type->fields.array->is_interface()
+      } else if (array->type->without_array()->is_interface()
                  && (array->variable_referenced()->data.mode == ir_var_uniform ||
                      array->variable_referenced()->data.mode == ir_var_shader_storage)
                  && !state->is_version(400, 0) && !state->ARB_gpu_shader5_enable) {
diff --git a/src/glsl/ast_function.cpp b/src/glsl/ast_function.cpp
index 26d4c62ce36..c5c5cae333b 100644
--- a/src/glsl/ast_function.cpp
+++ b/src/glsl/ast_function.cpp
@@ -437,13 +437,54 @@ generate_call(exec_list *instructions, ir_function_signature *sig,
       }
    }
 
-   /* If the function call is a constant expression, don't generate any
-    * instructions; just generate an ir_constant.
+   /* Section 4.3.2 (Const) of the GLSL 1.10.59 spec says:
+    *
+    *     "Initializers for const declarations must be formed from literal
+    *     values, other const variables (not including function call
+    *     paramaters), or expressions of these.
+    *
+    *     Constructors may be used in such expressions, but function calls may
+    *     not."
+    *
+    * Section 4.3.3 (Constant Expressions) of the GLSL 1.20.8 spec says:
+    *
+    *     "A constant expression is one of
+    *
+    *         ...
+    *
+    *         - a built-in function call whose arguments are all constant
+    *           expressions, with the exception of the texture lookup
+    *           functions, the noise functions, and ftransform. The built-in
+    *           functions dFdx, dFdy, and fwidth must return 0 when evaluated
+    *           inside an initializer with an argument that is a constant
+    *           expression."
+    *
+    * Section 5.10 (Constant Expressions) of the GLSL ES 1.00.17 spec says:
+    *
+    *     "A constant expression is one of
     *
-    * Function calls were first allowed to be constant expressions in GLSL
-    * 1.20 and GLSL ES 3.00.
+    *         ...
+    *
+    *         - a built-in function call whose arguments are all constant
+    *           expressions, with the exception of the texture lookup
+    *           functions."
+    *
+    * Section 4.3.3 (Constant Expressions) of the GLSL ES 3.00.4 spec says:
+    *
+    *     "A constant expression is one of
+    *
+    *         ...
+    *
+    *         - a built-in function call whose arguments are all constant
+    *           expressions, with the exception of the texture lookup
+    *           functions.  The built-in functions dFdx, dFdy, and fwidth must
+    *           return 0 when evaluated inside an initializer with an argument
+    *           that is a constant expression."
+    *
+    * If the function call is a constant expression, don't generate any
+    * instructions; just generate an ir_constant.
     */
-   if (state->is_version(120, 300)) {
+   if (state->is_version(120, 100)) {
       ir_constant *value = sig->constant_expression_value(actual_parameters, NULL);
       if (value != NULL) {
 	 return value;
@@ -950,6 +991,7 @@ process_array_constructor(exec_list *instructions,
    }
 
    bool all_parameters_are_constant = true;
+   const glsl_type *element_type = constructor_type->fields.array;
 
    /* Type cast each parameter and, if possible, fold constants. */
    foreach_in_list_safe(ir_rvalue, ir, &actual_parameters) {
@@ -976,12 +1018,34 @@ process_array_constructor(exec_list *instructions,
 	 }
       }
 
-      if (result->type != constructor_type->fields.array) {
+      if (constructor_type->fields.array->is_unsized_array()) {
+         /* As the inner parameters of the constructor are created without
+          * knowledge of each other we need to check to make sure unsized
+          * parameters of unsized constructors all end up with the same size.
+          *
+          * e.g we make sure to fail for a constructor like this:
+          * vec4[][] a = vec4[][](vec4[](vec4(0.0), vec4(1.0)),
+          *                       vec4[](vec4(0.0), vec4(1.0), vec4(1.0)),
+          *                       vec4[](vec4(0.0), vec4(1.0)));
+          */
+         if (element_type->is_unsized_array()) {
+             /* This is the first parameter so just get the type */
+            element_type = result->type;
+         } else if (element_type != result->type) {
+            _mesa_glsl_error(loc, state, "type error in array constructor: "
+                             "expected: %s, found %s",
+                             element_type->name,
+                             result->type->name);
+            return ir_rvalue::error_value(ctx);
+         }
+      } else if (result->type != constructor_type->fields.array) {
 	 _mesa_glsl_error(loc, state, "type error in array constructor: "
 			  "expected: %s, found %s",
 			  constructor_type->fields.array->name,
 			  result->type->name);
          return ir_rvalue::error_value(ctx);
+      } else {
+         element_type = result->type;
       }
 
       /* Attempt to convert the parameter to a constant valued expression.
@@ -998,6 +1062,14 @@ process_array_constructor(exec_list *instructions,
       ir->replace_with(result);
    }
 
+   if (constructor_type->fields.array->is_unsized_array()) {
+      constructor_type =
+	 glsl_type::get_array_instance(element_type,
+				       parameter_count);
+      assert(constructor_type != NULL);
+      assert(constructor_type->length == parameter_count);
+   }
+
    if (all_parameters_are_constant)
       return new(ctx) ir_constant(constructor_type, &actual_parameters);
 
@@ -1958,6 +2030,17 @@ ast_function_expression::hir(exec_list *instructions,
    unreachable("not reached");
 }
 
+bool
+ast_function_expression::has_sequence_subexpression() const
+{
+   foreach_list_typed(const ast_node, ast, link, &this->expressions) {
+      if (ast->has_sequence_subexpression())
+         return true;
+   }
+   /* No node on the 'expressions' list reported a sequence subexpression. */
+   return false;
+}
+
 ir_rvalue *
 ast_aggregate_initializer::hir(exec_list *instructions,
                                struct _mesa_glsl_parse_state *state)
diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index f38ca84d129..0c11ec58d20 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -782,8 +782,30 @@ validate_assignment(struct _mesa_glsl_parse_state *state,
     * Note: Whole-array assignments are not permitted in GLSL 1.10, but this
     * is handled by ir_dereference::is_lvalue.
     */
-   if (lhs->type->is_unsized_array() && rhs->type->is_array()
-       && (lhs->type->fields.array == rhs->type->fields.array)) {
+   const glsl_type *lhs_t = lhs->type;
+   const glsl_type *rhs_t = rhs->type;
+   bool unsized_array = false;
+   while(lhs_t->is_array()) {
+      if (rhs_t == lhs_t)
+         break; /* the rest of the inner arrays match so break out early */
+      if (!rhs_t->is_array()) {
+         unsized_array = false;
+         break; /* number of dimensions mismatch */
+      }
+      if (lhs_t->length == rhs_t->length) {
+         lhs_t = lhs_t->fields.array;
+         rhs_t = rhs_t->fields.array;
+         continue;
+      } else if (lhs_t->is_unsized_array()) {
+         unsized_array = true;
+      } else {
+         unsized_array = false;
+         break; /* sized array mismatch */
+      }
+      lhs_t = lhs_t->fields.array;
+      rhs_t = rhs_t->fields.array;
+   }
+   if (unsized_array) {
       if (is_initializer) {
          return rhs;
       } else {
@@ -1004,6 +1026,12 @@ ast_node::hir(exec_list *instructions, struct _mesa_glsl_parse_state *state)
    return NULL;
 }
 
+bool
+ast_node::has_sequence_subexpression() const
+{
+   return false;
+}
+
 void
 ast_function_expression::hir_no_rvalue(exec_list *instructions,
                                        struct _mesa_glsl_parse_state *state)
@@ -1805,6 +1833,10 @@ ast_expression::do_hir(exec_list *instructions,
       break;
    }
 
+   case ast_unsized_array_dim:
+      assert(!"ast_unsized_array_dim: Should never get here.");
+      break;
+
    case ast_function_call:
       /* Should *NEVER* get here.  ast_function_call should always be handled
        * by ast_function_expression::hir.
@@ -1916,6 +1948,83 @@ ast_expression::do_hir(exec_list *instructions,
    return result;
 }
 
+bool
+ast_expression::has_sequence_subexpression() const
+{
+   switch (this->oper) {
+   case ast_plus:
+   case ast_neg:
+   case ast_bit_not:
+   case ast_logic_not:
+   case ast_pre_inc:
+   case ast_pre_dec:
+   case ast_post_inc:
+   case ast_post_dec:
+      return this->subexpressions[0]->has_sequence_subexpression();
+
+   case ast_assign:
+   case ast_add:
+   case ast_sub:
+   case ast_mul:
+   case ast_div:
+   case ast_mod:
+   case ast_lshift:
+   case ast_rshift:
+   case ast_less:
+   case ast_greater:
+   case ast_lequal:
+   case ast_gequal:
+   case ast_nequal:
+   case ast_equal:
+   case ast_bit_and:
+   case ast_bit_xor:
+   case ast_bit_or:
+   case ast_logic_and:
+   case ast_logic_or:
+   case ast_logic_xor:
+   case ast_array_index:
+   case ast_mul_assign:
+   case ast_div_assign:
+   case ast_add_assign:
+   case ast_sub_assign:
+   case ast_mod_assign:
+   case ast_ls_assign:
+   case ast_rs_assign:
+   case ast_and_assign:
+   case ast_xor_assign:
+   case ast_or_assign:
+      return this->subexpressions[0]->has_sequence_subexpression() ||
+             this->subexpressions[1]->has_sequence_subexpression();
+
+   case ast_conditional:
+      return this->subexpressions[0]->has_sequence_subexpression() ||
+             this->subexpressions[1]->has_sequence_subexpression() ||
+             this->subexpressions[2]->has_sequence_subexpression();
+
+   case ast_sequence:
+      return true;
+
+   case ast_field_selection:
+   case ast_identifier:
+   case ast_int_constant:
+   case ast_uint_constant:
+   case ast_float_constant:
+   case ast_bool_constant:
+   case ast_double_constant:
+      return false;
+
+   case ast_aggregate:
+      unreachable("ast_aggregate: Should never get here.");
+
+   case ast_function_call:
+      unreachable("should be handled by ast_function_expression::hir");
+
+   case ast_unsized_array_dim:
+      unreachable("ast_unsized_array_dim: Should never get here.");
+   }
+
+   return false;
+}
 
 ir_rvalue *
 ast_expression_statement::hir(exec_list *instructions,
@@ -1968,6 +2077,14 @@ process_array_size(exec_node *node,
    exec_list dummy_instructions;
 
    ast_node *array_size = exec_node_data(ast_node, node, link);
+
+   /**
+    * Dimensions other than the outermost dimension can be unsized if they
+    * are immediately sized by a constructor or initializer.
+    */
+   if (((ast_expression*)array_size)->oper == ast_unsized_array_dim)
+      return 0;
+
    ir_rvalue *const ir = array_size->hir(& dummy_instructions, state);
    YYLTYPE loc = array_size->get_location();
 
@@ -1990,7 +2107,7 @@ process_array_size(exec_node *node,
    }
 
    ir_constant *const size = ir->constant_expression_value();
-   if (size == NULL) {
+   if (size == NULL || array_size->has_sequence_subexpression()) {
       _mesa_glsl_error(& loc, state, "array size must be a "
                        "constant valued expression");
       return 0;
@@ -2028,20 +2145,7 @@ process_array_type(YYLTYPE *loc, const glsl_type *base,
           *
           * "Only one-dimensional arrays may be declared."
           */
-         if (!state->ARB_arrays_of_arrays_enable) {
-            _mesa_glsl_error(loc, state,
-                             "invalid array of `%s'"
-                             "GL_ARB_arrays_of_arrays "
-                             "required for defining arrays of arrays",
-                             base->name);
-            return glsl_type::error_type;
-         }
-
-         if (base->length == 0) {
-            _mesa_glsl_error(loc, state,
-                             "only the outermost array dimension can "
-                             "be unsized",
-                             base->name);
+         if (!state->check_arrays_of_arrays_allowed(loc)) {
             return glsl_type::error_type;
          }
       }
@@ -2051,9 +2155,6 @@ process_array_type(YYLTYPE *loc, const glsl_type *base,
          unsigned array_size = process_array_size(node, state);
          array_type = glsl_type::get_array_instance(array_type, array_size);
       }
-
-      if (array_specifier->is_unsized_array)
-         array_type = glsl_type::get_array_instance(array_type, 0);
    }
 
    return array_type;
@@ -2592,6 +2693,25 @@ is_conflicting_fragcoord_redeclaration(struct _mesa_glsl_parse_state *state,
    return false;
 }
 
+/* Emit an error if any dimension other than the outermost is unsized. */
+static inline void
+validate_array_dimensions(const glsl_type *t,
+                          struct _mesa_glsl_parse_state *state,
+                          YYLTYPE *loc) {
+   if (t->is_array()) {
+      t = t->fields.array;
+      while (t->is_array()) {
+         if (t->is_unsized_array()) {
+            _mesa_glsl_error(loc, state,
+                             "only the outermost array dimension can "
+                             "be unsized");
+            break;
+         }
+         t = t->fields.array;
+      }
+   }
+}
+
 static void
 apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
                                  ir_variable *var,
@@ -3171,7 +3291,8 @@ process_initializer(ir_variable *var, ast_declaration *decl,
     */
    if (var->data.mode == ir_var_uniform) {
       state->check_version(120, 0, &initializer_loc,
-                           "cannot initialize uniforms");
+                           "cannot initialize uniform %s",
+                           var->name);
    }
 
    /* Section 4.3.7 "Buffer Variables" of the GLSL 4.30 spec:
@@ -3179,8 +3300,9 @@ process_initializer(ir_variable *var, ast_declaration *decl,
     *    "Buffer variables cannot have initializers."
     */
    if (var->data.mode == ir_var_shader_storage) {
-      _mesa_glsl_error(& initializer_loc, state,
-                       "SSBO variables cannot have initializers");
+      _mesa_glsl_error(&initializer_loc, state,
+                       "cannot initialize buffer variable %s",
+                       var->name);
    }
 
    /* From section 4.1.7 of the GLSL 4.40 spec:
@@ -3190,16 +3312,25 @@ process_initializer(ir_variable *var, ast_declaration *decl,
     *     shader."
     */
    if (var->type->contains_opaque()) {
-      _mesa_glsl_error(& initializer_loc, state,
-                       "cannot initialize opaque variable");
+      _mesa_glsl_error(&initializer_loc, state,
+                       "cannot initialize opaque variable %s",
+                       var->name);
    }
 
    if ((var->data.mode == ir_var_shader_in) && (state->current_function == NULL)) {
-      _mesa_glsl_error(& initializer_loc, state,
-		       "cannot initialize %s shader input / %s",
-		       _mesa_shader_stage_to_string(state->stage),
-		       (state->stage == MESA_SHADER_VERTEX)
-		       ? "attribute" : "varying");
+      _mesa_glsl_error(&initializer_loc, state,
+                       "cannot initialize %s shader input / %s %s",
+                       _mesa_shader_stage_to_string(state->stage),
+                       (state->stage == MESA_SHADER_VERTEX)
+                       ? "attribute" : "varying",
+                       var->name);
+   }
+
+   if (var->data.mode == ir_var_shader_out && state->current_function == NULL) {
+      _mesa_glsl_error(&initializer_loc, state,
+                       "cannot initialize %s shader output %s",
+                       _mesa_shader_stage_to_string(state->stage),
+                       var->name);
    }
 
    /* If the initializer is an ast_aggregate_initializer, recursively store
@@ -3214,16 +3345,72 @@ process_initializer(ir_variable *var, ast_declaration *decl,
 
    /* Calculate the constant value if this is a const or uniform
     * declaration.
+    *
+    * Section 4.3 (Storage Qualifiers) of the GLSL ES 1.00.17 spec says:
+    *
+    *     "Declarations of globals without a storage qualifier, or with
+    *     just the const qualifier, may include initializers, in which case
+    *     they will be initialized before the first line of main() is
+    *     executed.  Such initializers must be a constant expression."
+    *
+    * The same section of the GLSL ES 3.00.4 spec has similar language.
     */
    if (type->qualifier.flags.q.constant
-       || type->qualifier.flags.q.uniform) {
+       || type->qualifier.flags.q.uniform
+       || (state->es_shader && state->current_function == NULL)) {
       ir_rvalue *new_rhs = validate_assignment(state, initializer_loc,
                                                lhs, rhs, true);
       if (new_rhs != NULL) {
          rhs = new_rhs;
 
+         /* Section 4.3.3 (Constant Expressions) of the GLSL ES 3.00.4 spec
+          * says:
+          *
+          *     "A constant expression is one of
+          *
+          *        ...
+          *
+          *        - an expression formed by an operator on operands that are
+          *          all constant expressions, including getting an element of
+          *          a constant array, or a field of a constant structure, or
+          *          components of a constant vector.  However, the sequence
+          *          operator ( , ) and the assignment operators ( =, +=, ...)
+          *          are not included in the operators that can create a
+          *          constant expression."
+          *
+          * Section 12.43 (Sequence operator and constant expressions) says:
+          *
+          *     "Should the following construct be allowed?
+          *
+          *         float a[2,3];
+          *
+          *     The expression within the brackets uses the sequence operator
+          *     (',') and returns the integer 3 so the construct is declaring
+          *     a single-dimensional array of size 3.  In some languages, the
+          *     construct declares a two-dimensional array.  It would be
+          *     preferable to make this construct illegal to avoid confusion.
+          *
+          *     One possibility is to change the definition of the sequence
+          *     operator so that it does not return a constant-expression and
+          *     hence cannot be used to declare an array size.
+          *
+          *     RESOLUTION: The result of a sequence operator is not a
+          *     constant-expression."
+          *
+          * Section 4.3.3 (Constant Expressions) of the GLSL 4.30.9 spec
+          * contains language almost identical to the section 4.3.3 in the
+          * GLSL ES 3.00.4 spec.  This is a new limitation for these GLSL
+          * versions.
+          */
          ir_constant *constant_value = rhs->constant_expression_value();
-         if (!constant_value) {
+         if (!constant_value ||
+             (state->is_version(430, 300) &&
+              decl->initializer->has_sequence_subexpression())) {
+            const char *const variable_mode =
+               (type->qualifier.flags.q.constant)
+               ? "const"
+               : ((type->qualifier.flags.q.uniform) ? "uniform" : "global");
+
             /* If ARB_shading_language_420pack is enabled, initializers of
              * const-qualified local variables do not have to be constant
              * expressions. Const-qualified global variables must still be
@@ -3234,22 +3421,24 @@ process_initializer(ir_variable *var, ast_declaration *decl,
                _mesa_glsl_error(& initializer_loc, state,
                                 "initializer of %s variable `%s' must be a "
                                 "constant expression",
-                                (type->qualifier.flags.q.constant)
-                                ? "const" : "uniform",
+                                variable_mode,
                                 decl->identifier);
                if (var->type->is_numeric()) {
                   /* Reduce cascading errors. */
-                  var->constant_value = ir_constant::zero(state, var->type);
+                  var->constant_value = type->qualifier.flags.q.constant
+                     ? ir_constant::zero(state, var->type) : NULL;
                }
             }
          } else {
             rhs = constant_value;
-            var->constant_value = constant_value;
+            var->constant_value = type->qualifier.flags.q.constant
+               ? constant_value : NULL;
          }
       } else {
          if (var->type->is_numeric()) {
             /* Reduce cascading errors. */
-            var->constant_value = ir_constant::zero(state, var->type);
+            var->constant_value = type->qualifier.flags.q.constant
+               ? ir_constant::zero(state, var->type) : NULL;
          }
       }
    }
@@ -4265,6 +4454,8 @@ ast_declarator_list::hir(exec_list *instructions,
          result = process_initializer((earlier == NULL) ? var : earlier,
                                       decl, this->type,
                                       &initializer_instructions, state);
+      } else {
+         validate_array_dimensions(var_type, state, &loc);
       }
 
       /* From page 23 (page 29 of the PDF) of the GLSL 1.10 spec:
@@ -5790,6 +5981,7 @@ ast_process_structure_or_interface_block(exec_list *instructions,
 
          const struct glsl_type *field_type =
             process_array_type(&loc, decl_type, decl->array_specifier, state);
+         validate_array_dimensions(field_type, state, &loc);
          fields[i].type = field_type;
          fields[i].name = decl->identifier;
          fields[i].location = -1;
@@ -6142,7 +6334,8 @@ ast_interface_block::hir(exec_list *instructions,
                              _mesa_shader_stage_to_string(state->stage));
          }
          if (this->instance_name == NULL ||
-             strcmp(this->instance_name, "gl_in") != 0 || this->array_specifier == NULL) {
+             strcmp(this->instance_name, "gl_in") != 0 || this->array_specifier == NULL ||
+             !this->array_specifier->is_single_dimension()) {
             _mesa_glsl_error(&loc, state,
                              "gl_PerVertex input must be redeclared as "
                              "gl_in[]");
@@ -6305,6 +6498,9 @@ ast_interface_block::hir(exec_list *instructions,
       ir_variable *var;
 
       if (this->array_specifier != NULL) {
+         const glsl_type *block_array_type =
+            process_array_type(&loc, block_type, this->array_specifier, state);
+
          /* Section 4.3.7 (Interface Blocks) of the GLSL 1.50 spec says:
           *
           *     For uniform blocks declared an array, each individual array
@@ -6328,7 +6524,7 @@ ast_interface_block::hir(exec_list *instructions,
           * tessellation control shader output, and tessellation evaluation
           * shader input.
           */
-         if (this->array_specifier->is_unsized_array) {
+         if (block_array_type->is_unsized_array()) {
             bool allow_inputs = state->stage == MESA_SHADER_GEOMETRY ||
                                 state->stage == MESA_SHADER_TESS_CTRL ||
                                 state->stage == MESA_SHADER_TESS_EVAL;
@@ -6355,9 +6551,6 @@ ast_interface_block::hir(exec_list *instructions,
             }
          }
 
-         const glsl_type *block_array_type =
-            process_array_type(&loc, block_type, this->array_specifier, state);
-
          /* From section 4.3.9 (Interface Blocks) of the GLSL ES 3.10 spec:
           *
           *     * Arrays of arrays of blocks are not allowed
diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp
index f0f6be21b7d..aae25f893e8 100644
--- a/src/glsl/builtin_functions.cpp
+++ b/src/glsl/builtin_functions.cpp
@@ -403,7 +403,7 @@ shader_atomic_counters(const _mesa_glsl_parse_state *state)
 static bool
 shader_storage_buffer_object(const _mesa_glsl_parse_state *state)
 {
-   return state->ARB_shader_storage_buffer_object_enable;
+   return state->has_shader_storage_buffer_objects();
 }
 
 static bool
diff --git a/src/glsl/builtin_types.cpp b/src/glsl/builtin_types.cpp
index 0aedbb3546a..bbdcd199e92 100644
--- a/src/glsl/builtin_types.cpp
+++ b/src/glsl/builtin_types.cpp
@@ -43,9 +43,7 @@
  * convenience pointers (glsl_type::foo_type).
  * @{
  */
-#define DECL_TYPE(NAME, ...)                                    \
-   const glsl_type glsl_type::_##NAME##_type = glsl_type(__VA_ARGS__, #NAME); \
-   const glsl_type *const glsl_type::NAME##_type = &glsl_type::_##NAME##_type;
+#define DECL_TYPE(NAME, ...)
 
 #define STRUCT_TYPE(NAME)                                       \
    const glsl_type glsl_type::_struct_##NAME##_type =           \
diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index c1bcccc34f4..cd00f6e085b 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -1962,7 +1962,9 @@ array_specifier:
    '[' ']'
    {
       void *ctx = state;
-      $$ = new(ctx) ast_array_specifier(@1);
+      $$ = new(ctx) ast_array_specifier(@1, new(ctx) ast_expression(
+                                                  ast_unsized_array_dim, NULL,
+                                                  NULL, NULL));
       $$->set_location_range(@1, @2);
    }
    | '[' constant_expression ']'
@@ -1973,29 +1975,21 @@ array_specifier:
    }
    | array_specifier '[' ']'
    {
+      void *ctx = state;
       $$ = $1;
 
-      if (!state->ARB_arrays_of_arrays_enable) {
-         _mesa_glsl_error(& @1, state,
-                          "GL_ARB_arrays_of_arrays "
-                          "required for defining arrays of arrays");
-      } else {
-         _mesa_glsl_error(& @1, state,
-                          "only the outermost array dimension can "
-                          "be unsized");
+      if (state->check_arrays_of_arrays_allowed(& @1)) {
+         $$->add_dimension(new(ctx) ast_expression(ast_unsized_array_dim, NULL,
+                                                   NULL, NULL));
       }
    }
    | array_specifier '[' constant_expression ']'
    {
       $$ = $1;
 
-      if (!state->ARB_arrays_of_arrays_enable) {
-         _mesa_glsl_error(& @1, state,
-                          "GL_ARB_arrays_of_arrays "
-                          "required for defining arrays of arrays");
+      if (state->check_arrays_of_arrays_allowed(& @1)) {
+         $$->add_dimension($3);
       }
-
-      $$->add_dimension($3);
    }
    ;
 
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index 7fee43ece52..e8740f9ecb9 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -115,6 +115,20 @@ struct _mesa_glsl_parse_state {
                       unsigned required_glsl_es_version,
                       YYLTYPE *locp, const char *fmt, ...) PRINTFLIKE(5, 6);
 
+   bool check_arrays_of_arrays_allowed(YYLTYPE *locp)
+   {
+      if (!(ARB_arrays_of_arrays_enable || is_version(430, 310))) {
+         const char *const requirement = this->es_shader
+            ? "GLSL ES 3.10"
+            : "GL_ARB_arrays_of_arrays or GLSL 4.30";
+         _mesa_glsl_error(locp, this,
+                          "%s required for defining arrays of arrays.",
+                          requirement);
+         return false;
+      }
+      return true;
+   }
+
    bool check_precision_qualifiers_allowed(YYLTYPE *locp)
    {
       return check_version(130, 100, locp,
diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
index 2c45b9edc0f..8933b230177 100644
--- a/src/glsl/ir.cpp
+++ b/src/glsl/ir.cpp
@@ -662,6 +662,22 @@ ir_expression::get_operator(const char *str)
    return (ir_expression_operation) -1;
 }
 
+ir_variable *
+ir_expression::variable_referenced() const
+{
+   switch (operation) {
+      case ir_binop_vector_extract:
+      case ir_triop_vector_insert:
+         /* We get these for things like a[0] where a is a vector type. In these
+          * cases we want variable_referenced() to return the actual vector
+          * variable this is wrapping.
+          */
+         return operands[0]->variable_referenced();
+      default:
+         return ir_rvalue::variable_referenced();
+   }
+}
+
 ir_constant::ir_constant()
    : ir_rvalue(ir_type_constant)
 {
@@ -1673,8 +1689,8 @@ ir_variable::ir_variable(const struct glsl_type *type, const char *name,
 
       if (type->is_interface())
          this->init_interface_type(type);
-      else if (type->is_array() && type->fields.array->is_interface())
-         this->init_interface_type(type->fields.array);
+      else if (type->without_array()->is_interface())
+         this->init_interface_type(type->without_array());
    }
 }
 
diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index 43a2bf0ae1c..9c9f22d018b 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -1731,6 +1731,8 @@ public:
 
    virtual ir_visitor_status accept(ir_hierarchical_visitor *);
 
+   virtual ir_variable *variable_referenced() const;
+
    ir_expression_operation operation;
    ir_rvalue *operands[4];
 };
diff --git a/src/glsl/ir_constant_expression.cpp b/src/glsl/ir_constant_expression.cpp
index 309b6b72b5b..67ed3605a8c 100644
--- a/src/glsl/ir_constant_expression.cpp
+++ b/src/glsl/ir_constant_expression.cpp
@@ -36,6 +36,7 @@
 #include <math.h>
 #include "main/core.h" /* for MAX2, MIN2, CLAMP */
 #include "util/rounding.h" /* for _mesa_roundeven */
+#include "util/half_float.h"
 #include "ir.h"
 #include "glsl_types.h"
 #include "program/hash_table.h"
diff --git a/src/glsl/ir_set_program_inouts.cpp b/src/glsl/ir_set_program_inouts.cpp
index b7a0f6e95ba..d7c29b00f88 100644
--- a/src/glsl/ir_set_program_inouts.cpp
+++ b/src/glsl/ir_set_program_inouts.cpp
@@ -242,6 +242,12 @@ ir_set_program_inouts_visitor::try_mark_partial_variable(ir_variable *var,
       type = type->fields.array;
    }
 
+   /* TODO: implement proper arrays of arrays support
+    * for now let the caller mark whole variable as used.
+    */
+   if (type->is_array() && type->fields.array->is_array())
+      return false;
+
    /* The code below only handles:
     *
     * - Indexing into matrices
diff --git a/src/glsl/ir_uniform.h b/src/glsl/ir_uniform.h
index 50fe76b7ea2..1854279925b 100644
--- a/src/glsl/ir_uniform.h
+++ b/src/glsl/ir_uniform.h
@@ -162,6 +162,22 @@ struct gl_uniform_storage {
    /** @} */
 
    /**
+    * This is a compiler-generated uniform that should not be advertised
+    * via the API.
+    */
+   bool hidden;
+
+   /**
+    * This is a built-in uniform that should not be modified through any gl API.
+    */
+   bool builtin;
+
+   /**
+    * This is a shader storage buffer variable, not a uniform.
+    */
+   bool is_shader_storage;
+
+   /**
     * Index within gl_shader_program::AtomicBuffers[] of the atomic
     * counter buffer this uniform is stored in, or -1 if this is not
     * an atomic counter.
@@ -181,20 +197,16 @@ struct gl_uniform_storage {
    unsigned num_compatible_subroutines;
 
    /**
-    * This is a compiler-generated uniform that should not be advertised
-    * via the API.
+    * A single integer identifying the number of active array elements of
+    * the top-level shader storage block member (GL_TOP_LEVEL_ARRAY_SIZE).
     */
-   bool hidden;
+   unsigned top_level_array_size;
 
    /**
-    * This is a built-in uniform that should not be modified through any gl API.
+    * A single integer identifying the stride between array elements of the
+    * top-level shader storage block member. (GL_TOP_LEVEL_ARRAY_STRIDE).
     */
-   bool builtin;
-
-   /**
-    * This is a shader storage buffer variable, not an uniform.
-    */
-   bool is_shader_storage;
+   unsigned top_level_array_stride;
 };
 
 #ifdef __cplusplus
diff --git a/src/glsl/ir_variable_refcount.cpp b/src/glsl/ir_variable_refcount.cpp
index e4d825c454b..790627bd1e3 100644
--- a/src/glsl/ir_variable_refcount.cpp
+++ b/src/glsl/ir_variable_refcount.cpp
@@ -46,6 +46,15 @@ static void
 free_entry(struct hash_entry *entry)
 {
    ir_variable_refcount_entry *ivre = (ir_variable_refcount_entry *) entry->data;
+
+   /* Free assignment list */
+   exec_node *n;
+   while ((n = ivre->assign_list.pop_head()) != NULL) {
+      struct assignment_entry *assignment_entry =
+         exec_node_data(struct assignment_entry, n, link);
+      free(assignment_entry);
+   }
+
    delete ivre;
 }
 
@@ -59,7 +68,6 @@ ir_variable_refcount_visitor::~ir_variable_refcount_visitor()
 ir_variable_refcount_entry::ir_variable_refcount_entry(ir_variable *var)
 {
    this->var = var;
-   assign = NULL;
    assigned_count = 0;
    declaration = false;
    referenced_count = 0;
@@ -125,8 +133,20 @@ ir_variable_refcount_visitor::visit_leave(ir_assignment *ir)
    entry = this->get_variable_entry(ir->lhs->variable_referenced());
    if (entry) {
       entry->assigned_count++;
-      if (entry->assign == NULL)
-	 entry->assign = ir;
+
+      /* Build a list for dead code optimisation. Don't add assignment if it
+       * was declared out of scope (outside the instruction stream). Also don't
+       * bother adding any more to the list if there are more references than
+       * assignments as this means the variable is used and won't be optimised
+       * out.
+       */
+      assert(entry->referenced_count >= entry->assigned_count);
+      if (entry->referenced_count == entry->assigned_count) {
+         struct assignment_entry *assignment_entry =
+            (struct assignment_entry *)calloc(1, sizeof(*assignment_entry));
+         assignment_entry->assign = ir;
+         entry->assign_list.push_head(&assignment_entry->link);
+      }
    }
 
    return visit_continue;
diff --git a/src/glsl/ir_variable_refcount.h b/src/glsl/ir_variable_refcount.h
index c15e8110d04..5c74c314781 100644
--- a/src/glsl/ir_variable_refcount.h
+++ b/src/glsl/ir_variable_refcount.h
@@ -33,13 +33,24 @@
 #include "ir_visitor.h"
 #include "glsl_types.h"
 
+struct assignment_entry {
+   exec_node link;
+   ir_assignment *assign;
+};
+
 class ir_variable_refcount_entry
 {
 public:
    ir_variable_refcount_entry(ir_variable *var);
 
    ir_variable *var; /* The key: the variable's pointer. */
-   ir_assignment *assign; /* An assignment to the variable, if any */
+
+   /**
+    * List of assignments to the variable, if any.
+    * This is intended to be used for dead code optimisation and may
+    * not be a complete list.
+    */
+   exec_list assign_list;
 
    /** Number of times the variable is referenced, including assignments. */
    unsigned referenced_count;
diff --git a/src/glsl/link_atomics.cpp b/src/glsl/link_atomics.cpp
index 100d03c4e8f..70ef0e1c891 100644
--- a/src/glsl/link_atomics.cpp
+++ b/src/glsl/link_atomics.cpp
@@ -33,7 +33,7 @@ namespace {
     * Atomic counter as seen by the program.
     */
    struct active_atomic_counter {
-      unsigned id;
+      unsigned uniform_loc;
       ir_variable *var;
    };
 
@@ -52,7 +52,7 @@ namespace {
          free(counters);
       }
 
-      void push_back(unsigned id, ir_variable *var)
+      void push_back(unsigned uniform_loc, ir_variable *var)
       {
          active_atomic_counter *new_counters;
 
@@ -66,7 +66,7 @@ namespace {
          }
 
          counters = new_counters;
-         counters[num_counters].id = id;
+         counters[num_counters].uniform_loc = uniform_loc;
          counters[num_counters].var = var;
          num_counters++;
       }
@@ -95,6 +95,50 @@ namespace {
                y->data.atomic.offset < x->data.atomic.offset + x->type->atomic_size()));
    }
 
+   void
+   process_atomic_variable(const glsl_type *t, struct gl_shader_program *prog,
+                           unsigned *uniform_loc, ir_variable *var,
+                           active_atomic_buffer *const buffers,
+                           unsigned *num_buffers, int *offset,
+                           const unsigned shader_stage)
+   {
+      /* FIXME: Arrays of arrays get counted separately. For example:
+       * x1[3][3][2] = 9 counters
+       * x2[3][2]    = 3 counters
+       * x3[2]       = 1 counter
+       *
+       * However this code marks all the counters as active even when they
+       * might not be used.
+       */
+      if (t->is_array() && t->fields.array->is_array()) {
+         for (unsigned i = 0; i < t->length; i++) {
+            process_atomic_variable(t->fields.array, prog, uniform_loc,
+                                    var, buffers, num_buffers, offset,
+                                    shader_stage);
+         }
+      } else {
+         active_atomic_buffer *buf = &buffers[var->data.binding];
+         gl_uniform_storage *const storage =
+            &prog->UniformStorage[*uniform_loc];
+
+         /* If this is the first time the buffer is used, increment
+          * the counter of buffers used.
+          */
+         if (buf->size == 0)
+            (*num_buffers)++;
+
+         buf->push_back(*uniform_loc, var);
+
+         buf->stage_references[shader_stage]++;
+         buf->size = MAX2(buf->size, *offset + t->atomic_size());
+
+         storage->offset = *offset;
+         *offset += t->atomic_size();
+
+         (*uniform_loc)++;
+      }
+   }
+
    active_atomic_buffer *
    find_active_atomic_counters(struct gl_context *ctx,
                                struct gl_shader_program *prog,
@@ -114,23 +158,10 @@ namespace {
             ir_variable *var = node->as_variable();
 
             if (var && var->type->contains_atomic()) {
-               unsigned id = 0;
-               bool found = prog->UniformHash->get(id, var->name);
-               assert(found);
-               (void) found;
-               active_atomic_buffer *buf = &buffers[var->data.binding];
-
-               /* If this is the first time the buffer is used, increment
-                * the counter of buffers used.
-                */
-               if (buf->size == 0)
-                  (*num_buffers)++;
-
-               buf->push_back(id, var);
-
-               buf->stage_references[i]++;
-               buf->size = MAX2(buf->size, var->data.atomic.offset +
-                                var->type->atomic_size());
+               int offset = var->data.atomic.offset;
+               unsigned uniform_loc = var->data.location;
+               process_atomic_variable(var->type, prog, &uniform_loc,
+                                       var, buffers, num_buffers, &offset, i);
             }
          }
       }
@@ -197,10 +228,10 @@ link_assign_atomic_counter_resources(struct gl_context *ctx,
       /* Assign counter-specific fields. */
       for (unsigned j = 0; j < ab.num_counters; j++) {
          ir_variable *const var = ab.counters[j].var;
-         const unsigned id = ab.counters[j].id;
-         gl_uniform_storage *const storage = &prog->UniformStorage[id];
+         gl_uniform_storage *const storage =
+            &prog->UniformStorage[ab.counters[j].uniform_loc];
 
-         mab.Uniforms[j] = id;
+         mab.Uniforms[j] = ab.counters[j].uniform_loc;
          if (!var->data.explicit_binding)
             var->data.binding = i;
 
diff --git a/src/glsl/link_uniform_block_active_visitor.cpp b/src/glsl/link_uniform_block_active_visitor.cpp
index bcf17fef758..422739af063 100644
--- a/src/glsl/link_uniform_block_active_visitor.cpp
+++ b/src/glsl/link_uniform_block_active_visitor.cpp
@@ -71,6 +71,88 @@ process_block(void *mem_ctx, struct hash_table *ht, ir_variable *var)
    return NULL;
 }
 
+/* Records which block-array elements the dereference chain ending at `ir`
+ * uses.  The operand (ir->array) is handled first by recursion; each level
+ * gets a uniform_block_array_elements, allocated on first use and linked via
+ * ->array from block->array.  Returns the next link (&block->array if !ir).
+ *
+ * For arrays of arrays this is a middle ground between detecting inactive
+ * uniform blocks and structuring them so that the offset for indirect
+ * indexing is easy to calculate.  For example given the shader:
+ *
+ *   uniform ArraysOfArraysBlock { vec4 a; } i[3][4][5];
+ *
+ *   void main()
+ *   {
+ *      vec4 b = i[0][1][1].a;
+ *      gl_Position = i[2][2][3].a + b;
+ *   }
+ *
+ * There are only 2 active blocks above, but for the sake of indirect indexing
+ * and not overcomplicating the code we end up with a count of 8: each
+ * dimension has 2 different indices counted, giving 2*2*2.
+ */
+struct uniform_block_array_elements **
+process_arrays(void *mem_ctx, ir_dereference_array *ir,
+               struct link_uniform_block_active *block)
+{
+   if (ir) {
+      struct uniform_block_array_elements **ub_array_ptr =
+         process_arrays(mem_ctx, ir->array->as_dereference_array(), block);
+      if (*ub_array_ptr == NULL) { /* first use of this dimension */
+         *ub_array_ptr = rzalloc(mem_ctx, struct uniform_block_array_elements);
+         (*ub_array_ptr)->ir = ir;
+      }
+      /* Record the index (or indices) this dereference uses at this level. */
+      struct uniform_block_array_elements *ub_array = *ub_array_ptr;
+      ir_constant *c = ir->array_index->as_constant();
+      if (c) {
+         /* Index is a constant, so mark just that element used, if not
+          * already: scan the recorded indices for idx first.
+          */
+         const unsigned idx = c->get_uint_component(0);
+
+         unsigned i;
+         for (i = 0; i < ub_array->num_array_elements; i++) {
+            if (ub_array->array_elements[i] == idx)
+               break;
+         }
+         /* The scan leaves i at num_array_elements when idx is new. */
+         assert(i <= ub_array->num_array_elements);
+
+         if (i == ub_array->num_array_elements) {
+            ub_array->array_elements = reralloc(mem_ctx,
+                                                ub_array->array_elements,
+                                                unsigned,
+                                                ub_array->num_array_elements + 1);
+            /* Append idx in the slot the list was just grown by. */
+            ub_array->array_elements[ub_array->num_array_elements] = idx;
+
+            ub_array->num_array_elements++;
+         }
+      } else {
+         /* The array index is not a constant, so mark the entire array
+          * used; the list is rebuilt as 0..length-1 unless already that long.
+          */
+         assert(ir->array->type->is_array());
+         if (ub_array->num_array_elements < ir->array->type->length) {
+            ub_array->num_array_elements = ir->array->type->length;
+            ub_array->array_elements = reralloc(mem_ctx,
+                                                ub_array->array_elements,
+                                                unsigned,
+                                                ub_array->num_array_elements);
+
+            for (unsigned i = 0; i < ub_array->num_array_elements; i++) {
+               ub_array->array_elements[i] = i;
+            }
+         }
+      }
+      return &ub_array->array;
+   } else {
+      return &block->array;
+   }
+}
+
 ir_visitor_status
 link_uniform_block_active_visitor::visit(ir_variable *var)
 {
@@ -101,24 +183,30 @@ link_uniform_block_active_visitor::visit(ir_variable *var)
       return visit_stop;
    }
 
-   assert(b->num_array_elements == 0);
-   assert(b->array_elements == NULL);
+   assert(b->array == NULL);
    assert(b->type != NULL);
    assert(!b->type->is_array() || b->has_instance_name);
 
    /* For uniform block arrays declared with a shared or std140 layout
     * qualifier, mark all its instances as used.
     */
-   if (b->type->is_array() && b->type->length > 0) {
-      b->num_array_elements = b->type->length;
-      b->array_elements = reralloc(this->mem_ctx,
-                                   b->array_elements,
-                                   unsigned,
-                                   b->num_array_elements);
-
-      for (unsigned i = 0; i < b->num_array_elements; i++) {
-         b->array_elements[i] = i;
+   const glsl_type *type = b->type;
+   struct uniform_block_array_elements **ub_array = &b->array;
+   while (type->is_array()) {
+      assert(type->length > 0); /* check the dimension being walked */
+      /* One tracking level per dimension, with every element marked used. */
+      *ub_array = rzalloc(this->mem_ctx, struct uniform_block_array_elements);
+      (*ub_array)->num_array_elements = type->length;
+      (*ub_array)->array_elements = reralloc(this->mem_ctx,
+                                             (*ub_array)->array_elements,
+                                             unsigned,
+                                             (*ub_array)->num_array_elements);
+
+      for (unsigned i = 0; i < (*ub_array)->num_array_elements; i++) {
+         (*ub_array)->array_elements[i] = i;
       }
+      ub_array = &(*ub_array)->array;
+      type = type->fields.array;
    }
 
    return visit_continue;
@@ -127,7 +215,13 @@ link_uniform_block_active_visitor::visit(ir_variable *var)
 ir_visitor_status
 link_uniform_block_active_visitor::visit_enter(ir_dereference_array *ir)
 {
-   ir_dereference_variable *const d = ir->array->as_dereference_variable();
+   /* cycle through arrays of arrays */
+   ir_dereference_array *base_ir = ir;
+   while (base_ir->array->ir_type == ir_type_dereference_array)
+      base_ir = base_ir->array->as_dereference_array();
+
+   ir_dereference_variable *const d =
+      base_ir->array->as_dereference_variable();
    ir_variable *const var = (d == NULL) ? NULL : d->var;
 
    /* If the r-value being dereferenced is not a variable (e.g., a field of a
@@ -158,55 +252,16 @@ link_uniform_block_active_visitor::visit_enter(ir_dereference_array *ir)
    /* Block arrays must be declared with an instance name.
     */
    assert(b->has_instance_name);
-   assert((b->num_array_elements == 0) == (b->array_elements == NULL));
    assert(b->type != NULL);
 
    /* If the block array was declared with a shared or
     * std140 layout qualifier, all its instances have been already marked
     * as used in link_uniform_block_active_visitor::visit(ir_variable *).
     */
-   if (var->get_interface_type()->interface_packing !=
-       GLSL_INTERFACE_PACKING_PACKED)
-      return visit_continue_with_parent;
-
-   ir_constant *c = ir->array_index->as_constant();
-
-   if (c) {
-      /* Index is a constant, so mark just that element used, if not already */
-      const unsigned idx = c->get_uint_component(0);
-
-      unsigned i;
-      for (i = 0; i < b->num_array_elements; i++) {
-         if (b->array_elements[i] == idx)
-            break;
-      }
-
-      assert(i <= b->num_array_elements);
-
-      if (i == b->num_array_elements) {
-         b->array_elements = reralloc(this->mem_ctx,
-                                      b->array_elements,
-                                      unsigned,
-                                      b->num_array_elements + 1);
-
-         b->array_elements[b->num_array_elements] = idx;
-
-         b->num_array_elements++;
-      }
-   } else {
-      /* The array index is not a constant, so mark the entire array used. */
-      assert(b->type->is_array());
-      if (b->num_array_elements < b->type->length) {
-         b->num_array_elements = b->type->length;
-         b->array_elements = reralloc(this->mem_ctx,
-                                      b->array_elements,
-                                      unsigned,
-                                      b->num_array_elements);
-
-         for (unsigned i = 0; i < b->num_array_elements; i++) {
-            b->array_elements[i] = i;
-         }
-      }
+   if (var->get_interface_type()->interface_packing ==
+       GLSL_INTERFACE_PACKING_PACKED) {
+      b->var = var;
+      process_arrays(this->mem_ctx, ir, b);
    }
 
    return visit_continue_with_parent;
@@ -234,8 +289,7 @@ link_uniform_block_active_visitor::visit(ir_dereference_variable *ir)
       return visit_stop;
    }
 
-   assert(b->num_array_elements == 0);
-   assert(b->array_elements == NULL);
+   assert(b->array == NULL);
    assert(b->type != NULL);
 
    return visit_continue;
diff --git a/src/glsl/link_uniform_block_active_visitor.h b/src/glsl/link_uniform_block_active_visitor.h
index b663a884db4..afb52c14a37 100644
--- a/src/glsl/link_uniform_block_active_visitor.h
+++ b/src/glsl/link_uniform_block_active_visitor.h
@@ -28,11 +28,20 @@
 #include "ir.h"
 #include "util/hash_table.h"
 
+struct uniform_block_array_elements {
+   /* Usage tracking for one dimension of a (possibly nested) block array. */
+   unsigned *array_elements;     /* indices of this dimension marked used */
+   unsigned num_array_elements;  /* number of entries in array_elements */
+   ir_dereference_array *ir;     /* set by process_arrays() on allocation */
+   /* Tracking for the next array dimension; NULL ends the chain. */
+   struct uniform_block_array_elements *array;
+};
+
 struct link_uniform_block_active {
    const glsl_type *type;
+   ir_variable *var;
 
-   unsigned *array_elements;
-   unsigned num_array_elements;
+   struct uniform_block_array_elements *array;
 
    unsigned binding;
 
diff --git a/src/glsl/link_uniform_blocks.cpp b/src/glsl/link_uniform_blocks.cpp
index 7ceffee799e..5285d8d01e4 100644
--- a/src/glsl/link_uniform_blocks.cpp
+++ b/src/glsl/link_uniform_blocks.cpp
@@ -116,7 +116,7 @@ private:
          char *open_bracket = strchr(v->IndexName, '[');
          assert(open_bracket != NULL);
 
-         char *close_bracket = strchr(open_bracket, ']');
+         char *close_bracket = strchr(open_bracket, '.') - 1;
          assert(close_bracket != NULL);
 
          /* Length of the tail without the ']' but with the NUL.
@@ -185,6 +185,91 @@ struct block {
    bool has_instance_name;
 };
 
+/* Creates one gl_uniform_block for every used element of a block array.
+ * Each ub_array level appends its "[index]" subscripts to *name and recurses
+ * into the next dimension; once ub_array is NULL the name is complete, so
+ * blocks[*block_index] is filled in and both counters are advanced by one.
+ */
+static void
+process_block_array(struct uniform_block_array_elements *ub_array, char **name,
+                    size_t name_length, gl_uniform_block *blocks,
+                    ubo_visitor *parcel, gl_uniform_buffer_variable *variables,
+                    const struct link_uniform_block_active *const b,
+                    unsigned *block_index, unsigned *binding_offset,
+                    struct gl_context *ctx, struct gl_shader_program *prog)
+{
+   if (ub_array) {
+      for (unsigned j = 0; j < ub_array->num_array_elements; j++) {
+         size_t new_length = name_length;
+         /* Append the subscript to the current variable name */
+         ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]",
+                                      ub_array->array_elements[j]);
+
+         process_block_array(ub_array->array, name, new_length, blocks,
+                             parcel, variables, b, block_index,
+                             binding_offset, ctx, prog);
+      }
+   } else {
+      unsigned i = *block_index;
+      const glsl_type *type = b->type->without_array();
+
+      blocks[i].Name = ralloc_strdup(blocks, *name);
+      blocks[i].Uniforms = &variables[parcel->index];
+
+      /* The GL_ARB_shading_language_420pack spec says:
+       *
+       *     "If the binding identifier is used with a uniform block
+       *     instanced as an array then the first element of the array
+       *     takes the specified block binding and each subsequent
+       *     element takes the next consecutive uniform block binding
+       *     point."
+       */
+      blocks[i].Binding = (b->has_binding) ? b->binding + *binding_offset : 0;
+      blocks[i].UniformBufferSize = 0;
+      blocks[i]._Packing = gl_uniform_block_packing(type->interface_packing);
+      parcel->process(type, blocks[i].Name);
+      blocks[i].UniformBufferSize = parcel->buffer_size;
+
+      /* Check SSBO size is lower than maximum supported size for SSBO */
+      if (b->is_shader_storage &&
+          parcel->buffer_size > ctx->Const.MaxShaderStorageBlockSize) {
+         linker_error(prog, "shader storage block `%s' has size %d, "
+                      "which is larger than the maximum allowed (%d)",
+                      b->type->name,
+                      parcel->buffer_size,
+                      ctx->Const.MaxShaderStorageBlockSize);
+      }
+      blocks[i].NumUniforms =
+         (unsigned)(ptrdiff_t)(&variables[parcel->index] - blocks[i].Uniforms);
+      blocks[i].IsShaderStorage = b->is_shader_storage;
+      *block_index = *block_index + 1;
+      *binding_offset = *binding_offset + 1;
+   }
+}
+
+/* Rebuilds an array-of-blocks type so that every dimension's length is the
+ * num_array_elements recorded in ub_array; later passes use the new sizes to
+ * compute offsets for indirect indexing.  Non-array types come back as-is. */
+const glsl_type *
+resize_block_array(const glsl_type *type,
+                   struct uniform_block_array_elements *ub_array)
+{
+   if (type->is_array()) {
+      struct uniform_block_array_elements *child_array =
+         type->fields.array->is_array() ? ub_array->array : NULL;
+      const glsl_type *new_child_type =
+         resize_block_array(type->fields.array, child_array);
+      /* Resize this level, then retype the dereference recorded for it. */
+      const glsl_type *new_type =
+         glsl_type::get_array_instance(new_child_type,
+                                       ub_array->num_array_elements);
+      ub_array->ir->array->type = new_type;
+      return new_type;
+   } else {
+      return type;
+   }
+}
+
 unsigned
 link_uniform_blocks(void *mem_ctx,
                     struct gl_context *ctx,
@@ -223,21 +308,25 @@ link_uniform_blocks(void *mem_ctx,
    struct hash_entry *entry;
 
    hash_table_foreach (block_hash, entry) {
-      const struct link_uniform_block_active *const b =
-         (const struct link_uniform_block_active *) entry->data;
+      struct link_uniform_block_active *const b =
+         (struct link_uniform_block_active *) entry->data;
 
-      const glsl_type *const block_type =
-         b->type->is_array() ? b->type->fields.array : b->type;
+      assert((b->array != NULL) == b->type->is_array());
 
-      assert((b->num_array_elements > 0) == b->type->is_array());
+      if (b->array != NULL &&
+          (b->type->without_array()->interface_packing ==
+           GLSL_INTERFACE_PACKING_PACKED)) {
+         b->type = resize_block_array(b->type, b->array);
+         b->var->type = b->type;
+      }
 
       block_size.num_active_uniforms = 0;
-      block_size.process(block_type, "");
+      block_size.process(b->type->without_array(), "");
 
-      if (b->num_array_elements > 0) {
-         num_blocks += b->num_array_elements;
-         num_variables += b->num_array_elements
-            * block_size.num_active_uniforms;
+      if (b->array != NULL) {
+         unsigned aoa_size = b->type->arrays_of_arrays_size();
+         num_blocks += aoa_size;
+         num_variables += aoa_size * block_size.num_active_uniforms;
       } else {
          num_blocks++;
          num_variables += block_size.num_active_uniforms;
@@ -281,50 +370,15 @@ link_uniform_blocks(void *mem_ctx,
          (const struct link_uniform_block_active *) entry->data;
       const glsl_type *block_type = b->type;
 
-      if (b->num_array_elements > 0) {
-         const char *const name = block_type->fields.array->name;
+      if (b->array != NULL) {
+         unsigned binding_offset = 0;
+         char *name = ralloc_strdup(NULL, block_type->without_array()->name);
+         size_t name_length = strlen(name);
 
          assert(b->has_instance_name);
-         for (unsigned j = 0; j < b->num_array_elements; j++) {
-            blocks[i].Name = ralloc_asprintf(blocks, "%s[%u]", name,
-                                             b->array_elements[j]);
-            blocks[i].Uniforms = &variables[parcel.index];
-
-            /* The GL_ARB_shading_language_420pack spec says:
-             *
-             *     "If the binding identifier is used with a uniform block
-             *     instanced as an array then the first element of the array
-             *     takes the specified block binding and each subsequent
-             *     element takes the next consecutive uniform block binding
-             *     point."
-             */
-            blocks[i].Binding = (b->has_binding) ? b->binding + j : 0;
-
-            blocks[i].UniformBufferSize = 0;
-            blocks[i]._Packing =
-               gl_uniform_block_packing(block_type->interface_packing);
-
-            parcel.process(block_type->fields.array,
-                           blocks[i].Name);
-
-            blocks[i].UniformBufferSize = parcel.buffer_size;
-
-            /* Check SSBO size is lower than maximum supported size for SSBO */
-            if (b->is_shader_storage &&
-                parcel.buffer_size > ctx->Const.MaxShaderStorageBlockSize) {
-               linker_error(prog, "shader storage block `%s' has size %d, "
-                            "which is larger than than the maximum allowed (%d)",
-                            block_type->name,
-                            parcel.buffer_size,
-                            ctx->Const.MaxShaderStorageBlockSize);
-            }
-            blocks[i].NumUniforms =
-               (unsigned)(ptrdiff_t)(&variables[parcel.index] - blocks[i].Uniforms);
-
-            blocks[i].IsShaderStorage = b->is_shader_storage;
-
-            i++;
-         }
+         process_block_array(b->array, &name, name_length, blocks, &parcel,
+                             variables, b, &i, &binding_offset, ctx, prog);
+         ralloc_free(name);
       } else {
          blocks[i].Name = ralloc_strdup(blocks, block_type->name);
          blocks[i].Uniforms = &variables[parcel.index];
diff --git a/src/glsl/link_uniform_initializers.cpp b/src/glsl/link_uniform_initializers.cpp
index e9e108a2765..35b9f9c6017 100644
--- a/src/glsl/link_uniform_initializers.cpp
+++ b/src/glsl/link_uniform_initializers.cpp
@@ -49,7 +49,7 @@ get_uniform_block_index(const gl_shader_program *shProg,
                         const char *uniformBlockName)
 {
    for (unsigned i = 0; i < shProg->NumBufferInterfaceBlocks; i++) {
-      if (!strcmp(shProg->UniformBlocks[i].Name, uniformBlockName))
+      if (!strcmp(shProg->BufferInterfaceBlocks[i].Name, uniformBlockName))
 	 return i;
    }
 
@@ -107,51 +107,64 @@ copy_constant_to_storage(union gl_constant_value *storage,
  * they have no storage and should be handled elsewhere.
  */
 void
-set_opaque_binding(gl_shader_program *prog, const char *name, int binding)
+set_opaque_binding(void *mem_ctx, gl_shader_program *prog,
+                   const glsl_type *type, const char *name, int *binding)
 {
-   struct gl_uniform_storage *const storage =
-      get_storage(prog->UniformStorage, prog->NumUniformStorage, name);
 
-   if (storage == NULL) {
-      assert(storage != NULL);
-      return;
-   }
+   if (type->is_array() && type->fields.array->is_array()) {
+      const glsl_type *const element_type = type->fields.array;
 
-   const unsigned elements = MAX2(storage->array_elements, 1);
+      for (unsigned int i = 0; i < type->length; i++) {
+	 const char *element_name = ralloc_asprintf(mem_ctx, "%s[%d]", name, i);
 
-   /* Section 4.4.4 (Opaque-Uniform Layout Qualifiers) of the GLSL 4.20 spec
-    * says:
-    *
-    *     "If the binding identifier is used with an array, the first element
-    *     of the array takes the specified unit and each subsequent element
-    *     takes the next consecutive unit."
-    */
-   for (unsigned int i = 0; i < elements; i++) {
-      storage->storage[i].i = binding + i;
-   }
+	 set_opaque_binding(mem_ctx, prog, element_type,
+                            element_name, binding);
+      }
+   } else {
+      struct gl_uniform_storage *const storage =
+         get_storage(prog->UniformStorage, prog->NumUniformStorage, name);
 
-   for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) {
-      gl_shader *shader = prog->_LinkedShaders[sh];
+      if (storage == NULL) {
+         assert(storage != NULL);
+         return;
+      }
 
-      if (shader) {
-         if (storage->type->base_type == GLSL_TYPE_SAMPLER &&
-             storage->opaque[sh].active) {
-            for (unsigned i = 0; i < elements; i++) {
-               const unsigned index = storage->opaque[sh].index + i;
-               shader->SamplerUnits[index] = storage->storage[i].i;
-            }
+      const unsigned elements = MAX2(storage->array_elements, 1);
+
+      /* Section 4.4.4 (Opaque-Uniform Layout Qualifiers) of the GLSL 4.20 spec
+       * says:
+       *
+       *     "If the binding identifier is used with an array, the first element
+       *     of the array takes the specified unit and each subsequent element
+       *     takes the next consecutive unit."
+       */
+      for (unsigned int i = 0; i < elements; i++) {
+         storage->storage[i].i = (*binding)++;
+      }
+
+      for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) {
+        gl_shader *shader = prog->_LinkedShaders[sh];
 
-         } else if (storage->type->base_type == GLSL_TYPE_IMAGE &&
+         if (shader) {
+            if (storage->type->base_type == GLSL_TYPE_SAMPLER &&
+                storage->opaque[sh].active) {
+               for (unsigned i = 0; i < elements; i++) {
+                  const unsigned index = storage->opaque[sh].index + i;
+                  shader->SamplerUnits[index] = storage->storage[i].i;
+               }
+
+            } else if (storage->type->base_type == GLSL_TYPE_IMAGE &&
                     storage->opaque[sh].active) {
-            for (unsigned i = 0; i < elements; i++) {
-               const unsigned index = storage->opaque[sh].index + i;
-               shader->ImageUnits[index] = storage->storage[i].i;
+               for (unsigned i = 0; i < elements; i++) {
+                  const unsigned index = storage->opaque[sh].index + i;
+                  shader->ImageUnits[index] = storage->storage[i].i;
+               }
             }
          }
       }
-   }
 
-   storage->initialized = true;
+      storage->initialized = true;
+   }
 }
 
 void
@@ -170,7 +183,7 @@ set_block_binding(gl_shader_program *prog, const char *block_name, int binding)
 
          if (stage_index != -1) {
             struct gl_shader *sh = prog->_LinkedShaders[i];
-            sh->UniformBlocks[stage_index].Binding = binding;
+            sh->BufferInterfaceBlocks[stage_index].Binding = binding;
          }
       }
 }
@@ -180,6 +193,7 @@ set_uniform_initializer(void *mem_ctx, gl_shader_program *prog,
 			const char *name, const glsl_type *type,
                         ir_constant *val, unsigned int boolean_true)
 {
+   const glsl_type *t_without_array = type->without_array();
    if (type->is_record()) {
       ir_constant *field_constant;
 
@@ -194,7 +208,8 @@ set_uniform_initializer(void *mem_ctx, gl_shader_program *prog,
 	 field_constant = (ir_constant *)field_constant->next;
       }
       return;
-   } else if (type->is_array() && type->fields.array->is_record()) {
+   } else if (t_without_array->is_record() ||
+              (type->is_array() && type->fields.array->is_array())) {
       const glsl_type *const element_type = type->fields.array;
 
       for (unsigned int i = 0; i < type->length; i++) {
@@ -284,7 +299,9 @@ link_set_uniform_initializers(struct gl_shader_program *prog,
 
             if (type->without_array()->is_sampler() ||
                 type->without_array()->is_image()) {
-               linker::set_opaque_binding(prog, var->name, var->data.binding);
+               int binding = var->data.binding;
+               linker::set_opaque_binding(mem_ctx, prog, var->type,
+                                          var->name, &binding);
             } else if (var->is_in_buffer_block()) {
                const glsl_type *const iface_type = var->get_interface_type();
 
@@ -327,9 +344,9 @@ link_set_uniform_initializers(struct gl_shader_program *prog,
             } else {
                assert(!"Explicit binding not on a sampler, UBO or atomic.");
             }
-         } else if (var->constant_value) {
+         } else if (var->constant_initializer) {
             linker::set_uniform_initializer(mem_ctx, prog, var->name,
-                                            var->type, var->constant_value,
+                                            var->type, var->constant_initializer,
                                             boolean_true);
          }
       }
diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp
index 0ccd9c8c865..fe00aa30d07 100644
--- a/src/glsl/link_uniforms.cpp
+++ b/src/glsl/link_uniforms.cpp
@@ -149,7 +149,8 @@ program_resource_visitor::process(ir_variable *var)
       recursion(var->type, &name, strlen(name), row_major, NULL, packing,
                 false, record_array_count);
       ralloc_free(name);
-   } else if (t->without_array()->is_record()) {
+   } else if (t_without_array->is_record() ||
+              (t->is_array() && t->fields.array->is_array())) {
       char *name = ralloc_strdup(NULL, var->name);
       recursion(var->type, &name, strlen(name), row_major, NULL, packing,
                 false, record_array_count);
@@ -160,6 +161,7 @@ program_resource_visitor::process(ir_variable *var)
                 false, record_array_count);
       ralloc_free(name);
    } else {
+      this->set_record_array_count(record_array_count);
       this->visit_field(t, var->name, row_major, NULL, packing, false);
    }
 }
@@ -231,7 +233,8 @@ program_resource_visitor::recursion(const glsl_type *t, char **name,
          this->leave_record(t, *name, row_major, packing);
       }
    } else if (t->without_array()->is_record() ||
-              t->without_array()->is_interface()) {
+              t->without_array()->is_interface() ||
+              (t->is_array() && t->fields.array->is_array())) {
       if (record_type == NULL && t->fields.array->is_record())
          record_type = t->fields.array;
 
@@ -387,6 +390,7 @@ private:
    {
       assert(!type->without_array()->is_record());
       assert(!type->without_array()->is_interface());
+      assert(!(type->is_array() && type->fields.array->is_array()));
 
       (void) row_major;
 
@@ -502,9 +506,9 @@ public:
 
             for (unsigned i = 0; i < prog->NumBufferInterfaceBlocks; i++) {
                if (strncmp(var->get_interface_type()->name,
-                           prog->UniformBlocks[i].Name,
+                           prog->BufferInterfaceBlocks[i].Name,
                            l) == 0
-                   && prog->UniformBlocks[i].Name[l] == '[') {
+                   && prog->BufferInterfaceBlocks[i].Name[l] == '[') {
                   ubo_block_index = i;
                   break;
                }
@@ -512,7 +516,7 @@ public:
          } else {
             for (unsigned i = 0; i < prog->NumBufferInterfaceBlocks; i++) {
                if (strcmp(var->get_interface_type()->name,
-                          prog->UniformBlocks[i].Name) == 0) {
+                          prog->BufferInterfaceBlocks[i].Name) == 0) {
                   ubo_block_index = i;
                   break;
                }
@@ -530,7 +534,7 @@ public:
             ubo_byte_offset = 0;
          } else {
             const struct gl_uniform_block *const block =
-               &prog->UniformBlocks[ubo_block_index];
+               &prog->BufferInterfaceBlocks[ubo_block_index];
 
             assert(var->data.location != -1);
 
@@ -712,6 +716,7 @@ private:
    {
       assert(!type->without_array()->is_record());
       assert(!type->without_array()->is_interface());
+      assert(!(type->is_array() && type->fields.array->is_array()));
 
       unsigned id;
       bool found = this->map->get(id, name);
@@ -804,10 +809,11 @@ private:
          if (type->is_array()) {
             if (packing == GLSL_INTERFACE_PACKING_STD430)
                this->uniforms[id].array_stride =
-                  type->fields.array->std430_array_stride(row_major);
+                  type->without_array()->std430_array_stride(row_major);
             else
                this->uniforms[id].array_stride =
-                  glsl_align(type->fields.array->std140_size(row_major), 16);
+                  glsl_align(type->without_array()->std140_size(row_major),
+                             16);
 	 } else {
 	    this->uniforms[id].array_stride = 0;
 	 }
@@ -966,15 +972,16 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
 
       if (var->type->is_record()) {
          sentinel = '.';
-      } else if (var->type->without_array()->is_record()) {
+      } else if (var->type->is_array() && (var->type->fields.array->is_array()
+                 || var->type->without_array()->is_record())) {
          sentinel = '[';
       }
 
       const unsigned l = strlen(var->name);
-      for (unsigned i = 0; i < shader->NumUniformBlocks; i++) {
-	 for (unsigned j = 0; j < shader->UniformBlocks[i].NumUniforms; j++) {
+      for (unsigned i = 0; i < shader->NumBufferInterfaceBlocks; i++) {
+	 for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i].NumUniforms; j++) {
             if (sentinel) {
-               const char *begin = shader->UniformBlocks[i].Uniforms[j].Name;
+               const char *begin = shader->BufferInterfaceBlocks[i].Uniforms[j].Name;
                const char *end = strchr(begin, sentinel);
 
                if (end == NULL)
@@ -989,7 +996,7 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
                   break;
                }
             } else if (!strcmp(var->name,
-                               shader->UniformBlocks[i].Uniforms[j].Name)) {
+                               shader->BufferInterfaceBlocks[i].Uniforms[j].Name)) {
 	       found = true;
 	       var->data.location = j;
 	       break;
@@ -1115,10 +1122,10 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
       sh->num_uniform_components = uniform_size.num_shader_uniform_components;
       sh->num_combined_uniform_components = sh->num_uniform_components;
 
-      for (unsigned i = 0; i < sh->NumUniformBlocks; i++) {
-         if (!sh->UniformBlocks[i].IsShaderStorage) {
+      for (unsigned i = 0; i < sh->NumBufferInterfaceBlocks; i++) {
+         if (!sh->BufferInterfaceBlocks[i].IsShaderStorage) {
 	    sh->num_combined_uniform_components +=
-	       sh->UniformBlocks[i].UniformBufferSize / 4;
+	       sh->BufferInterfaceBlocks[i].UniformBufferSize / 4;
          }
       }
    }
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index a97b4ef0a32..25ca928aa43 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -65,6 +65,7 @@
  */
 
 #include <ctype.h>
+#include "util/strndup.h"
 #include "main/core.h"
 #include "glsl_symbol_table.h"
 #include "glsl_parser_extras.h"
@@ -1161,7 +1162,7 @@ cross_validate_uniforms(struct gl_shader_program *prog)
 }
 
 /**
- * Accumulates the array of prog->UniformBlocks and checks that all
+ * Accumulates the array of prog->BufferInterfaceBlocks and checks that all
  * definitons of blocks agree on their contents.
  */
 static bool
@@ -1170,7 +1171,7 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog)
    unsigned max_num_uniform_blocks = 0;
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
       if (prog->_LinkedShaders[i])
-	 max_num_uniform_blocks += prog->_LinkedShaders[i]->NumUniformBlocks;
+	 max_num_uniform_blocks += prog->_LinkedShaders[i]->NumBufferInterfaceBlocks;
    }
 
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
@@ -1184,15 +1185,15 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog)
       if (sh == NULL)
 	 continue;
 
-      for (unsigned int j = 0; j < sh->NumUniformBlocks; j++) {
+      for (unsigned int j = 0; j < sh->NumBufferInterfaceBlocks; j++) {
 	 int index = link_cross_validate_uniform_block(prog,
-						       &prog->UniformBlocks,
+						       &prog->BufferInterfaceBlocks,
 						       &prog->NumBufferInterfaceBlocks,
-						       &sh->UniformBlocks[j]);
+						       &sh->BufferInterfaceBlocks[j]);
 
 	 if (index == -1) {
 	    linker_error(prog, "uniform block `%s' has mismatching definitions\n",
-			 sh->UniformBlocks[j].Name);
+			 sh->BufferInterfaceBlocks[j].Name);
 	    return false;
 	 }
 
@@ -1386,8 +1387,10 @@ public:
 
    virtual ir_visitor_status visit(ir_variable *var)
    {
+      const glsl_type *type_without_array;
       fixup_type(&var->type, var->data.max_array_access,
                  var->data.from_ssbo_unsized_array);
+      type_without_array = var->type->without_array();
       if (var->type->is_interface()) {
          if (interface_contains_unsized_arrays(var->type)) {
             const glsl_type *new_type =
@@ -1397,11 +1400,10 @@ public:
             var->type = new_type;
             var->change_interface_type(new_type);
          }
-      } else if (var->type->is_array() &&
-                 var->type->fields.array->is_interface()) {
-         if (interface_contains_unsized_arrays(var->type->fields.array)) {
+      } else if (type_without_array->is_interface()) {
+         if (interface_contains_unsized_arrays(type_without_array)) {
             const glsl_type *new_type =
-               resize_interface_members(var->type->fields.array,
+               resize_interface_members(type_without_array,
                                         var->get_max_ifc_array_access(),
                                         var->is_in_shader_storage_block());
             var->change_interface_type(new_type);
@@ -2064,9 +2066,9 @@ link_intrastage_shaders(void *mem_ctx,
    linked->ir = new(linked) exec_list;
    clone_ir_list(mem_ctx, linked->ir, main->ir);
 
-   linked->UniformBlocks = uniform_blocks;
-   linked->NumUniformBlocks = num_uniform_blocks;
-   ralloc_steal(linked, linked->UniformBlocks);
+   linked->BufferInterfaceBlocks = uniform_blocks;
+   linked->NumBufferInterfaceBlocks = num_uniform_blocks;
+   ralloc_steal(linked, linked->BufferInterfaceBlocks);
 
    link_fs_input_layout_qualifiers(prog, linked, shader_list, num_shaders);
    link_tcs_out_layout_qualifiers(prog, linked, shader_list, num_shaders);
@@ -2804,19 +2806,19 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog)
 
    for (unsigned i = 0; i < prog->NumBufferInterfaceBlocks; i++) {
       /* Don't check SSBOs for Uniform Block Size */
-      if (!prog->UniformBlocks[i].IsShaderStorage &&
-          prog->UniformBlocks[i].UniformBufferSize > ctx->Const.MaxUniformBlockSize) {
+      if (!prog->BufferInterfaceBlocks[i].IsShaderStorage &&
+          prog->BufferInterfaceBlocks[i].UniformBufferSize > ctx->Const.MaxUniformBlockSize) {
          linker_error(prog, "Uniform block %s too big (%d/%d)\n",
-                      prog->UniformBlocks[i].Name,
-                      prog->UniformBlocks[i].UniformBufferSize,
+                      prog->BufferInterfaceBlocks[i].Name,
+                      prog->BufferInterfaceBlocks[i].UniformBufferSize,
                       ctx->Const.MaxUniformBlockSize);
       }
 
-      if (prog->UniformBlocks[i].IsShaderStorage &&
-          prog->UniformBlocks[i].UniformBufferSize > ctx->Const.MaxShaderStorageBlockSize) {
+      if (prog->BufferInterfaceBlocks[i].IsShaderStorage &&
+          prog->BufferInterfaceBlocks[i].UniformBufferSize > ctx->Const.MaxShaderStorageBlockSize) {
          linker_error(prog, "Shader storage block %s too big (%d/%d)\n",
-                      prog->UniformBlocks[i].Name,
-                      prog->UniformBlocks[i].UniformBufferSize,
+                      prog->BufferInterfaceBlocks[i].Name,
+                      prog->BufferInterfaceBlocks[i].UniformBufferSize,
                       ctx->Const.MaxShaderStorageBlockSize);
       }
 
@@ -2824,7 +2826,7 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog)
 	 if (prog->UniformBlockStageIndex[j][i] != -1) {
             struct gl_shader *sh = prog->_LinkedShaders[j];
             int stage_index = prog->UniformBlockStageIndex[j][i];
-            if (sh && sh->UniformBlocks[stage_index].IsShaderStorage) {
+            if (sh && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage) {
                shader_blocks[j]++;
                total_shader_storage_blocks++;
             } else {
@@ -2941,7 +2943,7 @@ check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog)
 
          for (unsigned j = 0; j < prog->NumBufferInterfaceBlocks; j++) {
             int stage_index = prog->UniformBlockStageIndex[i][j];
-            if (stage_index != -1 && sh->UniformBlocks[stage_index].IsShaderStorage)
+            if (stage_index != -1 && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage)
                total_shader_storage_blocks++;
          }
 
@@ -3147,7 +3149,7 @@ should_add_buffer_variable(struct gl_shader_program *shProg,
       return true;
 
    for (unsigned i = 0; i < shProg->NumBufferInterfaceBlocks; i++) {
-      block_name = shProg->UniformBlocks[i].Name;
+      block_name = shProg->BufferInterfaceBlocks[i].Name;
       if (strncmp(block_name, name, strlen(block_name)) == 0) {
          found_interface = true;
          break;
@@ -3389,6 +3391,242 @@ add_packed_varyings(struct gl_shader_program *shProg, int stage)
    return true;
 }
 
+/**
+ * Extracts the leading identifier of a buffer variable or block name.
+ *
+ * The result is everything before the first '.' or '[' in \p name, or the
+ * whole string when it contains neither:
+ *
+ *    "foo"        -> "foo"
+ *    "foo.bar[2]" -> "foo"
+ *    "foo[3].bar" -> "foo"
+ *
+ * \param name  NUL-terminated name to examine; it is not modified.
+ *
+ * \return a copy made with strndup(), which the caller must free(); NULL
+ *         if that allocation fails.
+ */
+static char*
+get_top_level_name(const char *name)
+{
+   /* strcspn() measures the initial run of characters that are neither a
+    * member-access dot nor an array-subscript bracket.  When neither occurs
+    * it runs to the end of the string, which is the case of a variable
+    * that already sits at the top level.
+    */
+   const size_t top_level_len = strcspn(name, ".[");
+
+   return strndup(name, top_level_len);
+}
+
+/* Returns a heap-allocated copy of the part of \p name that follows its
+ * first '.', or of all of \p name when it has no '.'.  Free with free().
+ */
+static char*
+get_var_name(const char *name)
+{
+   const char *dot = strchr(name, '.');
+
+   return strdup(dot != NULL ? dot + 1 : name);
+}
+
+/**
+ * Tells whether \p name denotes a direct (top-level) member of a shader
+ * storage block.
+ *
+ * Two spellings are accepted:
+ *
+ *  - "<interface_name>.<field_name>", used when the block has an instance
+ *    name, and
+ *  - "<field_name>" alone, used when the block has no instance name.
+ *
+ * A name with anything after the field (further subscripts or member
+ * accesses) is not a top-level member and yields false.
+ *
+ * \param name            full name of the buffer variable to classify
+ * \param interface_name  name of the interface block
+ * \param field_name      name of the candidate top-level block member
+ *
+ * \return true if \p name matches either spelling exactly, false
+ *         otherwise.
+ */
+static bool
+is_top_level_shader_storage_block_member(const char* name,
+                                         const char* interface_name,
+                                         const char* field_name)
+{
+   /* Block without an instance name: the variable is just the field. */
+   if (strcmp(name, field_name) == 0)
+      return true;
+
+   /* Instanced block: match "<interface_name>.<field_name>" piece by piece,
+    * which needs no temporary string.  Once strncmp() has matched prefix_len
+    * characters, name[prefix_len] is inside the string (at worst it is the
+    * terminating NUL, which simply fails the '.' test).
+    */
+   const size_t prefix_len = strlen(interface_name);
+   return strncmp(name, interface_name, prefix_len) == 0 &&
+          name[prefix_len] == '.' &&
+          strcmp(name + prefix_len + 1, field_name) == 0;
+}
+
+/* Sets uni->top_level_array_size for the buffer variable \p uni; the value
+ * stays -1 if the block member is not found or a name copy cannot be
+ * allocated.  From the GL_ARB_program_interface_query spec:
+ * "For the property TOP_LEVEL_ARRAY_SIZE, a single integer identifying the
+ *  number of active array elements of the top-level shader storage block
+ *  member containing to the active variable is written to <params>.  If the
+ *  top-level block member is not declared as an array, the value one is
+ *  written to <params>.  If the top-level block member is an array with no
+ *  declared size, the value zero is written to <params>."
+ */
+static void
+calculate_array_size(struct gl_shader_program *shProg,
+                     struct gl_uniform_storage *uni)
+{
+   int array_size = -1;
+   char *var_name = get_top_level_name(uni->name);
+   char *interface_name =
+      get_top_level_name(shProg->BufferInterfaceBlocks[uni->block_index].Name);
+
+   if (var_name == NULL || interface_name == NULL)
+      goto found_top_level_array_size;
+
+   if (strcmp(var_name, interface_name) == 0) {
+      /* Deal with instanced array of SSBOs */
+      char *temp_name = get_var_name(uni->name);
+      free(var_name);
+      var_name = temp_name != NULL ? get_top_level_name(temp_name) : NULL;
+      free(temp_name);
+      if (var_name == NULL)
+         goto found_top_level_array_size;
+   }
+
+   for (unsigned i = 0; i < shProg->NumShaders; i++) {
+      if (shProg->Shaders[i] == NULL)
+         continue;
+
+      const gl_shader *stage = shProg->Shaders[i];
+      foreach_in_list(ir_instruction, node, stage->ir) {
+         ir_variable *var = node->as_variable();
+         if (!var || !var->get_interface_type() ||
+             var->data.mode != ir_var_shader_storage)
+            continue;
+
+         const glsl_type *interface = var->get_interface_type();
+         if (strcmp(interface_name, interface->name) != 0)
+            continue;
+
+         for (unsigned f = 0; f < interface->length; f++) {
+            const glsl_struct_field *field = &interface->fields.structure[f];
+            if (strcmp(field->name, var_name) != 0)
+               continue;
+            if (is_top_level_shader_storage_block_member(uni->name,
+                                                         interface_name,
+                                                         var_name))
+               array_size = 1;
+            else if (field->type->is_unsized_array())
+               array_size = 0;
+            else
+               array_size = field->type->is_array() ? field->type->length : 1;
+            goto found_top_level_array_size;
+         }
+      }
+   }
+found_top_level_array_size:
+   free(interface_name);
+   free(var_name);
+   uni->top_level_array_size = array_size;
+}
+
+/* Sets uni->top_level_array_stride for the buffer variable \p uni; the value
+ * stays -1 if the block member is not found or a name copy cannot be
+ * allocated.  From the GL_ARB_program_interface_query spec:
+ *
+ * "For the property TOP_LEVEL_ARRAY_STRIDE, a single integer identifying
+ *  the stride between array elements of the top-level shader storage block
+ *  member containing the active variable is written to <params>.  For
+ *  top-level block members declared as arrays, the value written is the
+ *  difference, in basic machine units, between the offsets of the active
+ *  variable for consecutive elements in the top-level array.  For
+ *  top-level block members not declared as an array, zero is written to
+ *  <params>."
+ */
+static void
+calculate_array_stride(struct gl_shader_program *shProg,
+                       struct gl_uniform_storage *uni)
+{
+   int array_stride = -1;
+   char *var_name = get_top_level_name(uni->name);
+   char *interface_name =
+      get_top_level_name(shProg->BufferInterfaceBlocks[uni->block_index].Name);
+
+   if (var_name == NULL || interface_name == NULL)
+      goto found_top_level_array_stride;
+
+   if (strcmp(var_name, interface_name) == 0) {
+      /* Deal with instanced array of SSBOs */
+      char *temp_name = get_var_name(uni->name);
+      free(var_name);
+      var_name = temp_name != NULL ? get_top_level_name(temp_name) : NULL;
+      free(temp_name);
+      if (var_name == NULL)
+         goto found_top_level_array_stride;
+   }
+
+   for (unsigned i = 0; i < shProg->NumShaders; i++) {
+      if (shProg->Shaders[i] == NULL)
+         continue;
+
+      const gl_shader *stage = shProg->Shaders[i];
+      foreach_in_list(ir_instruction, node, stage->ir) {
+         ir_variable *var = node->as_variable();
+         if (!var || !var->get_interface_type() ||
+             var->data.mode != ir_var_shader_storage)
+            continue;
+
+         const glsl_type *interface = var->get_interface_type();
+         if (strcmp(interface_name, interface->name) != 0)
+            continue;
+
+         for (unsigned f = 0; f < interface->length; f++) {
+            const glsl_struct_field *field = &interface->fields.structure[f];
+            if (strcmp(field->name, var_name) != 0)
+               continue;
+            /* Non-arrays and direct top-level members have stride 0. */
+            if (!field->type->is_array() ||
+                is_top_level_shader_storage_block_member(uni->name,
+                                                         interface_name,
+                                                         var_name)) {
+               array_stride = 0;
+               goto found_top_level_array_stride;
+            }
+
+            const bool row_major =
+               glsl_matrix_layout(field->matrix_layout) ==
+               GLSL_MATRIX_LAYOUT_ROW_MAJOR;
+            const glsl_type *array_type = field->type->fields.array;
+
+            if (interface->interface_packing == GLSL_INTERFACE_PACKING_STD430) {
+               array_stride = array_type->std430_array_stride(row_major);
+            } else if (array_type->is_record() || array_type->is_array()) {
+               array_stride = array_type->std140_size(row_major);
+               array_stride = glsl_align(array_stride, 16);
+            } else {
+               unsigned element_base_align =
+                  array_type->std140_base_alignment(row_major);
+               array_stride = MAX2(element_base_align, 16);
+            }
+            goto found_top_level_array_stride;
+         }
+      }
+   }
+found_top_level_array_stride:
+   free(interface_name);
+   free(var_name);
+   uni->top_level_array_stride = array_stride;
+}
+
 /**
  * Builds up a list of program resources that point to existing
  * resource data.
@@ -3473,6 +3711,11 @@ build_program_resource_list(struct gl_shader_program *shProg)
                                       shProg->UniformStorage[i].name))
          continue;
 
+      if (is_shader_storage) {
+         calculate_array_size(shProg, &shProg->UniformStorage[i]);
+         calculate_array_stride(shProg, &shProg->UniformStorage[i]);
+      }
+
       if (!add_program_resource(shProg, type,
                                 &shProg->UniformStorage[i], stageref))
          return;
@@ -3480,10 +3723,10 @@ build_program_resource_list(struct gl_shader_program *shProg)
 
    /* Add program uniform blocks and shader storage blocks. */
    for (unsigned i = 0; i < shProg->NumBufferInterfaceBlocks; i++) {
-      bool is_shader_storage = shProg->UniformBlocks[i].IsShaderStorage;
+      bool is_shader_storage = shProg->BufferInterfaceBlocks[i].IsShaderStorage;
       GLenum type = is_shader_storage ? GL_SHADER_STORAGE_BLOCK : GL_UNIFORM_BLOCK;
       if (!add_program_resource(shProg, type,
-          &shProg->UniformBlocks[i], 0))
+          &shProg->BufferInterfaceBlocks[i], 0))
          return;
    }
 
@@ -3599,6 +3842,42 @@ link_assign_subroutine_types(struct gl_shader_program *prog)
    }
 }
 
+/* Splits \p blocks into two ralloc'ed (under \p mem_ctx) arrays of pointers
+ * to its elements: blocks with IsShaderStorage set go to \p ssbos, the rest
+ * to \p ubos, each keeping the original relative order.  The element counts
+ * are written to \p num_ssbos and \p num_ubos.
+ */
+static void
+split_ubos_and_ssbos(void *mem_ctx,
+                     struct gl_uniform_block *blocks,
+                     unsigned num_blocks,
+                     struct gl_uniform_block ***ubos,
+                     unsigned *num_ubos,
+                     struct gl_uniform_block ***ssbos,
+                     unsigned *num_ssbos)
+{
+   unsigned ssbo_count = 0;
+
+   for (unsigned b = 0; b < num_blocks; b++)
+      ssbo_count += blocks[b].IsShaderStorage ? 1 : 0;
+
+   /* Every block is one kind or the other, so the UBO count is the rest. */
+   *ubos = ralloc_array(mem_ctx, gl_uniform_block *, num_blocks - ssbo_count);
+   *ssbos = ralloc_array(mem_ctx, gl_uniform_block *, ssbo_count);
+   *num_ubos = 0;
+   *num_ssbos = 0;
+
+   for (unsigned b = 0; b < num_blocks; b++) {
+      struct gl_uniform_block *block = &blocks[b];
+      if (block->IsShaderStorage)
+         (*ssbos)[(*num_ssbos)++] = block;
+      else
+         (*ubos)[(*num_ubos)++] = block;
+   }
+
+   assert(*num_ubos + *num_ssbos == num_blocks);
+}
+
 void
 link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
 {
@@ -4110,6 +4389,31 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       }
    }
 
+   /* Split BufferInterfaceBlocks into UniformBlocks and ShaderStorageBlocks
+    * for gl_shader_program and gl_shader, so that drivers that need separate
+    * index spaces for each set can have that.
+    */
+   for (unsigned i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++) {
+      if (prog->_LinkedShaders[i] != NULL) {
+         gl_shader *sh = prog->_LinkedShaders[i];
+         split_ubos_and_ssbos(sh,
+                              sh->BufferInterfaceBlocks,
+                              sh->NumBufferInterfaceBlocks,
+                              &sh->UniformBlocks,
+                              &sh->NumUniformBlocks,
+                              &sh->ShaderStorageBlocks,
+                              &sh->NumShaderStorageBlocks);
+      }
+   }
+
+   split_ubos_and_ssbos(prog,
+                        prog->BufferInterfaceBlocks,
+                        prog->NumBufferInterfaceBlocks,
+                        &prog->UniformBlocks,
+                        &prog->NumUniformBlocks,
+                        &prog->ShaderStorageBlocks,
+                        &prog->NumShaderStorageBlocks);
+
    /* FINISHME: Assign fragment shader output locations. */
 
 done:
diff --git a/src/glsl/lower_named_interface_blocks.cpp b/src/glsl/lower_named_interface_blocks.cpp
index 01bbdd0587e..276a2dedf47 100644
--- a/src/glsl/lower_named_interface_blocks.cpp
+++ b/src/glsl/lower_named_interface_blocks.cpp
@@ -65,6 +65,39 @@
 #include "ir_rvalue_visitor.h"
 #include "program/hash_table.h"
 
+/* Rebuilds the (possibly nested) array type \p type with its innermost
+ * element replaced by the type of that element's field number \p idx.
+ */
+static const glsl_type *
+process_array_type(const glsl_type *type, unsigned idx)
+{
+   const glsl_type *const inner = type->fields.array;
+   const glsl_type *const replaced = inner->is_array()
+      ? process_array_type(inner, idx) : inner->fields.structure[idx].type;
+
+   return glsl_type::get_array_instance(replaced, type->length);
+}
+
+/* Re-creates the chain of array dereferences ending at \p deref_array_prev
+ * on top of \p deref_var, reusing each level's array_index; the new nodes
+ * are allocated from \p mem_ctx.
+ */
+static ir_rvalue *
+process_array_ir(void * const mem_ctx,
+                 ir_dereference_array *deref_array_prev,
+                 ir_rvalue *deref_var)
+{
+   ir_dereference_array *const inner =
+      deref_array_prev->array->as_dereference_array();
+
+   /* Innermost level wraps deref_var; outer levels wrap the rebuilt inner. */
+   ir_rvalue *const base =
+      inner != NULL ? process_array_ir(mem_ctx, inner, deref_var) : deref_var;
+
+   return new(mem_ctx) ir_dereference_array(base,
+                                            deref_array_prev->array_index);
+}
+
 namespace {
 
 class flatten_named_interface_blocks_declarations : public ir_rvalue_visitor
@@ -112,15 +145,9 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
           var->data.mode == ir_var_shader_storage)
          continue;
 
-      const glsl_type * iface_t = var->type;
-      const glsl_type * array_t = NULL;
+      const glsl_type * iface_t = var->type->without_array();
       exec_node *insert_pos = var;
 
-      if (iface_t->is_array()) {
-         array_t = iface_t;
-         iface_t = array_t->fields.array;
-      }
-
       assert (iface_t->is_interface());
 
       for (unsigned i = 0; i < iface_t->length; i++) {
@@ -137,7 +164,7 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
             ir_variable *new_var;
             char *var_name =
                ralloc_strdup(mem_ctx, iface_t->fields.structure[i].name);
-            if (array_t == NULL) {
+            if (!var->type->is_array()) {
                new_var =
                   new(mem_ctx) ir_variable(iface_t->fields.structure[i].type,
                                            var_name,
@@ -145,9 +172,7 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
                new_var->data.from_named_ifc_block_nonarray = 1;
             } else {
                const glsl_type *new_array_type =
-                  glsl_type::get_array_instance(
-                     iface_t->fields.structure[i].type,
-                     array_t->length);
+                  process_array_type(var->type, i);
                new_var =
                   new(mem_ctx) ir_variable(new_array_type,
                                            var_name,
@@ -236,9 +261,8 @@ flatten_named_interface_blocks_declarations::handle_rvalue(ir_rvalue **rvalue)
       ir_dereference_array *deref_array =
          ir->record->as_dereference_array();
       if (deref_array != NULL) {
-         *rvalue =
-            new(mem_ctx) ir_dereference_array(deref_var,
-                                              deref_array->array_index);
+         *rvalue = process_array_ir(mem_ctx, deref_array,
+                                    (ir_rvalue *)deref_var);
       } else {
          *rvalue = deref_var;
       }
diff --git a/src/glsl/lower_ubo_reference.cpp b/src/glsl/lower_ubo_reference.cpp
index 247620e6148..e818c048461 100644
--- a/src/glsl/lower_ubo_reference.cpp
+++ b/src/glsl/lower_ubo_reference.cpp
@@ -203,55 +203,114 @@ static const char *
 interface_field_name(void *mem_ctx, char *base_name, ir_rvalue *d,
                      ir_rvalue **nonconst_block_index)
 {
-   ir_rvalue *previous_index = NULL;
    *nonconst_block_index = NULL;
+   char *name_copy = NULL;
+   size_t base_length = 0;
+
+   /* Loop back through the IR until we find the uniform block */
+   ir_rvalue *ir = d;
+   while (ir != NULL) {
+      switch (ir->ir_type) {
+      case ir_type_dereference_variable: {
+         /* Exit loop */
+         ir = NULL;
+         break;
+      }
+
+      case ir_type_dereference_record: {
+         ir_dereference_record *r = (ir_dereference_record *) ir;
+         ir = r->record->as_dereference();
+
+         /* If we got here it means any previous array subscripts belong to
+          * block members and not the block itself so skip over them in the
+          * next pass.
+          */
+         d = ir;
+         break;
+      }
+
+      case ir_type_dereference_array: {
+         ir_dereference_array *a = (ir_dereference_array *) ir;
+         ir = a->array->as_dereference();
+         break;
+      }
+
+      case ir_type_swizzle: {
+         ir_swizzle *s = (ir_swizzle *) ir;
+         ir = s->val->as_dereference();
+         break;
+      }
+
+      default:
+         assert(!"Should not get here.");
+         break;
+      }
+   }
 
    while (d != NULL) {
       switch (d->ir_type) {
       case ir_type_dereference_variable: {
          ir_dereference_variable *v = (ir_dereference_variable *) d;
-         if (previous_index
-             && v->var->is_interface_instance()
-             && v->var->type->is_array()) {
-
-            ir_constant *const_index = previous_index->as_constant();
-            if (!const_index) {
-               *nonconst_block_index = previous_index;
-               return ralloc_asprintf(mem_ctx, "%s[0]", base_name);
-            } else {
-               return ralloc_asprintf(mem_ctx,
-                                      "%s[%d]",
-                                      base_name,
-                                      const_index->get_uint_component(0));
-            }
+         if (name_copy != NULL &&
+             v->var->is_interface_instance() &&
+             v->var->type->is_array()) {
+            return name_copy;
          } else {
+            *nonconst_block_index = NULL;
             return base_name;
          }
 
          break;
       }
 
-      case ir_type_dereference_record: {
-         ir_dereference_record *r = (ir_dereference_record *) d;
-
-         d = r->record->as_dereference();
-         break;
-      }
-
       case ir_type_dereference_array: {
          ir_dereference_array *a = (ir_dereference_array *) d;
+         size_t new_length;
+
+         if (name_copy == NULL) {
+            name_copy = ralloc_strdup(mem_ctx, base_name);
+            base_length = strlen(name_copy);
+         }
+
+         /* For arrays of arrays we start at the innermost array and work our
+          * way out so we need to insert the subscript at the base of the
+          * name string rather than just attaching it to the end.
+          */
+         new_length = base_length;
+         ir_constant *const_index = a->array_index->as_constant();
+         char *end = ralloc_strdup(NULL, &name_copy[new_length]);
+         if (!const_index) {
+            ir_rvalue *array_index = a->array_index;
+            if (array_index->type != glsl_type::uint_type)
+               array_index = i2u(array_index);
+
+            if (a->array->type->is_array() &&
+                a->array->type->fields.array->is_array()) {
+               ir_constant *base_size = new(mem_ctx)
+                  ir_constant(a->array->type->fields.array->arrays_of_arrays_size());
+               array_index = mul(array_index, base_size);
+            }
+
+            if (*nonconst_block_index) {
+               *nonconst_block_index = add(*nonconst_block_index, array_index);
+            } else {
+               *nonconst_block_index = array_index;
+            }
+
+            ralloc_asprintf_rewrite_tail(&name_copy, &new_length, "[0]%s",
+                                         end);
+         } else {
+            ralloc_asprintf_rewrite_tail(&name_copy, &new_length, "[%d]%s",
+                                         const_index->get_uint_component(0),
+                                         end);
+         }
+         ralloc_free(end);
 
          d = a->array->as_dereference();
-         previous_index = a->array_index;
 
          break;
       }
-      case ir_type_swizzle: {
-         ir_swizzle *s = (ir_swizzle *) d;
 
-         d = s->val->as_dereference();
-         break;
-      }
       default:
          assert(!"Should not get here.");
          break;
@@ -277,27 +336,31 @@ lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var,
       interface_field_name(mem_ctx, (char *) var->get_interface_type()->name,
                            deref, &nonconst_block_index);
 
-   /* Locate the ubo block by interface name */
+   /* Locate the block by interface name */
+   this->is_shader_storage = var->is_in_shader_storage_block();
+   unsigned num_blocks;
+   struct gl_uniform_block **blocks;
+   if (this->is_shader_storage) {
+      num_blocks = shader->NumShaderStorageBlocks;
+      blocks = shader->ShaderStorageBlocks;
+   } else {
+      num_blocks = shader->NumUniformBlocks;
+      blocks = shader->UniformBlocks;
+   }
    this->uniform_block = NULL;
-   for (unsigned i = 0; i < shader->NumUniformBlocks; i++) {
-      if (strcmp(field_name, shader->UniformBlocks[i].Name) == 0) {
+   for (unsigned i = 0; i < num_blocks; i++) {
+      if (strcmp(field_name, blocks[i]->Name) == 0) {
 
          ir_constant *index = new(mem_ctx) ir_constant(i);
 
          if (nonconst_block_index) {
-            if (nonconst_block_index->type != glsl_type::uint_type)
-               nonconst_block_index = i2u(nonconst_block_index);
             this->uniform_block = add(nonconst_block_index, index);
          } else {
             this->uniform_block = index;
          }
 
-         this->is_shader_storage = shader->UniformBlocks[i].IsShaderStorage;
-
-         struct gl_uniform_block *block = &shader->UniformBlocks[i];
-
          this->ubo_var = var->is_interface_instance()
-            ? &block->Uniforms[0] : &block->Uniforms[var->data.location];
+            ? &blocks[i]->Uniforms[0] : &blocks[i]->Uniforms[var->data.location];
 
          break;
       }
@@ -335,7 +398,7 @@ lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var,
             if (deref_array->array->type->is_double())
                array_stride *= 2;
             *matrix_columns = deref_array->array->type->matrix_columns;
-         } else if (deref_array->type->is_interface()) {
+         } else if (deref_array->type->without_array()->is_interface()) {
             /* We're processing an array dereference of an interface instance
              * array. The thing being dereferenced *must* be a variable
              * dereference because interfaces cannot be embedded in other
@@ -344,7 +407,6 @@ lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var,
              * interface instance array will have the same offsets relative to
              * the base of the block that backs them.
              */
-            assert(deref_array->array->as_dereference_variable());
             deref = deref_array->array->as_dereference();
             break;
          } else {
@@ -744,7 +806,31 @@ lower_ubo_reference_visitor::emit_access(bool is_write,
        * or 32 depending on the number of columns.
        */
       assert(matrix_columns <= 4);
-      unsigned matrix_stride = glsl_align(matrix_columns * N, 16);
+      unsigned matrix_stride = 0;
+      /* The matrix stride of std430 mat2xY matrices is not rounded up to
+       * vec4 size. From OpenGL 4.3 spec, section 7.6.2.2 "Standard Uniform
+       * Block Layout":
+       *
+       * "2. If the member is a two- or four-component vector with components
+       * consuming N basic machine units, the base alignment is 2N or 4N,
+       * respectively." [...]
+       * "4. If the member is an array of scalars or vectors, the base alignment
+       * and array stride are set to match the base alignment of a single array
+       * element, according to rules (1), (2), and (3), and rounded up to the
+       * base alignment of a vec4." [...]
+       * "7. If the member is a row-major matrix with C columns and R rows, the
+       * matrix is stored identically to an array of R row vectors with C
+       * components each, according to rule (4)." [...]
+       * "When using the std430 storage layout, shader storage blocks will be
+       * laid out in buffer storage identically to uniform and shader storage
+       * blocks using the std140 layout, except that the base alignment and
+       * stride of arrays of scalars and vectors in rule 4 and of structures in
+       * rule 9 are not rounded up a multiple of the base alignment of a vec4."
+       */
+      if (packing == GLSL_INTERFACE_PACKING_STD430 && matrix_columns == 2)
+         matrix_stride = 2 * N;
+      else
+         matrix_stride = glsl_align(matrix_columns * N, 16);
 
       const glsl_type *deref_type = deref->type->base_type == GLSL_TYPE_FLOAT ?
          glsl_type::float_type : glsl_type::double_type;
diff --git a/src/glsl/lower_vec_index_to_cond_assign.cpp b/src/glsl/lower_vec_index_to_cond_assign.cpp
index 0c3394a504b..b6238825f8a 100644
--- a/src/glsl/lower_vec_index_to_cond_assign.cpp
+++ b/src/glsl/lower_vec_index_to_cond_assign.cpp
@@ -88,7 +88,9 @@ ir_vec_index_to_cond_assign_visitor::convert_vec_index_to_cond_assign(void *mem_
    exec_list list;
 
    /* Store the index to a temporary to avoid reusing its tree. */
-   index = new(base_ir) ir_variable(glsl_type::int_type,
+   assert(orig_index->type == glsl_type::int_type ||
+          orig_index->type == glsl_type::uint_type);
+   index = new(base_ir) ir_variable(orig_index->type,
 				    "vec_index_tmp_i",
 				    ir_var_temporary);
    list.push_tail(index);
diff --git a/src/glsl/lower_vector_insert.cpp b/src/glsl/lower_vector_insert.cpp
index 6d7cfa94262..26d31b03c12 100644
--- a/src/glsl/lower_vector_insert.cpp
+++ b/src/glsl/lower_vector_insert.cpp
@@ -108,9 +108,13 @@ vector_insert_visitor::handle_rvalue(ir_rvalue **rv)
       factory.emit(assign(temp, expr->operands[0]));
       factory.emit(assign(src_temp, expr->operands[1]));
 
+      assert(expr->operands[2]->type == glsl_type::int_type ||
+             expr->operands[2]->type == glsl_type::uint_type);
+
       for (unsigned i = 0; i < expr->type->vector_elements; i++) {
          ir_constant *const cmp_index =
-            new(factory.mem_ctx) ir_constant(int(i));
+            ir_constant::zero(factory.mem_ctx, expr->operands[2]->type);
+         cmp_index->value.u[0] = i;
 
          ir_variable *const cmp_result =
             factory.make_temp(glsl_type::bool_type, "index_condition");
diff --git a/src/glsl/builtin_type_macros.h b/src/glsl/nir/builtin_type_macros.h
index 8e16ae45489..8e16ae45489 100644
--- a/src/glsl/builtin_type_macros.h
+++ b/src/glsl/nir/builtin_type_macros.h
diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp
index 6bedb4eb8e6..e57e834d948 100644
--- a/src/glsl/nir/glsl_to_nir.cpp
+++ b/src/glsl/nir/glsl_to_nir.cpp
@@ -152,11 +152,13 @@ glsl_to_nir(const struct gl_shader_program *shader_prog,
       if (sh->Program->SamplersUsed & (1 << i))
          num_textures = i;
 
-   shader->info.name = ralloc_asprintf(shader, "GLSL%d", sh->Name);
+   shader->info.name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name);
+   if (shader_prog->Label)
+      shader->info.label = ralloc_strdup(shader, shader_prog->Label);
    shader->info.num_textures = num_textures;
    shader->info.num_ubos = sh->NumUniformBlocks;
    shader->info.num_abos = shader_prog->NumAtomicBuffers;
-   shader->info.num_ssbos = shader_prog->NumBufferInterfaceBlocks;
+   shader->info.num_ssbos = sh->NumShaderStorageBlocks;
    shader->info.num_images = sh->NumImages;
    shader->info.inputs_read = sh->Program->InputsRead;
    shader->info.outputs_written = sh->Program->OutputsWritten;
@@ -164,11 +166,37 @@ glsl_to_nir(const struct gl_shader_program *shader_prog,
    shader->info.uses_texture_gather = sh->Program->UsesGather;
    shader->info.uses_clip_distance_out = sh->Program->UsesClipDistanceOut;
    shader->info.separate_shader = shader_prog->SeparateShader;
-   shader->info.gs.vertices_out = sh->Geom.VerticesOut;
-   shader->info.gs.invocations = sh->Geom.Invocations;
    shader->info.has_transform_feedback_varyings =
       shader_prog->TransformFeedback.NumVarying > 0;
 
+   switch (stage) {
+   case MESA_SHADER_GEOMETRY:
+      shader->info.gs.vertices_out = sh->Geom.VerticesOut;
+      shader->info.gs.invocations = sh->Geom.Invocations;
+      break;
+
+   case MESA_SHADER_FRAGMENT: {
+      struct gl_fragment_program *fp =
+         (struct gl_fragment_program *)sh->Program;
+
+      shader->info.fs.uses_discard = fp->UsesKill;
+      shader->info.fs.early_fragment_tests = sh->EarlyFragmentTests;
+      shader->info.fs.depth_layout = fp->FragDepthLayout;
+      break;
+   }
+
+   case MESA_SHADER_COMPUTE: {
+      struct gl_compute_program *cp = (struct gl_compute_program *)sh->Program;
+      shader->info.cs.local_size[0] = cp->LocalSize[0];
+      shader->info.cs.local_size[1] = cp->LocalSize[1];
+      shader->info.cs.local_size[2] = cp->LocalSize[2];
+      break;
+   }
+
+   default:
+      break; /* No stage-specific info */
+   }
+
    return shader;
 }
 
@@ -393,35 +421,10 @@ nir_visitor::visit(ir_variable *ir)
 
    var->interface_type = ir->get_interface_type();
 
-   switch (var->data.mode) {
-   case nir_var_local:
-      exec_list_push_tail(&impl->locals, &var->node);
-      break;
-
-   case nir_var_global:
-      exec_list_push_tail(&shader->globals, &var->node);
-      break;
-
-   case nir_var_shader_in:
-      exec_list_push_tail(&shader->inputs, &var->node);
-      break;
-
-   case nir_var_shader_out:
-      exec_list_push_tail(&shader->outputs, &var->node);
-      break;
-
-   case nir_var_uniform:
-   case nir_var_shader_storage:
-      exec_list_push_tail(&shader->uniforms, &var->node);
-      break;
-
-   case nir_var_system_value:
-      exec_list_push_tail(&shader->system_values, &var->node);
-      break;
-
-   default:
-      unreachable("not reached");
-   }
+   if (var->data.mode == nir_var_local)
+      nir_function_impl_add_variable(impl, var);
+   else
+      nir_shader_add_variable(shader, var);
 
    _mesa_hash_table_insert(var_table, ir, var);
    this->var = var;
@@ -695,9 +698,21 @@ nir_visitor::visit(ir_call *ir)
       } else if (strcmp(ir->callee_name(), "__intrinsic_ssbo_atomic_xor_internal") == 0) {
          op = nir_intrinsic_ssbo_atomic_xor;
       } else if (strcmp(ir->callee_name(), "__intrinsic_ssbo_atomic_min_internal") == 0) {
-         op = nir_intrinsic_ssbo_atomic_min;
+         assert(ir->return_deref);
+         if (ir->return_deref->type == glsl_type::int_type)
+            op = nir_intrinsic_ssbo_atomic_imin;
+         else if (ir->return_deref->type == glsl_type::uint_type)
+            op = nir_intrinsic_ssbo_atomic_umin;
+         else
+            unreachable("Invalid type");
       } else if (strcmp(ir->callee_name(), "__intrinsic_ssbo_atomic_max_internal") == 0) {
-         op = nir_intrinsic_ssbo_atomic_max;
+         assert(ir->return_deref);
+         if (ir->return_deref->type == glsl_type::int_type)
+            op = nir_intrinsic_ssbo_atomic_imax;
+         else if (ir->return_deref->type == glsl_type::uint_type)
+            op = nir_intrinsic_ssbo_atomic_umax;
+         else
+            unreachable("Invalid type");
       } else if (strcmp(ir->callee_name(), "__intrinsic_ssbo_atomic_exchange_internal") == 0) {
          op = nir_intrinsic_ssbo_atomic_exchange;
       } else if (strcmp(ir->callee_name(), "__intrinsic_ssbo_atomic_comp_swap_internal") == 0) {
@@ -906,8 +921,10 @@ nir_visitor::visit(ir_call *ir)
          break;
       }
       case nir_intrinsic_ssbo_atomic_add:
-      case nir_intrinsic_ssbo_atomic_min:
-      case nir_intrinsic_ssbo_atomic_max:
+      case nir_intrinsic_ssbo_atomic_imin:
+      case nir_intrinsic_ssbo_atomic_umin:
+      case nir_intrinsic_ssbo_atomic_imax:
+      case nir_intrinsic_ssbo_atomic_umax:
       case nir_intrinsic_ssbo_atomic_and:
       case nir_intrinsic_ssbo_atomic_or:
       case nir_intrinsic_ssbo_atomic_xor:
@@ -2065,13 +2082,10 @@ nir_visitor::visit(ir_constant *ir)
     * constant initializer and return a dereference.
     */
 
-   nir_variable *var = ralloc(this->shader, nir_variable);
-   var->name = ralloc_strdup(var, "const_temp");
-   var->type = ir->type;
-   var->data.mode = nir_var_local;
+   nir_variable *var =
+      nir_local_variable_create(this->impl, ir->type, "const_temp");
    var->data.read_only = true;
    var->constant_initializer = constant_copy(ir, var);
-   exec_list_push_tail(&this->impl->locals, &var->node);
 
    this->deref_head = nir_deref_var_create(this->shader, var);
    this->deref_tail = &this->deref_head->deref;
diff --git a/src/glsl/glsl_types.cpp b/src/glsl/nir/glsl_types.cpp
index 9ef2fbf2525..309f9dca61e 100644
--- a/src/glsl/glsl_types.cpp
+++ b/src/glsl/nir/glsl_types.cpp
@@ -1175,7 +1175,22 @@ glsl_type::record_location_offset(unsigned length) const
          const glsl_type *wa = st->without_array();
          if (wa->is_record()) {
             unsigned r_offset = wa->record_location_offset(wa->length);
-            offset += st->is_array() ? st->length * r_offset : r_offset;
+            offset += st->is_array() ?
+               st->arrays_of_arrays_size() * r_offset : r_offset;
+         } else if (st->is_array() && st->fields.array->is_array()) {
+            unsigned outer_array_size = st->length;
+            const glsl_type *base_type = st->fields.array;
+
+            /* For arrays of arrays the outer arrays take up a uniform
+             * slot for each element. The innermost array elements share a
+             * single slot so we ignore the innermost array when calculating
+             * the offset.
+             */
+            while (base_type->fields.array->is_array()) {
+               outer_array_size = outer_array_size * base_type->length;
+               base_type = base_type->fields.array;
+            }
+            offset += outer_array_size;
          } else {
             /* We dont worry about arrays here because unless the array
              * contains a structure or another array it only takes up a single
@@ -1419,8 +1434,8 @@ glsl_type::std140_size(bool row_major) const
       unsigned int array_len;
 
       if (this->is_array()) {
-         element_type = this->fields.array;
-         array_len = this->length;
+         element_type = this->without_array();
+         array_len = this->arrays_of_arrays_size();
       } else {
          element_type = this;
          array_len = 1;
@@ -1453,12 +1468,13 @@ glsl_type::std140_size(bool row_major) const
     *      the array are laid out in order, according to rule (9).
     */
    if (this->is_array()) {
-      if (this->fields.array->is_record()) {
-         return this->length * this->fields.array->std140_size(row_major);
+      if (this->without_array()->is_record()) {
+	 return this->arrays_of_arrays_size() *
+            this->without_array()->std140_size(row_major);
       } else {
-         unsigned element_base_align =
-            this->fields.array->std140_base_alignment(row_major);
-         return this->length * MAX2(element_base_align, 16);
+	 unsigned element_base_align =
+	    this->without_array()->std140_base_alignment(row_major);
+	 return this->arrays_of_arrays_size() * MAX2(element_base_align, 16);
       }
    }
 
@@ -1818,3 +1834,17 @@ glsl_type::coordinate_components() const
 
    return size;
 }
+
+/**
+ * Declarations of type flyweights (glsl_type::_foo_type) and
+ * convenience pointers (glsl_type::foo_type).
+ * @{
+ */
+#define DECL_TYPE(NAME, ...)                                    \
+   const glsl_type glsl_type::_##NAME##_type = glsl_type(__VA_ARGS__, #NAME); \
+   const glsl_type *const glsl_type::NAME##_type = &glsl_type::_##NAME##_type;
+
+#define STRUCT_TYPE(NAME)
+
+#include "builtin_type_macros.h"
+/** @} */
diff --git a/src/glsl/glsl_types.h b/src/glsl/nir/glsl_types.h
index b83e1ca3d2c..b83e1ca3d2c 100644
--- a/src/glsl/glsl_types.h
+++ b/src/glsl/nir/glsl_types.h
diff --git a/src/glsl/nir/nir.c b/src/glsl/nir/nir.c
index e12da805281..793bdafb54b 100644
--- a/src/glsl/nir/nir.c
+++ b/src/glsl/nir/nir.c
@@ -103,6 +103,72 @@ nir_reg_remove(nir_register *reg)
    exec_node_remove(&reg->node);
 }
 
+void
+nir_shader_add_variable(nir_shader *shader, nir_variable *var)
+{
+   switch (var->data.mode) {
+   case nir_var_local:
+      assert(!"nir_shader_add_variable cannot be used for local variables");
+      break;
+
+   case nir_var_global:
+      exec_list_push_tail(&shader->globals, &var->node);
+      break;
+
+   case nir_var_shader_in:
+      exec_list_push_tail(&shader->inputs, &var->node);
+      break;
+
+   case nir_var_shader_out:
+      exec_list_push_tail(&shader->outputs, &var->node);
+      break;
+
+   case nir_var_uniform:
+   case nir_var_shader_storage:
+      exec_list_push_tail(&shader->uniforms, &var->node);
+      break;
+
+   case nir_var_system_value:
+      exec_list_push_tail(&shader->system_values, &var->node);
+      break;
+   }
+}
+
+nir_variable *
+nir_variable_create(nir_shader *shader, nir_variable_mode mode,
+                    const struct glsl_type *type, const char *name)
+{
+   nir_variable *var = rzalloc(shader, nir_variable);
+   var->name = ralloc_strdup(var, name);
+   var->type = type;
+   var->data.mode = mode;
+
+   if ((mode == nir_var_shader_in && shader->stage != MESA_SHADER_VERTEX) ||
+       (mode == nir_var_shader_out && shader->stage != MESA_SHADER_FRAGMENT))
+      var->data.interpolation = INTERP_QUALIFIER_SMOOTH;
+
+   if (mode == nir_var_shader_in || mode == nir_var_uniform)
+      var->data.read_only = true;
+
+   nir_shader_add_variable(shader, var);
+
+   return var;
+}
+
+nir_variable *
+nir_local_variable_create(nir_function_impl *impl,
+                          const struct glsl_type *type, const char *name)
+{
+   nir_variable *var = rzalloc(impl->overload->function->shader, nir_variable);
+   var->name = ralloc_strdup(var, name);
+   var->type = type;
+   var->data.mode = nir_var_local;
+
+   nir_function_impl_add_variable(impl, var);
+
+   return var;
+}
+
 nir_function *
 nir_function_create(nir_shader *shader, const char *name)
 {
@@ -1080,31 +1146,33 @@ nir_src_as_const_value(nir_src src)
    return &load->value;
 }
 
+/**
+ * Returns true if the source is known to be dynamically uniform. Otherwise it
+ * returns false which means it may or may not be dynamically uniform but it
+ * can't be determined.
+ */
 bool
-nir_srcs_equal(nir_src src1, nir_src src2)
+nir_src_is_dynamically_uniform(nir_src src)
 {
-   if (src1.is_ssa) {
-      if (src2.is_ssa) {
-         return src1.ssa == src2.ssa;
-      } else {
-         return false;
-      }
-   } else {
-      if (src2.is_ssa) {
-         return false;
-      } else {
-         if ((src1.reg.indirect == NULL) != (src2.reg.indirect == NULL))
-            return false;
+   if (!src.is_ssa)
+      return false;
 
-         if (src1.reg.indirect) {
-            if (!nir_srcs_equal(*src1.reg.indirect, *src2.reg.indirect))
-               return false;
-         }
+   /* Constants are trivially dynamically uniform */
+   if (src.ssa->parent_instr->type == nir_instr_type_load_const)
+      return true;
 
-         return src1.reg.reg == src2.reg.reg &&
-                src1.reg.base_offset == src2.reg.base_offset;
-      }
+   /* As are uniform variables */
+   if (src.ssa->parent_instr->type == nir_instr_type_intrinsic) {
+      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(src.ssa->parent_instr);
+
+      if (intr->intrinsic == nir_intrinsic_load_uniform)
+         return true;
    }
+
+   /* XXX: this could have many more tests, such as when a sampler function is
+    * called with dynamically uniform arguments.
+    */
+   return false;
 }
 
 static void
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index f7b9483d74a..825c34805c4 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -35,7 +35,7 @@
 #include "util/set.h"
 #include "util/bitset.h"
 #include "nir_types.h"
-#include "glsl/shader_enums.h"
+#include "shader_enums.h"
 #include <stdio.h>
 
 #include "nir_opcodes.h"
@@ -738,7 +738,7 @@ nir_alu_instr_channel_used(nir_alu_instr *instr, unsigned src, unsigned channel)
  * used for a source
  */
 static inline unsigned
-nir_ssa_alu_instr_src_components(nir_alu_instr *instr, unsigned src)
+nir_ssa_alu_instr_src_components(const nir_alu_instr *instr, unsigned src)
 {
    assert(instr->dest.dest.is_ssa);
 
@@ -1486,6 +1486,9 @@ typedef struct nir_shader_compiler_options {
 typedef struct nir_shader_info {
    const char *name;
 
+   /* Descriptive name provided by the client; may be NULL */
+   const char *label;
+
    /* Number of textures used by this shader */
    unsigned num_textures;
    /* Number of uniform buffers used by this shader */
@@ -1516,13 +1519,32 @@ typedef struct nir_shader_info {
    /** Was this shader linked with any transform feedback varyings? */
    bool has_transform_feedback_varyings;
 
-   struct {
-      /** The maximum number of vertices the geometry shader might write. */
-      unsigned vertices_out;
+   union {
+      struct {
+         /** The maximum number of vertices the geometry shader might write. */
+         unsigned vertices_out;
+
+         /** 1 .. MAX_GEOMETRY_SHADER_INVOCATIONS */
+         unsigned invocations;
+      } gs;
+
+      struct {
+         bool uses_discard;
+
+         /**
+          * Whether early fragment tests are enabled as defined by
+          * ARB_shader_image_load_store.
+          */
+         bool early_fragment_tests;
+
+         /** gl_FragDepth layout for ARB_conservative_depth. */
+         enum gl_frag_depth_layout depth_layout;
+      } fs;
 
-      /** 1 .. MAX_GEOMETRY_SHADER_INVOCATIONS */
-      unsigned invocations;
-   } gs;
+      struct {
+         unsigned local_size[3];
+      } cs;
+   };
 } nir_shader_info;
 
 typedef struct nir_shader {
@@ -1585,6 +1607,26 @@ nir_register *nir_local_reg_create(nir_function_impl *impl);
 
 void nir_reg_remove(nir_register *reg);
 
+/** Adds a variable to the appropriate list in nir_shader */
+void nir_shader_add_variable(nir_shader *shader, nir_variable *var);
+
+static inline void
+nir_function_impl_add_variable(nir_function_impl *impl, nir_variable *var)
+{
+   assert(var->data.mode == nir_var_local);
+   exec_list_push_tail(&impl->locals, &var->node);
+}
+
+/** creates a variable, sets a few defaults, and adds it to the list */
+nir_variable *nir_variable_create(nir_shader *shader,
+                                  nir_variable_mode mode,
+                                  const struct glsl_type *type,
+                                  const char *name);
+/** creates a local variable and adds it to the list */
+nir_variable *nir_local_variable_create(nir_function_impl *impl,
+                                        const struct glsl_type *type,
+                                        const char *name);
+
 /** creates a function and adds it to the shader's list of functions */
 nir_function *nir_function_create(nir_shader *shader, const char *name);
 
@@ -1821,6 +1863,7 @@ bool nir_foreach_dest(nir_instr *instr, nir_foreach_dest_cb cb, void *state);
 bool nir_foreach_src(nir_instr *instr, nir_foreach_src_cb cb, void *state);
 
 nir_const_value *nir_src_as_const_value(nir_src src);
+bool nir_src_is_dynamically_uniform(nir_src src);
 bool nir_srcs_equal(nir_src src1, nir_src src2);
 void nir_instr_rewrite_src(nir_instr *instr, nir_src *src, nir_src new_src);
 void nir_instr_move_src(nir_instr *dest_instr, nir_src *dest, nir_src *src);
diff --git a/src/glsl/nir/nir_constant_expressions.py b/src/glsl/nir/nir_constant_expressions.py
index 8fd9b1039a7..2ba8554645d 100644
--- a/src/glsl/nir/nir_constant_expressions.py
+++ b/src/glsl/nir/nir_constant_expressions.py
@@ -29,6 +29,7 @@ template = """\
 #include <math.h>
 #include "main/core.h"
 #include "util/rounding.h" /* for _mesa_roundeven */
+#include "util/half_float.h"
 #include "nir_constant_expressions.h"
 
 #if defined(__SUNPRO_CC)
diff --git a/src/glsl/nir/nir_instr_set.c b/src/glsl/nir/nir_instr_set.c
new file mode 100644
index 00000000000..d3f939fe805
--- /dev/null
+++ b/src/glsl/nir/nir_instr_set.c
@@ -0,0 +1,519 @@
+/*
+ * Copyright © 2014 Connor Abbott
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir_instr_set.h"
+#include "nir_vla.h"
+
+#define HASH(hash, data) _mesa_fnv32_1a_accumulate((hash), (data))
+
+static uint32_t
+hash_src(uint32_t hash, const nir_src *src)
+{
+   assert(src->is_ssa);
+   hash = HASH(hash, src->ssa);
+   return hash;
+}
+
+static uint32_t
+hash_alu_src(uint32_t hash, const nir_alu_src *src, unsigned num_components)
+{
+   hash = HASH(hash, src->abs);
+   hash = HASH(hash, src->negate);
+
+   for (unsigned i = 0; i < num_components; i++)
+      hash = HASH(hash, src->swizzle[i]);
+
+   hash = hash_src(hash, &src->src);
+   return hash;
+}
+
+static uint32_t
+hash_alu(uint32_t hash, const nir_alu_instr *instr)
+{
+   hash = HASH(hash, instr->op);
+   hash = HASH(hash, instr->dest.dest.ssa.num_components);
+
+   if (nir_op_infos[instr->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) {
+      assert(nir_op_infos[instr->op].num_inputs == 2);
+      uint32_t hash0 = hash_alu_src(hash, &instr->src[0],
+                                    nir_ssa_alu_instr_src_components(instr, 0));
+      uint32_t hash1 = hash_alu_src(hash, &instr->src[1],
+                                    nir_ssa_alu_instr_src_components(instr, 1));
+      /* For commutative operations, we need some commutative way of
+       * combining the hashes.  One option would be to XOR them but that
+       * means that anything with two identical sources will hash to 0 and
+       * that's common enough we probably don't want the guaranteed
+       * collision.  Either addition or multiplication will also work.
+       */
+      hash = hash0 * hash1;
+   } else {
+      for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+         hash = hash_alu_src(hash, &instr->src[i],
+                             nir_ssa_alu_instr_src_components(instr, i));
+      }
+   }
+
+   return hash;
+}
+
+static uint32_t
+hash_load_const(uint32_t hash, const nir_load_const_instr *instr)
+{
+   hash = HASH(hash, instr->def.num_components);
+
+   hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f,
+                                          instr->def.num_components
+                                             * sizeof(instr->value.f[0]));
+
+   return hash;
+}
+
+static int
+cmp_phi_src(const void *data1, const void *data2)
+{
+   uintptr_t a = (uintptr_t)(*(nir_phi_src *const *)data1)->pred;
+   uintptr_t b = (uintptr_t)(*(nir_phi_src *const *)data2)->pred;
+   return (a > b) - (a < b); /* order by address; ptr diff may truncate */
+}
+
+static uint32_t
+hash_phi(uint32_t hash, const nir_phi_instr *instr)
+{
+   hash = HASH(hash, instr->instr.block);
+
+   /* sort sources by predecessor, since the order shouldn't matter */
+   unsigned num_preds = instr->instr.block->predecessors->entries;
+   NIR_VLA(nir_phi_src *, srcs, num_preds);
+   unsigned i = 0;
+   nir_foreach_phi_src(instr, src) {
+      srcs[i++] = src;
+   }
+
+   qsort(srcs, num_preds, sizeof(nir_phi_src *), cmp_phi_src);
+
+   for (i = 0; i < num_preds; i++) {
+      hash = hash_src(hash, &srcs[i]->src);
+      hash = HASH(hash, srcs[i]->pred);
+   }
+
+   return hash;
+}
+
+static uint32_t
+hash_intrinsic(uint32_t hash, const nir_intrinsic_instr *instr)
+{
+   const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
+   hash = HASH(hash, instr->intrinsic);
+   if (info->has_dest)
+      hash = HASH(hash, instr->dest.ssa.num_components);
+
+   /* Sources are compared by nir_instrs_equal(), so mix them in as well. */
+   for (unsigned i = 0; i < info->num_srcs; i++)
+      hash = hash_src(hash, &instr->src[i]);
+
+   assert(info->num_variables == 0);
+   return _mesa_fnv32_1a_accumulate_block(hash, instr->const_index,
+      info->num_indices * sizeof(instr->const_index[0]));
+}
+
+static uint32_t
+hash_tex(uint32_t hash, const nir_tex_instr *instr)
+{
+   hash = HASH(hash, instr->op);
+   hash = HASH(hash, instr->num_srcs);
+
+   for (unsigned i = 0; i < instr->num_srcs; i++) {
+      hash = HASH(hash, instr->src[i].src_type);
+      hash = hash_src(hash, &instr->src[i].src);
+   }
+
+   hash = HASH(hash, instr->coord_components);
+   hash = HASH(hash, instr->sampler_dim);
+   hash = HASH(hash, instr->is_array);
+   hash = HASH(hash, instr->is_shadow);
+   hash = HASH(hash, instr->is_new_style_shadow);
+   hash = HASH(hash, instr->const_offset);
+   unsigned component = instr->component;
+   hash = HASH(hash, component);
+   hash = HASH(hash, instr->sampler_index);
+   hash = HASH(hash, instr->sampler_array_size);
+
+   assert(!instr->sampler);
+
+   return hash;
+}
+
+/* Computes a hash of an instruction for use in a hash table. Note that this
+ * will only work for instructions where instr_can_rewrite() returns true, and
+ * it should return identical hashes for two instructions that are the same
+ * according to nir_instrs_equal().
+ */
+
+static uint32_t
+hash_instr(const void *data)
+{
+   const nir_instr *instr = data;
+   uint32_t hash = _mesa_fnv32_1a_offset_bias;
+
+   switch (instr->type) {
+   case nir_instr_type_alu:
+      hash = hash_alu(hash, nir_instr_as_alu(instr));
+      break;
+   case nir_instr_type_load_const:
+      hash = hash_load_const(hash, nir_instr_as_load_const(instr));
+      break;
+   case nir_instr_type_phi:
+      hash = hash_phi(hash, nir_instr_as_phi(instr));
+      break;
+   case nir_instr_type_intrinsic:
+      hash = hash_intrinsic(hash, nir_instr_as_intrinsic(instr));
+      break;
+   case nir_instr_type_tex:
+      hash = hash_tex(hash, nir_instr_as_tex(instr));
+      break;
+   default:
+      unreachable("Invalid instruction type");
+   }
+
+   return hash;
+}
+
+bool
+nir_srcs_equal(nir_src src1, nir_src src2)
+{
+   if (src1.is_ssa) {
+      if (src2.is_ssa) {
+         return src1.ssa == src2.ssa;
+      } else {
+         return false;
+      }
+   } else {
+      if (src2.is_ssa) {
+         return false;
+      } else {
+         if ((src1.reg.indirect == NULL) != (src2.reg.indirect == NULL))
+            return false;
+
+         if (src1.reg.indirect) {
+            if (!nir_srcs_equal(*src1.reg.indirect, *src2.reg.indirect))
+               return false;
+         }
+
+         return src1.reg.reg == src2.reg.reg &&
+                src1.reg.base_offset == src2.reg.base_offset;
+      }
+   }
+}
+
+static bool
+nir_alu_srcs_equal(const nir_alu_instr *alu1, const nir_alu_instr *alu2,
+                   unsigned src1, unsigned src2)
+{
+   if (alu1->src[src1].abs != alu2->src[src2].abs ||
+       alu1->src[src1].negate != alu2->src[src2].negate)
+      return false;
+
+   for (unsigned i = 0; i < nir_ssa_alu_instr_src_components(alu1, src1); i++) {
+      if (alu1->src[src1].swizzle[i] != alu2->src[src2].swizzle[i])
+         return false;
+   }
+
+   return nir_srcs_equal(alu1->src[src1].src, alu2->src[src2].src);
+}
+
+/* Returns "true" if two instructions are equal. Note that this will only
+ * work for the subset of instructions defined by instr_can_rewrite(). Also,
+ * it should only return "true" for instructions that hash_instr() will return
+ * the same hash for (ignoring collisions, of course).
+ */
+
+static bool
+nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2)
+{
+   if (instr1->type != instr2->type)
+      return false;
+
+   switch (instr1->type) {
+   case nir_instr_type_alu: {
+      nir_alu_instr *alu1 = nir_instr_as_alu(instr1);
+      nir_alu_instr *alu2 = nir_instr_as_alu(instr2);
+
+      if (alu1->op != alu2->op)
+         return false;
+
+      /* TODO: We can probably actually do something more intelligent such
+       * as allowing different numbers and taking a maximum or something
+       * here */
+      if (alu1->dest.dest.ssa.num_components != alu2->dest.dest.ssa.num_components)
+         return false;
+
+      if (nir_op_infos[alu1->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) {
+         assert(nir_op_infos[alu1->op].num_inputs == 2);
+         return (nir_alu_srcs_equal(alu1, alu2, 0, 0) &&
+                 nir_alu_srcs_equal(alu1, alu2, 1, 1)) ||
+                (nir_alu_srcs_equal(alu1, alu2, 0, 1) &&
+                 nir_alu_srcs_equal(alu1, alu2, 1, 0));
+      } else {
+         for (unsigned i = 0; i < nir_op_infos[alu1->op].num_inputs; i++) {
+            if (!nir_alu_srcs_equal(alu1, alu2, i, i))
+               return false;
+         }
+      }
+      return true;
+   }
+   case nir_instr_type_tex: {
+      nir_tex_instr *tex1 = nir_instr_as_tex(instr1);
+      nir_tex_instr *tex2 = nir_instr_as_tex(instr2);
+
+      if (tex1->op != tex2->op)
+         return false;
+
+      if (tex1->num_srcs != tex2->num_srcs)
+         return false;
+      for (unsigned i = 0; i < tex1->num_srcs; i++) {
+         if (tex1->src[i].src_type != tex2->src[i].src_type ||
+             !nir_srcs_equal(tex1->src[i].src, tex2->src[i].src)) {
+            return false;
+         }
+      }
+
+      if (tex1->coord_components != tex2->coord_components ||
+          tex1->sampler_dim != tex2->sampler_dim ||
+          tex1->is_array != tex2->is_array ||
+          tex1->is_shadow != tex2->is_shadow ||
+          tex1->is_new_style_shadow != tex2->is_new_style_shadow ||
+          memcmp(tex1->const_offset, tex2->const_offset,
+                 sizeof(tex1->const_offset)) != 0 ||
+          tex1->component != tex2->component ||
+         tex1->sampler_index != tex2->sampler_index ||
+         tex1->sampler_array_size != tex2->sampler_array_size) {
+         return false;
+      }
+
+      /* Don't support un-lowered sampler derefs currently. */
+      assert(!tex1->sampler && !tex2->sampler);
+
+      return true;
+   }
+   case nir_instr_type_load_const: {
+      nir_load_const_instr *load1 = nir_instr_as_load_const(instr1);
+      nir_load_const_instr *load2 = nir_instr_as_load_const(instr2);
+
+      if (load1->def.num_components != load2->def.num_components)
+         return false;
+
+      return memcmp(load1->value.f, load2->value.f,
+                    load1->def.num_components * sizeof(*load2->value.f)) == 0;
+   }
+   case nir_instr_type_phi: {
+      nir_phi_instr *phi1 = nir_instr_as_phi(instr1);
+      nir_phi_instr *phi2 = nir_instr_as_phi(instr2);
+
+      if (phi1->instr.block != phi2->instr.block)
+         return false;
+
+      nir_foreach_phi_src(phi1, src1) {
+         nir_foreach_phi_src(phi2, src2) {
+            if (src1->pred == src2->pred) {
+               if (!nir_srcs_equal(src1->src, src2->src))
+                  return false;
+
+               break;
+            }
+         }
+      }
+
+      return true;
+   }
+   case nir_instr_type_intrinsic: {
+      nir_intrinsic_instr *intrinsic1 = nir_instr_as_intrinsic(instr1);
+      nir_intrinsic_instr *intrinsic2 = nir_instr_as_intrinsic(instr2);
+      const nir_intrinsic_info *info =
+         &nir_intrinsic_infos[intrinsic1->intrinsic];
+
+      if (intrinsic1->intrinsic != intrinsic2->intrinsic ||
+          intrinsic1->num_components != intrinsic2->num_components)
+         return false;
+
+      if (info->has_dest && intrinsic1->dest.ssa.num_components !=
+                            intrinsic2->dest.ssa.num_components)
+         return false;
+
+      for (unsigned i = 0; i < info->num_srcs; i++) {
+         if (!nir_srcs_equal(intrinsic1->src[i], intrinsic2->src[i]))
+            return false;
+      }
+
+      assert(info->num_variables == 0);
+
+      for (unsigned i = 0; i < info->num_indices; i++) {
+         if (intrinsic1->const_index[i] != intrinsic2->const_index[i])
+            return false;
+      }
+
+      return true;
+   }
+   case nir_instr_type_call:
+   case nir_instr_type_jump:
+   case nir_instr_type_ssa_undef:
+   case nir_instr_type_parallel_copy:
+   default:
+      unreachable("Invalid instruction type");
+   }
+
+   return false;
+}
+
+static bool
+src_is_ssa(nir_src *src, void *data)
+{
+   (void) data;
+   return src->is_ssa;
+}
+
+static bool
+dest_is_ssa(nir_dest *dest, void *data)
+{
+   (void) data;
+   return dest->is_ssa;
+}
+
+/* This function determines if uses of an instruction can safely be rewritten
+ * to use another identical instruction instead. Note that this function must
+ * be kept in sync with hash_instr() and nir_instrs_equal() -- only
+ * instructions that pass this test will be handed on to those functions, and
+ * conversely they must handle everything that this function returns true for.
+ */
+
+static bool
+instr_can_rewrite(nir_instr *instr)
+{
+   /* We only handle SSA. */
+   if (!nir_foreach_dest(instr, dest_is_ssa, NULL) ||
+       !nir_foreach_src(instr, src_is_ssa, NULL))
+      return false;
+
+   switch (instr->type) {
+   case nir_instr_type_alu:
+   case nir_instr_type_load_const:
+   case nir_instr_type_phi:
+      return true;
+   case nir_instr_type_tex: {
+      nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+      /* Don't support un-lowered sampler derefs currently. */
+      if (tex->sampler)
+         return false;
+
+      return true;
+   }
+   case nir_instr_type_intrinsic: {
+      const nir_intrinsic_info *info =
+         &nir_intrinsic_infos[nir_instr_as_intrinsic(instr)->intrinsic];
+      return (info->flags & NIR_INTRINSIC_CAN_ELIMINATE) &&
+             (info->flags & NIR_INTRINSIC_CAN_REORDER) &&
+             info->num_variables == 0; /* not implemented yet */
+   }
+   case nir_instr_type_call:
+   case nir_instr_type_jump:
+   case nir_instr_type_ssa_undef:
+      return false;
+   case nir_instr_type_parallel_copy:
+   default:
+      unreachable("Invalid instruction type");
+   }
+
+   return false;
+}
+
+static nir_ssa_def *
+nir_instr_get_dest_ssa_def(nir_instr *instr)
+{
+   switch (instr->type) {
+   case nir_instr_type_alu:
+      assert(nir_instr_as_alu(instr)->dest.dest.is_ssa);
+      return &nir_instr_as_alu(instr)->dest.dest.ssa;
+   case nir_instr_type_load_const:
+      return &nir_instr_as_load_const(instr)->def;
+   case nir_instr_type_phi:
+      assert(nir_instr_as_phi(instr)->dest.is_ssa);
+      return &nir_instr_as_phi(instr)->dest.ssa;
+   case nir_instr_type_intrinsic:
+      assert(nir_instr_as_intrinsic(instr)->dest.is_ssa);
+      return &nir_instr_as_intrinsic(instr)->dest.ssa;
+   case nir_instr_type_tex:
+      assert(nir_instr_as_tex(instr)->dest.is_ssa);
+      return &nir_instr_as_tex(instr)->dest.ssa;
+   default:
+      unreachable("We never ask for any of these");
+   }
+}
+
+static bool
+cmp_func(const void *data1, const void *data2)
+{
+   return nir_instrs_equal(data1, data2);
+}
+
+struct set *
+nir_instr_set_create(void *mem_ctx)
+{
+   return _mesa_set_create(mem_ctx, hash_instr, cmp_func);
+}
+
+void
+nir_instr_set_destroy(struct set *instr_set)
+{
+   _mesa_set_destroy(instr_set, NULL);
+}
+
+bool
+nir_instr_set_add_or_rewrite(struct set *instr_set, nir_instr *instr)
+{
+   if (!instr_can_rewrite(instr))
+      return false;
+
+   struct set_entry *entry = _mesa_set_search(instr_set, instr);
+   if (entry) {
+      nir_ssa_def *def = nir_instr_get_dest_ssa_def(instr);
+      nir_ssa_def *new_def =
+         nir_instr_get_dest_ssa_def((nir_instr *) entry->key);
+      nir_ssa_def_rewrite_uses(def, nir_src_for_ssa(new_def));
+      return true;
+   }
+
+   _mesa_set_add(instr_set, instr);
+   return false;
+}
+
+void
+nir_instr_set_remove(struct set *instr_set, nir_instr *instr)
+{
+   if (!instr_can_rewrite(instr))
+      return;
+
+   struct set_entry *entry = _mesa_set_search(instr_set, instr);
+   if (entry)
+      _mesa_set_remove(instr_set, entry);
+}
+
diff --git a/src/glsl/nir/nir_instr_set.h b/src/glsl/nir/nir_instr_set.h
new file mode 100644
index 00000000000..939e8ddbf58
--- /dev/null
+++ b/src/glsl/nir/nir_instr_set.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright © 2014 Connor Abbott
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "nir.h"
+
+/**
+ * This file defines functions for creating, destroying, and manipulating an
+ * "instruction set," which is an abstraction for finding duplicate
+ * instructions using a hash set. Note that the question of whether an
+ * instruction is actually a duplicate (e.g. whether it has any side effects)
+ * is handled transparently. The user can pass any instruction to
+ * nir_instr_set_add_or_rewrite() and nir_instr_set_remove(), and if the
+ * instruction isn't safe to rewrite or isn't supported, it's silently
+ * ignored.
+ */
+
+/*@{*/
+
+/** Creates an instruction set, using a given ralloc mem_ctx */
+struct set *nir_instr_set_create(void *mem_ctx);
+
+/** Destroys an instruction set. */
+void nir_instr_set_destroy(struct set *instr_set);
+
+/**
+ * Adds an instruction to an instruction set if it doesn't exist, or if it
+ * does already exist, rewrites all uses of it to point to the other
+ * already-inserted instruction. Returns 'true' if the uses of the instruction
+ * were rewritten.
+ */
+bool nir_instr_set_add_or_rewrite(struct set *instr_set, nir_instr *instr);
+
+/**
+ * Removes an instruction from an instruction set, so that other instructions
+ * won't be merged with it.
+ */
+void nir_instr_set_remove(struct set *instr_set, nir_instr *instr);
+
+/*@}*/
+
diff --git a/src/glsl/nir/nir_intrinsics.h b/src/glsl/nir/nir_intrinsics.h
index b5a0d715aa3..68a18b9c11a 100644
--- a/src/glsl/nir/nir_intrinsics.h
+++ b/src/glsl/nir/nir_intrinsics.h
@@ -174,8 +174,10 @@ INTRINSIC(image_samples, 0, ARR(), true, 1, 1, 0,
  * 3: For CompSwap only: the second data parameter.
  */
 INTRINSIC(ssbo_atomic_add, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
-INTRINSIC(ssbo_atomic_min, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
-INTRINSIC(ssbo_atomic_max, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
+INTRINSIC(ssbo_atomic_imin, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
+INTRINSIC(ssbo_atomic_umin, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
+INTRINSIC(ssbo_atomic_imax, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
+INTRINSIC(ssbo_atomic_umax, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
 INTRINSIC(ssbo_atomic_and, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
 INTRINSIC(ssbo_atomic_or, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
 INTRINSIC(ssbo_atomic_xor, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
diff --git a/src/glsl/nir/nir_lower_atomics.c b/src/glsl/nir/nir_lower_atomics.c
index 6f9ecc019ec..46e137652a1 100644
--- a/src/glsl/nir/nir_lower_atomics.c
+++ b/src/glsl/nir/nir_lower_atomics.c
@@ -72,20 +72,22 @@ lower_instr(nir_intrinsic_instr *instr, nir_function_impl *impl)
 
    nir_ssa_def *offset_def = &offset_const->def;
 
-   if (instr->variables[0]->deref.child != NULL) {
-      assert(instr->variables[0]->deref.child->deref_type ==
-             nir_deref_type_array);
-      nir_deref_array *deref_array =
-         nir_deref_as_array(instr->variables[0]->deref.child);
-      assert(deref_array->deref.child == NULL);
+   nir_deref *tail = &instr->variables[0]->deref;
+   while (tail->child != NULL) {
+      assert(tail->child->deref_type == nir_deref_type_array);
+      nir_deref_array *deref_array = nir_deref_as_array(tail->child);
+      tail = tail->child;
 
-      offset_const->value.u[0] +=
-         deref_array->base_offset * ATOMIC_COUNTER_SIZE;
+      unsigned child_array_elements = tail->child != NULL ?
+         glsl_get_aoa_size(tail->type) : 1;
+
+      offset_const->value.u[0] += deref_array->base_offset *
+         child_array_elements * ATOMIC_COUNTER_SIZE;
 
       if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
          nir_load_const_instr *atomic_counter_size =
                nir_load_const_instr_create(mem_ctx, 1);
-         atomic_counter_size->value.u[0] = ATOMIC_COUNTER_SIZE;
+         atomic_counter_size->value.u[0] = child_array_elements * ATOMIC_COUNTER_SIZE;
          nir_instr_insert_before(&instr->instr, &atomic_counter_size->instr);
 
          nir_alu_instr *mul = nir_alu_instr_create(mem_ctx, nir_op_imul);
@@ -102,7 +104,7 @@ lower_instr(nir_intrinsic_instr *instr, nir_function_impl *impl)
          add->src[0].src.is_ssa = true;
          add->src[0].src.ssa = &mul->dest.dest.ssa;
          add->src[1].src.is_ssa = true;
-         add->src[1].src.ssa = &offset_const->def;
+         add->src[1].src.ssa = offset_def;
          nir_instr_insert_before(&instr->instr, &add->instr);
 
          offset_def = &add->dest.dest.ssa;
diff --git a/src/glsl/nir/nir_opt_cse.c b/src/glsl/nir/nir_opt_cse.c
index 64c94afd480..93a6635337a 100644
--- a/src/glsl/nir/nir_opt_cse.c
+++ b/src/glsl/nir/nir_opt_cse.c
@@ -22,306 +22,60 @@
  *
  * Authors:
  *    Jason Ekstrand ([email protected])
+ *    Connor Abbott ([email protected])
  *
  */
 
-#include "nir.h"
+#include "nir_instr_set.h"
 
 /*
  * Implements common subexpression elimination
  */
 
-struct cse_state {
-   void *mem_ctx;
-   bool progress;
-};
-
-static bool
-nir_alu_srcs_equal(nir_alu_instr *alu1, nir_alu_instr *alu2, unsigned src1,
-                   unsigned src2)
-{
-   if (alu1->src[src1].abs != alu2->src[src2].abs ||
-       alu1->src[src1].negate != alu2->src[src2].negate)
-      return false;
-
-   for (unsigned i = 0; i < nir_ssa_alu_instr_src_components(alu1, src1); i++) {
-      if (alu1->src[src1].swizzle[i] != alu2->src[src2].swizzle[i])
-         return false;
-   }
-
-   return nir_srcs_equal(alu1->src[src1].src, alu2->src[src2].src);
-}
-
-static bool
-nir_instrs_equal(nir_instr *instr1, nir_instr *instr2)
-{
-   if (instr1->type != instr2->type)
-      return false;
-
-   switch (instr1->type) {
-   case nir_instr_type_alu: {
-      nir_alu_instr *alu1 = nir_instr_as_alu(instr1);
-      nir_alu_instr *alu2 = nir_instr_as_alu(instr2);
-
-      if (alu1->op != alu2->op)
-         return false;
-
-      /* TODO: We can probably acutally do something more inteligent such
-       * as allowing different numbers and taking a maximum or something
-       * here */
-      if (alu1->dest.dest.ssa.num_components != alu2->dest.dest.ssa.num_components)
-         return false;
-
-      if (nir_op_infos[alu1->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) {
-         assert(nir_op_infos[alu1->op].num_inputs == 2);
-         return (nir_alu_srcs_equal(alu1, alu2, 0, 0) &&
-                 nir_alu_srcs_equal(alu1, alu2, 1, 1)) ||
-                (nir_alu_srcs_equal(alu1, alu2, 0, 1) &&
-                 nir_alu_srcs_equal(alu1, alu2, 1, 0));
-      } else {
-         for (unsigned i = 0; i < nir_op_infos[alu1->op].num_inputs; i++) {
-            if (!nir_alu_srcs_equal(alu1, alu2, i, i))
-               return false;
-         }
-      }
-      return true;
-   }
-   case nir_instr_type_tex: {
-      nir_tex_instr *tex1 = nir_instr_as_tex(instr1);
-      nir_tex_instr *tex2 = nir_instr_as_tex(instr2);
-
-      if (tex1->op != tex2->op)
-         return false;
-
-      if (tex1->num_srcs != tex2->num_srcs)
-         return false;
-      for (unsigned i = 0; i < tex1->num_srcs; i++) {
-         if (tex1->src[i].src_type != tex2->src[i].src_type ||
-             !nir_srcs_equal(tex1->src[i].src, tex2->src[i].src)) {
-            return false;
-         }
-      }
-
-      if (tex1->coord_components != tex2->coord_components ||
-          tex1->sampler_dim != tex2->sampler_dim ||
-          tex1->is_array != tex2->is_array ||
-          tex1->is_shadow != tex2->is_shadow ||
-          tex1->is_new_style_shadow != tex2->is_new_style_shadow ||
-          memcmp(tex1->const_offset, tex2->const_offset,
-                 sizeof(tex1->const_offset)) != 0 ||
-          tex1->component != tex2->component ||
-         tex1->sampler_index != tex2->sampler_index ||
-         tex1->sampler_array_size != tex2->sampler_array_size) {
-         return false;
-      }
-
-      /* Don't support un-lowered sampler derefs currently. */
-      if (tex1->sampler || tex2->sampler)
-         return false;
-
-      return true;
-   }
-   case nir_instr_type_load_const: {
-      nir_load_const_instr *load1 = nir_instr_as_load_const(instr1);
-      nir_load_const_instr *load2 = nir_instr_as_load_const(instr2);
-
-      if (load1->def.num_components != load2->def.num_components)
-         return false;
-
-      return memcmp(load1->value.f, load2->value.f,
-                    load1->def.num_components * sizeof(*load2->value.f)) == 0;
-   }
-   case nir_instr_type_phi: {
-      nir_phi_instr *phi1 = nir_instr_as_phi(instr1);
-      nir_phi_instr *phi2 = nir_instr_as_phi(instr2);
-
-      if (phi1->instr.block != phi2->instr.block)
-         return false;
-
-      nir_foreach_phi_src(phi1, src1) {
-         nir_foreach_phi_src(phi2, src2) {
-            if (src1->pred == src2->pred) {
-               if (!nir_srcs_equal(src1->src, src2->src))
-                  return false;
-
-               break;
-            }
-         }
-      }
-
-      return true;
-   }
-   case nir_instr_type_intrinsic: {
-      nir_intrinsic_instr *intrinsic1 = nir_instr_as_intrinsic(instr1);
-      nir_intrinsic_instr *intrinsic2 = nir_instr_as_intrinsic(instr2);
-      const nir_intrinsic_info *info =
-         &nir_intrinsic_infos[intrinsic1->intrinsic];
-
-      if (intrinsic1->intrinsic != intrinsic2->intrinsic ||
-          intrinsic1->num_components != intrinsic2->num_components)
-         return false;
-
-      if (info->has_dest && intrinsic1->dest.ssa.num_components !=
-                            intrinsic2->dest.ssa.num_components)
-         return false;
-
-      for (unsigned i = 0; i < info->num_srcs; i++) {
-         if (!nir_srcs_equal(intrinsic1->src[i], intrinsic2->src[i]))
-            return false;
-      }
-
-      assert(info->num_variables == 0);
-
-      for (unsigned i = 0; i < info->num_indices; i++) {
-         if (intrinsic1->const_index[i] != intrinsic2->const_index[i])
-            return false;
-      }
-
-      return true;
-   }
-   case nir_instr_type_call:
-   case nir_instr_type_jump:
-   case nir_instr_type_ssa_undef:
-   case nir_instr_type_parallel_copy:
-   default:
-      unreachable("Invalid instruction type");
-   }
-
-   return false;
-}
-
-static bool
-src_is_ssa(nir_src *src, void *data)
-{
-   (void) data;
-   return src->is_ssa;
-}
-
-static bool
-dest_is_ssa(nir_dest *dest, void *data)
-{
-   (void) data;
-   return dest->is_ssa;
-}
+/*
+ * Visits and CSE's the given block and all its descendants in the dominance
+ * tree recursively. Note that the instr_set is guaranteed to only ever
+ * contain instructions that dominate the current block.
+ */
 
 static bool
-nir_instr_can_cse(nir_instr *instr)
-{
-   /* We only handle SSA. */
-   if (!nir_foreach_dest(instr, dest_is_ssa, NULL) ||
-       !nir_foreach_src(instr, src_is_ssa, NULL))
-      return false;
-
-   switch (instr->type) {
-   case nir_instr_type_alu:
-   case nir_instr_type_tex:
-   case nir_instr_type_load_const:
-   case nir_instr_type_phi:
-      return true;
-   case nir_instr_type_intrinsic: {
-      const nir_intrinsic_info *info =
-         &nir_intrinsic_infos[nir_instr_as_intrinsic(instr)->intrinsic];
-      return (info->flags & NIR_INTRINSIC_CAN_ELIMINATE) &&
-             (info->flags & NIR_INTRINSIC_CAN_REORDER) &&
-             info->num_variables == 0; /* not implemented yet */
-   }
-   case nir_instr_type_call:
-   case nir_instr_type_jump:
-   case nir_instr_type_ssa_undef:
-      return false;
-   case nir_instr_type_parallel_copy:
-   default:
-      unreachable("Invalid instruction type");
-   }
-
-   return false;
-}
-
-static nir_ssa_def *
-nir_instr_get_dest_ssa_def(nir_instr *instr)
+cse_block(nir_block *block, struct set *instr_set)
 {
-   switch (instr->type) {
-   case nir_instr_type_alu:
-      assert(nir_instr_as_alu(instr)->dest.dest.is_ssa);
-      return &nir_instr_as_alu(instr)->dest.dest.ssa;
-   case nir_instr_type_tex:
-      assert(nir_instr_as_tex(instr)->dest.is_ssa);
-      return &nir_instr_as_tex(instr)->dest.ssa;
-   case nir_instr_type_load_const:
-      return &nir_instr_as_load_const(instr)->def;
-   case nir_instr_type_phi:
-      assert(nir_instr_as_phi(instr)->dest.is_ssa);
-      return &nir_instr_as_phi(instr)->dest.ssa;
-   case nir_instr_type_intrinsic:
-      assert(nir_instr_as_intrinsic(instr)->dest.is_ssa);
-      return &nir_instr_as_intrinsic(instr)->dest.ssa;
-   default:
-      unreachable("We never ask for any of these");
-   }
-}
-
-static void
-nir_opt_cse_instr(nir_instr *instr, struct cse_state *state)
-{
-   if (!nir_instr_can_cse(instr))
-      return;
+   bool progress = false;
 
-   for (struct exec_node *node = instr->node.prev;
-        !exec_node_is_head_sentinel(node); node = node->prev) {
-      nir_instr *other = exec_node_data(nir_instr, node, node);
-      if (nir_instrs_equal(instr, other)) {
-         nir_ssa_def *other_def = nir_instr_get_dest_ssa_def(other);
-         nir_ssa_def_rewrite_uses(nir_instr_get_dest_ssa_def(instr),
-                                  nir_src_for_ssa(other_def));
+   nir_foreach_instr_safe(block, instr) {
+      if (nir_instr_set_add_or_rewrite(instr_set, instr)) {
+         progress = true;
          nir_instr_remove(instr);
-         state->progress = true;
-         return;
       }
    }
 
-   for (nir_block *block = instr->block->imm_dom;
-        block != NULL; block = block->imm_dom) {
-      nir_foreach_instr_reverse(block, other) {
-         if (nir_instrs_equal(instr, other)) {
-            nir_ssa_def *other_def = nir_instr_get_dest_ssa_def(other);
-            nir_ssa_def_rewrite_uses(nir_instr_get_dest_ssa_def(instr),
-                                     nir_src_for_ssa(other_def));
-            nir_instr_remove(instr);
-            state->progress = true;
-            return;
-         }
-      }
+   for (unsigned i = 0; i < block->num_dom_children; i++) {
+      nir_block *child = block->dom_children[i];
+      progress |= cse_block(child, instr_set);
    }
-}
-
-static bool
-nir_opt_cse_block(nir_block *block, void *void_state)
-{
-   struct cse_state *state = void_state;
 
-   nir_foreach_instr_safe(block, instr)
-      nir_opt_cse_instr(instr, state);
+   nir_foreach_instr(block, instr)
+     nir_instr_set_remove(instr_set, instr);
 
-   return true;
+   return progress;
 }
 
 static bool
 nir_opt_cse_impl(nir_function_impl *impl)
 {
-   struct cse_state state;
-
-   state.mem_ctx = ralloc_parent(impl);
-   state.progress = false;
+   struct set *instr_set = nir_instr_set_create(NULL);
 
    nir_metadata_require(impl, nir_metadata_dominance);
 
-   nir_foreach_block(impl, nir_opt_cse_block, &state);
+   bool progress = cse_block(nir_start_block(impl), instr_set);
 
-   if (state.progress)
+   if (progress)
       nir_metadata_preserve(impl, nir_metadata_block_index |
                                   nir_metadata_dominance);
 
-   return state.progress;
+   nir_instr_set_destroy(instr_set);
+   return progress;
 }
 
 bool
@@ -336,3 +90,4 @@ nir_opt_cse(nir_shader *shader)
 
    return progress;
 }
+
diff --git a/src/glsl/nir/nir_sweep.c b/src/glsl/nir/nir_sweep.c
index b6ce43b5224..5a22f509f50 100644
--- a/src/glsl/nir/nir_sweep.c
+++ b/src/glsl/nir/nir_sweep.c
@@ -155,6 +155,8 @@ nir_sweep(nir_shader *nir)
    ralloc_adopt(rubbish, nir);
 
    ralloc_steal(nir, (char *)nir->info.name);
+   if (nir->info.label)
+      ralloc_steal(nir, (char *)nir->info.label);
 
    /* Variables and registers are not dead.  Steal them back. */
    steal_list(nir, nir_variable, &nir->uniforms);
diff --git a/src/glsl/nir/nir_types.cpp b/src/glsl/nir/nir_types.cpp
index 01f0e9b5abc..4a1250e546c 100644
--- a/src/glsl/nir/nir_types.cpp
+++ b/src/glsl/nir/nir_types.cpp
@@ -118,6 +118,12 @@ glsl_get_length(const struct glsl_type *type)
    return type->is_matrix() ? type->matrix_columns : type->length;
 }
 
+unsigned /* C-callable wrapper around glsl_type::arrays_of_arrays_size() */
+glsl_get_aoa_size(const struct glsl_type *type)
+{
+   return type->arrays_of_arrays_size(); /* result is returned unchanged */
+}
+
 const char *
 glsl_get_struct_elem_name(const struct glsl_type *type, unsigned index)
 {
diff --git a/src/glsl/nir/nir_types.h b/src/glsl/nir/nir_types.h
index 1a0cb1fb774..a61af6cba75 100644
--- a/src/glsl/nir/nir_types.h
+++ b/src/glsl/nir/nir_types.h
@@ -31,7 +31,7 @@
 
 /* C wrapper around glsl_types.h */
 
-#include "../glsl_types.h"
+#include "glsl_types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -65,6 +65,8 @@ unsigned glsl_get_matrix_columns(const struct glsl_type *type);
 
 unsigned glsl_get_length(const struct glsl_type *type);
 
+unsigned glsl_get_aoa_size(const struct glsl_type *type);
+
 const char *glsl_get_struct_elem_name(const struct glsl_type *type,
                                       unsigned index);
 
diff --git a/src/glsl/shader_enums.c b/src/glsl/nir/shader_enums.c
index c196b791d4f..66a25e72344 100644
--- a/src/glsl/shader_enums.c
+++ b/src/glsl/nir/shader_enums.c
@@ -26,8 +26,9 @@
  *    Rob Clark <[email protected]>
  */
 
-#include "glsl/shader_enums.h"
+#include "shader_enums.h"
 #include "util/macros.h"
+#include "mesa/main/config.h"
 
 #define ENUM(x) [x] = #x
 #define NAME(val) ((((val) < ARRAY_SIZE(names)) && names[(val)]) ? names[(val)] : "UNKNOWN")
@@ -42,6 +43,7 @@ const char * gl_shader_stage_name(gl_shader_stage stage)
       ENUM(MESA_SHADER_FRAGMENT),
       ENUM(MESA_SHADER_COMPUTE),
    };
+   STATIC_ASSERT(ARRAY_SIZE(names) == MESA_SHADER_STAGES);
    return NAME(stage);
 }
 
@@ -82,6 +84,7 @@ const char * gl_vert_attrib_name(gl_vert_attrib attrib)
       ENUM(VERT_ATTRIB_GENERIC14),
       ENUM(VERT_ATTRIB_GENERIC15),
    };
+   STATIC_ASSERT(ARRAY_SIZE(names) == VERT_ATTRIB_MAX);
    return NAME(attrib);
 }
 
@@ -147,6 +150,7 @@ const char * gl_varying_slot_name(gl_varying_slot slot)
       ENUM(VARYING_SLOT_VAR30),
       ENUM(VARYING_SLOT_VAR31),
    };
+   STATIC_ASSERT(ARRAY_SIZE(names) == VARYING_SLOT_MAX);
    return NAME(slot);
 }
 
@@ -169,8 +173,10 @@ const char * gl_system_value_name(gl_system_value sysval)
      ENUM(SYSTEM_VALUE_TESS_LEVEL_INNER),
      ENUM(SYSTEM_VALUE_LOCAL_INVOCATION_ID),
      ENUM(SYSTEM_VALUE_WORK_GROUP_ID),
+     ENUM(SYSTEM_VALUE_NUM_WORK_GROUPS),
      ENUM(SYSTEM_VALUE_VERTEX_CNT),
    };
+   STATIC_ASSERT(ARRAY_SIZE(names) == SYSTEM_VALUE_MAX);
    return NAME(sysval);
 }
 
@@ -182,6 +188,7 @@ const char * glsl_interp_qualifier_name(enum glsl_interp_qualifier qual)
       ENUM(INTERP_QUALIFIER_FLAT),
       ENUM(INTERP_QUALIFIER_NOPERSPECTIVE),
    };
+   STATIC_ASSERT(ARRAY_SIZE(names) == INTERP_QUALIFIER_COUNT);
    return NAME(qual);
 }
 
@@ -201,5 +208,6 @@ const char * gl_frag_result_name(gl_frag_result result)
       ENUM(FRAG_RESULT_DATA6),
       ENUM(FRAG_RESULT_DATA7),
    };
+   STATIC_ASSERT(ARRAY_SIZE(names) == FRAG_RESULT_MAX);
    return NAME(result);
 }
diff --git a/src/glsl/shader_enums.h b/src/glsl/nir/shader_enums.h
index 2a5d2c5bfa7..d1cf7ca04cc 100644
--- a/src/glsl/shader_enums.h
+++ b/src/glsl/nir/shader_enums.h
@@ -233,6 +233,11 @@ typedef enum
    VARYING_SLOT_VAR31,
 } gl_varying_slot;
 
+
+#define VARYING_SLOT_MAX	(VARYING_SLOT_VAR0 + MAX_VARYING)
+#define VARYING_SLOT_PATCH0	(VARYING_SLOT_MAX)
+#define VARYING_SLOT_TESS_MAX	(VARYING_SLOT_PATCH0 + MAX_VARYING)
+
 const char * gl_varying_slot_name(gl_varying_slot slot);
 
 /**
@@ -473,4 +478,23 @@ typedef enum
 
 const char * gl_frag_result_name(gl_frag_result result);
 
+#define FRAG_RESULT_MAX		(FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS)
+
+/**
+ * \brief Layout qualifiers for gl_FragDepth.
+ *
+ * Extension AMD_conservative_depth allows gl_FragDepth to be redeclared with
+ * a layout qualifier.
+ *
+ * \see enum ir_depth_layout
+ */
+enum gl_frag_depth_layout
+{
+   FRAG_DEPTH_LAYOUT_NONE, /**< No layout is specified. */
+   FRAG_DEPTH_LAYOUT_ANY,
+   FRAG_DEPTH_LAYOUT_GREATER,
+   FRAG_DEPTH_LAYOUT_LESS,
+   FRAG_DEPTH_LAYOUT_UNCHANGED
+};
+
 #endif /* SHADER_ENUMS_H */
diff --git a/src/glsl/opt_dead_code.cpp b/src/glsl/opt_dead_code.cpp
index 2cb7f41adef..c5be166e75a 100644
--- a/src/glsl/opt_dead_code.cpp
+++ b/src/glsl/opt_dead_code.cpp
@@ -75,24 +75,35 @@ do_dead_code(exec_list *instructions, bool uniform_locations_assigned)
 	  || !entry->declaration)
 	 continue;
 
-      if (entry->assign) {
-	 /* Remove a single dead assignment to the variable we found.
-	  * Don't do so if it's a shader or function output or a shader
-	  * storage variable though.
+      if (!entry->assign_list.is_empty()) {
+	 /* Remove all the dead assignments to the variable we found.
+	  * Don't do so for function/shader outputs or shader storage, though.
 	  */
 	 if (entry->var->data.mode != ir_var_function_out &&
 	     entry->var->data.mode != ir_var_function_inout &&
              entry->var->data.mode != ir_var_shader_out &&
              entry->var->data.mode != ir_var_shader_storage) {
-	    entry->assign->remove();
-	    progress = true;
 
-	    if (debug) {
-	       printf("Removed assignment to %s@%p\n",
-		      entry->var->name, (void *) entry->var);
-	    }
+            while (!entry->assign_list.is_empty()) {
+               struct assignment_entry *assignment_entry =
+                  exec_node_data(struct assignment_entry,
+                                 entry->assign_list.head, link);
+
+	       assignment_entry->assign->remove();
+
+	       if (debug) {
+	          printf("Removed assignment to %s@%p\n",
+		         entry->var->name, (void *) entry->var);
+               }
+
+               assignment_entry->link.remove();
+               free(assignment_entry);
+            }
+            progress = true;
 	 }
-      } else {
+      }
+
+      if (entry->assign_list.is_empty()) {
 	 /* If there are no assignments or references to the variable left,
 	  * then we can remove its declaration.
 	  */
@@ -103,7 +114,7 @@ do_dead_code(exec_list *instructions, bool uniform_locations_assigned)
 	  */
          if (entry->var->data.mode == ir_var_uniform ||
              entry->var->data.mode == ir_var_shader_storage) {
-            if (uniform_locations_assigned || entry->var->constant_value)
+            if (uniform_locations_assigned || entry->var->constant_initializer)
                continue;
 
             /* Section 2.11.6 (Uniform Variables) of the OpenGL ES 3.0.3 spec
diff --git a/src/glsl/opt_tree_grafting.cpp b/src/glsl/opt_tree_grafting.cpp
index a7a219c55ca..e38a0e93058 100644
--- a/src/glsl/opt_tree_grafting.cpp
+++ b/src/glsl/opt_tree_grafting.cpp
@@ -373,8 +373,6 @@ tree_grafting_basic_block(ir_instruction *bb_first,
 	  entry->referenced_count != 2)
 	 continue;
 
-      assert(assign == entry->assign);
-
       /* Found a possibly graftable assignment.  Now, walk through the
        * rest of the BB seeing if the deref is here, and if nothing interfered with
        * pasting its expression's values in between.
diff --git a/src/glsl/standalone_scaffolding.cpp b/src/glsl/standalone_scaffolding.cpp
index 05140192893..3a95360eda6 100644
--- a/src/glsl/standalone_scaffolding.cpp
+++ b/src/glsl/standalone_scaffolding.cpp
@@ -113,9 +113,18 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
    ralloc_free(shProg->InfoLog);
    shProg->InfoLog = ralloc_strdup(shProg, "");
 
+   ralloc_free(shProg->BufferInterfaceBlocks);
+   shProg->BufferInterfaceBlocks = NULL;
+   shProg->NumBufferInterfaceBlocks = 0;
+
    ralloc_free(shProg->UniformBlocks);
    shProg->UniformBlocks = NULL;
-   shProg->NumBufferInterfaceBlocks = 0;
+   shProg->NumUniformBlocks = 0;
+
+   ralloc_free(shProg->ShaderStorageBlocks);
+   shProg->ShaderStorageBlocks = NULL;
+   shProg->NumShaderStorageBlocks = 0;
+
    for (i = 0; i < MESA_SHADER_STAGES; i++) {
       ralloc_free(shProg->UniformBlockStageIndex[i]);
       shProg->UniformBlockStageIndex[i] = NULL;
diff --git a/src/mesa/Android.libmesa_dricore.mk b/src/mesa/Android.libmesa_dricore.mk
index 2e308b83733..cd31e148222 100644
--- a/src/mesa/Android.libmesa_dricore.mk
+++ b/src/mesa/Android.libmesa_dricore.mk
@@ -50,7 +50,7 @@ endif # MESA_ENABLE_ASM
 ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
 LOCAL_SRC_FILES += \
 	main/streaming-load-memcpy.c \
-	mesa/main/sse_minmax.c
+	main/sse_minmax.c
 LOCAL_CFLAGS := \
 	-msse4.1 \
        -DUSE_SSE41
@@ -60,6 +60,7 @@ LOCAL_C_INCLUDES := \
 	$(MESA_TOP)/src/mapi \
 	$(MESA_TOP)/src/mesa/main \
 	$(MESA_TOP)/src/glsl \
+	$(MESA_TOP)/src/glsl/nir \
 	$(MESA_TOP)/src/gallium/include \
 	$(MESA_TOP)/src/gallium/auxiliary
 
diff --git a/src/mesa/Android.libmesa_glsl_utils.mk b/src/mesa/Android.libmesa_glsl_utils.mk
index ed620ac648c..9e150eaa3c0 100644
--- a/src/mesa/Android.libmesa_glsl_utils.mk
+++ b/src/mesa/Android.libmesa_glsl_utils.mk
@@ -37,6 +37,7 @@ LOCAL_MODULE := libmesa_glsl_utils
 
 LOCAL_C_INCLUDES := \
 	$(MESA_TOP)/src/glsl \
+	$(MESA_TOP)/src/glsl/nir \
 	$(MESA_TOP)/src/mapi \
 	$(MESA_TOP)/src/gallium/include \
 	$(MESA_TOP)/src/gallium/auxiliary
@@ -62,6 +63,7 @@ LOCAL_CFLAGS := -D_POSIX_C_SOURCE=199309L
 
 LOCAL_C_INCLUDES := \
 	$(MESA_TOP)/src/glsl \
+	$(MESA_TOP)/src/glsl/nir \
 	$(MESA_TOP)/src/mapi \
 	$(MESA_TOP)/src/gallium/include \
 	$(MESA_TOP)/src/gallium/auxiliary
diff --git a/src/mesa/Android.libmesa_st_mesa.mk b/src/mesa/Android.libmesa_st_mesa.mk
index b4b7fd97722..427a35f4f6e 100644
--- a/src/mesa/Android.libmesa_st_mesa.mk
+++ b/src/mesa/Android.libmesa_st_mesa.mk
@@ -55,6 +55,7 @@ LOCAL_C_INCLUDES := \
 	$(MESA_TOP)/src/mapi \
 	$(MESA_TOP)/src/mesa/main \
 	$(MESA_TOP)/src/glsl \
+	$(MESA_TOP)/src/glsl/nir \
 	$(MESA_TOP)/src/gallium/auxiliary \
 	$(MESA_TOP)/src/gallium/include
 
diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
index 0915594cea6..34fb4461985 100644
--- a/src/mesa/Makefile.sources
+++ b/src/mesa/Makefile.sources
@@ -415,6 +415,7 @@ STATETRACKER_FILES = \
 	state_tracker/st_cache.h \
 	state_tracker/st_cb_bitmap.c \
 	state_tracker/st_cb_bitmap.h \
+	state_tracker/st_cb_bitmap_shader.c \
 	state_tracker/st_cb_blit.c \
 	state_tracker/st_cb_blit.h \
 	state_tracker/st_cb_bufferobjects.c \
@@ -425,6 +426,7 @@ STATETRACKER_FILES = \
 	state_tracker/st_cb_condrender.h \
 	state_tracker/st_cb_drawpixels.c \
 	state_tracker/st_cb_drawpixels.h \
+	state_tracker/st_cb_drawpixels_shader.c \
 	state_tracker/st_cb_drawtex.c \
 	state_tracker/st_cb_drawtex.h \
 	state_tracker/st_cb_eglimage.c \
@@ -525,9 +527,7 @@ PROGRAM_FILES = \
 	program/sampler.h \
 	program/string_to_uint_map.cpp \
 	program/symbol_table.c \
-	program/symbol_table.h \
-	../glsl/shader_enums.c \
-	../glsl/shader_enums.h
+	program/symbol_table.h
 
 PROGRAM_NIR_FILES = \
 	program/prog_to_nir.c \
@@ -620,6 +620,7 @@ INCLUDE_DIRS = \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src \
 	-I$(top_srcdir)/src/glsl \
+	-I$(top_srcdir)/src/glsl/nir \
 	-I$(top_builddir)/src/glsl \
 	-I$(top_builddir)/src/glsl/nir \
 	-I$(top_srcdir)/src/glsl/glcpp \
diff --git a/src/mesa/SConscript b/src/mesa/SConscript
index 5b80a216fef..c986326d2bf 100644
--- a/src/mesa/SConscript
+++ b/src/mesa/SConscript
@@ -16,6 +16,7 @@ env.Append(CPPPATH = [
     '#/src',
     '#/src/mapi',
     '#/src/glsl',
+    '#/src/glsl/nir',
     '#/src/mesa',
     '#/src/gallium/include',
     '#/src/gallium/auxiliary',
diff --git a/src/mesa/drivers/common/meta_copy_image.c b/src/mesa/drivers/common/meta_copy_image.c
index 33490ee6615..04b9cafe308 100644
--- a/src/mesa/drivers/common/meta_copy_image.c
+++ b/src/mesa/drivers/common/meta_copy_image.c
@@ -108,7 +108,11 @@ make_view(struct gl_context *ctx, struct gl_texture_image *tex_image,
       return false;
    }
 
+   assert(tex_obj->Target != 0);
+   assert(tex_obj->TargetIndex < NUM_TEXTURE_TARGETS);
+
    view_tex_obj->Target = tex_obj->Target;
+   view_tex_obj->TargetIndex = tex_obj->TargetIndex;
 
    *view_tex_image = _mesa_get_tex_image(ctx, view_tex_obj, tex_obj->Target, 0);
 
@@ -129,7 +133,6 @@ make_view(struct gl_context *ctx, struct gl_texture_image *tex_image,
    view_tex_obj->NumLayers = tex_obj->NumLayers;
    view_tex_obj->Immutable = tex_obj->Immutable;
    view_tex_obj->ImmutableLevels = tex_obj->ImmutableLevels;
-   view_tex_obj->Target = tex_obj->Target;
 
    if (ctx->Driver.TextureView != NULL &&
        !ctx->Driver.TextureView(ctx, view_tex_obj, tex_obj)) {
diff --git a/src/mesa/drivers/dri/i915/i915_fragprog.c b/src/mesa/drivers/dri/i915/i915_fragprog.c
index 1a5943c87fb..59d795998c6 100644
--- a/src/mesa/drivers/dri/i915/i915_fragprog.c
+++ b/src/mesa/drivers/dri/i915/i915_fragprog.c
@@ -1315,9 +1315,10 @@ static struct gl_program *
 i915NewProgram(struct gl_context * ctx, GLenum target, GLuint id)
 {
    switch (target) {
-   case GL_VERTEX_PROGRAM_ARB:
-      return _mesa_init_vertex_program(ctx, CALLOC_STRUCT(gl_vertex_program),
-                                       target, id);
+   case GL_VERTEX_PROGRAM_ARB: {
+      struct gl_vertex_program *prog = CALLOC_STRUCT(gl_vertex_program);
+      return _mesa_init_gl_program(&prog->Base, target, id);
+   }
 
    case GL_FRAGMENT_PROGRAM_ARB:{
          struct i915_fragment_program *prog =
@@ -1325,8 +1326,7 @@ i915NewProgram(struct gl_context * ctx, GLenum target, GLuint id)
          if (prog) {
             i915_init_program(I915_CONTEXT(ctx), prog);
 
-            return _mesa_init_fragment_program(ctx, &prog->FragProg,
-                                               target, id);
+            return _mesa_init_gl_program(&prog->FragProg.Base, target, id);
          }
          else
             return NULL;
diff --git a/src/mesa/drivers/dri/i965/Android.mk b/src/mesa/drivers/dri/i965/Android.mk
index a9b963a9eca..d30a053e10f 100644
--- a/src/mesa/drivers/dri/i965/Android.mk
+++ b/src/mesa/drivers/dri/i965/Android.mk
@@ -48,6 +48,7 @@ LOCAL_C_INCLUDES := \
 	$(MESA_DRI_C_INCLUDES)
 
 LOCAL_SRC_FILES := \
+	$(i965_compiler_FILES) \
 	$(i965_FILES)
 
 LOCAL_WHOLE_STATIC_LIBRARIES := \
diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am
index 2e241511049..04b3f9cc8ce 100644
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -33,6 +33,7 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/intel/server \
 	-I$(top_srcdir)/src/gtest/include \
+	-I$(top_srcdir)/src/glsl/nir \
 	-I$(top_builddir)/src/glsl/nir \
 	-I$(top_builddir)/src/mesa/drivers/dri/common \
 	$(DEFINES) \
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index b242ab55aae..ccd540dabca 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -1,6 +1,7 @@
 i965_compiler_FILES = \
 	brw_cfg.cpp \
 	brw_cfg.h \
+	brw_compiler.h \
 	brw_cubemap_normalize.cpp \
 	brw_dead_control_flow.cpp \
 	brw_dead_control_flow.h \
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
index d458ad846bf..5308d175416 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
@@ -32,7 +32,7 @@ brw_blorp_eu_emitter::brw_blorp_eu_emitter(struct brw_context *brw,
      generator(brw->intelScreen->compiler, brw,
                mem_ctx, (void *) rzalloc(mem_ctx, struct brw_wm_prog_key),
                (struct brw_stage_prog_data *) rzalloc(mem_ctx, struct brw_wm_prog_data),
-               NULL, 0, false, "BLORP")
+               0, false, "BLORP")
 {
    if (debug_flag)
       generator.enable_debug("blorp");
diff --git a/src/mesa/drivers/dri/i965/brw_cfg.cpp b/src/mesa/drivers/dri/i965/brw_cfg.cpp
index 91d53eff5a7..10bcd4bafd4 100644
--- a/src/mesa/drivers/dri/i965/brw_cfg.cpp
+++ b/src/mesa/drivers/dri/i965/brw_cfg.cpp
@@ -305,6 +305,10 @@ cfg_t::cfg_t(exec_list *instructions)
 
          assert(cur_do != NULL && cur_while != NULL);
 	 cur->add_successor(mem_ctx, cur_do);
+
+         if (inst->predicate)
+            cur->add_successor(mem_ctx, cur_while);
+
 	 set_next_block(&cur, cur_while, ip);
 
 	 /* Pop the stack so we're in the previous loop */
@@ -422,7 +426,11 @@ cfg_t::dump(backend_shader *s)
       calculate_idom();
 
    foreach_block (block, this) {
-      fprintf(stderr, "START B%d IDOM(B%d)", block->num, block->idom->num);
+      if (block->idom)
+         fprintf(stderr, "START B%d IDOM(B%d)", block->num, block->idom->num);
+      else
+         fprintf(stderr, "START B%d IDOM(none)", block->num);
+
       foreach_list_typed(bblock_link, link, link, &block->parents) {
          fprintf(stderr, " <-B%d",
                  link->block->num);
diff --git a/src/mesa/drivers/dri/i965/brw_clear.c b/src/mesa/drivers/dri/i965/brw_clear.c
index 17a745d0373..b0119558c3a 100644
--- a/src/mesa/drivers/dri/i965/brw_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_clear.c
@@ -241,7 +241,7 @@ brw_clear(struct gl_context *ctx, GLbitfield mask)
    }
 
    /* Clear color buffers with fast clear or at least rep16 writes. */
-   if (brw->gen >= 6 && brw->gen < 9 && (mask & BUFFER_BITS_COLOR)) {
+   if (brw->gen >= 6 && (mask & BUFFER_BITS_COLOR)) {
       if (brw_meta_fast_clear(brw, fb, mask, partial_clear)) {
          debug_mask("blorp color", mask & BUFFER_BITS_COLOR);
          mask &= ~BUFFER_BITS_COLOR;
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
new file mode 100644
index 00000000000..11c485d2f08
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -0,0 +1,661 @@
+/*
+ * Copyright © 2010 - 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "brw_device_info.h"
+#include "main/mtypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ra_regs;
+struct nir_shader;
+struct brw_geometry_program;
+union gl_constant_value;
+
+struct brw_compiler {
+   const struct brw_device_info *devinfo;
+
+   struct {
+      struct ra_regs *regs;
+
+      /**
+       * Array of the ra classes for the unaligned contiguous register
+       * block sizes used.
+       */
+      int *classes;
+
+      /**
+       * Mapping for register-allocated objects in *regs to the first
+       * GRF for that object.
+       */
+      uint8_t *ra_reg_to_grf;
+   } vec4_reg_set;
+
+   struct {
+      struct ra_regs *regs;
+
+      /**
+       * Array of the ra classes for the unaligned contiguous register
+       * block sizes used, indexed by register size.
+       */
+      int classes[16];
+
+      /**
+       * Mapping from classes to ra_reg ranges.  Each of the per-size
+       * classes corresponds to a range of ra_reg nodes.  This array stores
+       * those ranges in the form of first ra_reg in each class and the
+       * total number of ra_reg elements in the last array element.  This
+       * way the range of the i'th class is given by:
+       * [ class_to_ra_reg_range[i], class_to_ra_reg_range[i+1] )
+       */
+      int class_to_ra_reg_range[17];
+
+      /**
+       * Mapping for register-allocated objects in *regs to the first
+       * GRF for that object.
+       */
+      uint8_t *ra_reg_to_grf;
+
+      /**
+       * ra class for the aligned pairs we use for PLN, which doesn't
+       * appear in *classes.
+       */
+      int aligned_pairs_class;
+   } fs_reg_sets[2];
+
+   void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
+   void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
+
+   bool scalar_vs;
+   struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];
+};
+
+
+/**
+ * Program key structures.
+ *
+ * When drawing, we look for the currently bound shaders in the program
+ * cache.  This is essentially a hash table lookup, and these are the keys.
+ *
+ * Sometimes OpenGL features specified as state need to be simulated via
+ * shader code, due to a mismatch between the API and the hardware.  This
+ * is often referred to as "non-orthogonal state" or "NOS".  We store NOS
+ * in the program key so it's considered when searching for a program.  If
+ * we haven't seen a particular combination before, we have to recompile a
+ * new specialized version.
+ *
+ * Shader compilation should not look up state in gl_context directly, but
+ * instead use the copy in the program key.  This guarantees recompiles will
+ * happen correctly.
+ *
+ *  @{
+ */
+
+enum PACKED gen6_gather_sampler_wa {
+   WA_SIGN = 1,      /* whether we need to sign extend */
+   WA_8BIT = 2,      /* if we have an 8bit format needing wa */
+   WA_16BIT = 4,     /* if we have a 16bit format needing wa */
+};
+
+/**
+ * Sampler information needed by VS, WM, and GS program cache keys.
+ */
+struct brw_sampler_prog_key_data {
+   /**
+    * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles.
+    */
+   uint16_t swizzles[MAX_SAMPLERS];
+
+   uint32_t gl_clamp_mask[3];
+
+   /**
+    * For RG32F, gather4's channel select is broken.
+    */
+   uint32_t gather_channel_quirk_mask;
+
+   /**
+    * Whether this sampler uses the compressed multisample surface layout.
+    */
+   uint32_t compressed_multisample_layout_mask;
+
+   /**
+    * For Sandybridge, which shader w/a we need for gather quirks.
+    */
+   enum gen6_gather_sampler_wa gen6_gather_wa[MAX_SAMPLERS];
+};
+
+
+/** The program key for Vertex Shaders. */
+struct brw_vs_prog_key {
+   unsigned program_string_id;
+
+   /*
+    * Per-attribute workaround flags
+    */
+   uint8_t gl_attrib_wa_flags[VERT_ATTRIB_MAX];
+
+   bool copy_edgeflag:1;
+
+   bool clamp_vertex_color:1;
+
+   /**
+    * How many user clipping planes are being uploaded to the vertex shader as
+    * push constants.
+    *
+    * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
+    * clip distances.
+    */
+   unsigned nr_userclip_plane_consts:4;
+
+   /**
+    * For pre-Gen6 hardware, a bitfield indicating which texture coordinates
+    * are going to be replaced with point coordinates (as a consequence of a
+    * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)).  Because
+    * our SF thread requires exact matching between VS outputs and FS inputs,
+    * these texture coordinates will need to be unconditionally included in
+    * the VUE, even if they aren't written by the vertex shader.
+    */
+   uint8_t point_coord_replace;
+
+   struct brw_sampler_prog_key_data tex;
+};
+
+/** The program key for Geometry Shaders. */
+struct brw_gs_prog_key
+{
+   unsigned program_string_id;
+
+   struct brw_sampler_prog_key_data tex;
+};
+
+/** The program key for Fragment/Pixel Shaders. */
+struct brw_wm_prog_key {
+   uint8_t iz_lookup;
+   bool stats_wm:1;
+   bool flat_shade:1;
+   bool persample_shading:1;
+   bool persample_2x:1;
+   unsigned nr_color_regions:5;
+   bool replicate_alpha:1;
+   bool render_to_fbo:1;
+   bool clamp_fragment_color:1;
+   bool compute_pos_offset:1;
+   bool compute_sample_id:1;
+   unsigned line_aa:2;
+   bool high_quality_derivatives:1;
+
+   uint16_t drawable_height;
+   uint64_t input_slots_valid;
+   unsigned program_string_id;
+   GLenum alpha_test_func;          /**< For Gen4/5 MRT alpha test */
+   float alpha_test_ref;
+
+   struct brw_sampler_prog_key_data tex;
+};
+
+struct brw_cs_prog_key {
+   uint32_t program_string_id;
+   struct brw_sampler_prog_key_data tex;
+};
+
+/*
+ * Image metadata structure as laid out in the shader parameter
+ * buffer.  Entries have to be 16B-aligned for the vec4 back-end to be
+ * able to use them.  That's okay because the padding and any unused
+ * entries [most of them except when we're doing untyped surface
+ * access] will be removed by the uniform packing pass.
+ */
+#define BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET      0
+#define BRW_IMAGE_PARAM_OFFSET_OFFSET           4
+#define BRW_IMAGE_PARAM_SIZE_OFFSET             8
+#define BRW_IMAGE_PARAM_STRIDE_OFFSET           12
+#define BRW_IMAGE_PARAM_TILING_OFFSET           16
+#define BRW_IMAGE_PARAM_SWIZZLING_OFFSET        20
+#define BRW_IMAGE_PARAM_SIZE                    24
+
+struct brw_image_param {
+   /** Surface binding table index. */
+   uint32_t surface_idx;
+
+   /** Offset applied to the X and Y surface coordinates. */
+   uint32_t offset[2];
+
+   /** Surface X, Y and Z dimensions. */
+   uint32_t size[3];
+
+   /** X-stride in bytes, Y-stride in pixels, horizontal slice stride in
+    * pixels, vertical slice stride in pixels.
+    */
+   uint32_t stride[4];
+
+   /** Log2 of the tiling modulus in the X, Y and Z dimension. */
+   uint32_t tiling[3];
+
+   /**
+    * Right shift to apply for bit 6 address swizzling.  Two different
+    * swizzles can be specified and will be applied one after the other.  The
+    * resulting address will be:
+    *
+    *  addr' = addr ^ ((1 << 6) & ((addr >> swizzling[0]) ^
+    *                              (addr >> swizzling[1])))
+    *
+    * Use \c 0xff if any of the swizzles is not required.
+    */
+   uint32_t swizzling[2];
+};
+
+struct brw_stage_prog_data {
+   struct {
+      /** size of our binding table. */
+      uint32_t size_bytes;
+
+      /** @{
+       * surface indices for the various groups of surfaces
+       */
+      uint32_t pull_constants_start;
+      uint32_t texture_start;
+      uint32_t gather_texture_start;
+      uint32_t ubo_start;
+      uint32_t ssbo_start;
+      uint32_t abo_start;
+      uint32_t image_start;
+      uint32_t shader_time_start;
+      /** @} */
+   } binding_table;
+
+   GLuint nr_params;       /**< number of float params/constants */
+   GLuint nr_pull_params;
+   unsigned nr_image_params;
+
+   unsigned curb_read_length;
+   unsigned total_scratch;
+
+   /**
+    * Register where the thread expects to find input data from the URB
+    * (typically uniforms, followed by vertex or fragment attributes).
+    */
+   unsigned dispatch_grf_start_reg;
+
+   bool use_alt_mode; /**< Use ALT floating point mode?  Otherwise, IEEE. */
+
+   /* Pointers to tracked values (only valid once
+    * _mesa_load_state_parameters has been called at runtime).
+    */
+   const union gl_constant_value **param;
+   const union gl_constant_value **pull_param;
+
+   /** Image metadata passed to the shader as uniforms. */
+   struct brw_image_param *image_param;
+};
+
+/* Data about a particular attempt to compile a program.  Note that
+ * there can be many of these, each in a different GL state
+ * corresponding to a different brw_wm_prog_key struct, with different
+ * compiled programs.
+ */
+struct brw_wm_prog_data {
+   struct brw_stage_prog_data base;
+
+   GLuint num_varying_inputs;
+
+   GLuint dispatch_grf_start_reg_16;
+   GLuint reg_blocks;
+   GLuint reg_blocks_16;
+
+   struct {
+      /** @{
+       * surface indices for the WM-specific surfaces
+       */
+      uint32_t render_target_start;
+      /** @} */
+   } binding_table;
+
+   uint8_t computed_depth_mode;
+
+   bool early_fragment_tests;
+   bool no_8;
+   bool dual_src_blend;
+   bool uses_pos_offset;
+   bool uses_omask;
+   bool uses_kill;
+   bool pulls_bary;
+   uint32_t prog_offset_16;
+
+   /**
+    * Mask of which interpolation modes are required by the fragment shader.
+    * Used in hardware setup on gen6+.
+    */
+   uint32_t barycentric_interp_modes;
+
+   /**
+    * Map from gl_varying_slot to the position within the FS setup data
+    * payload where the varying's attribute vertex deltas should be delivered.
+    * For varying slots that are not used by the FS, the value is -1.
+    */
+   int urb_setup[VARYING_SLOT_MAX];
+};
+
+struct brw_cs_prog_data {
+   struct brw_stage_prog_data base;
+
+   GLuint dispatch_grf_start_reg_16;
+   unsigned local_size[3];
+   unsigned simd_size;
+   bool uses_barrier;
+   bool uses_num_work_groups;
+   unsigned local_invocation_id_regs;
+
+   struct {
+      /** @{
+       * surface indices for the CS-specific surfaces
+       */
+      uint32_t work_groups_start;
+      /** @} */
+   } binding_table;
+};
+
+/**
+ * Enum representing the i965-specific vertex results that don't correspond
+ * exactly to any element of gl_varying_slot.  The values of this enum are
+ * assigned such that they don't conflict with gl_varying_slot.
+ */
+typedef enum
+{
+   BRW_VARYING_SLOT_NDC = VARYING_SLOT_MAX,
+   BRW_VARYING_SLOT_PAD,
+   /**
+    * Technically this is not a varying but just a placeholder that
+    * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord
+    * builtin variable to be compiled correctly. see compile_sf_prog() for
+    * more info.
+    */
+   BRW_VARYING_SLOT_PNTC,
+   BRW_VARYING_SLOT_COUNT
+} brw_varying_slot;
+
+/**
+ * Data structure recording the relationship between the gl_varying_slot enum
+ * and "slots" within the vertex URB entry (VUE).  A "slot" is defined as a
+ * single octaword within the VUE (128 bits).
+ *
+ * Note that each BRW register contains 256 bits (2 octawords), so when
+ * accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two
+ * consecutive VUE slots.  When accessing the VUE in URB_INTERLEAVED mode (as
+ * in a vertex shader), each register corresponds to a single VUE slot, since
+ * it contains data for two separate vertices.
+ */
+struct brw_vue_map {
+   /**
+    * Bitfield representing all varying slots that are (a) stored in this VUE
+    * map, and (b) actually written by the shader.  Does not include any of
+    * the additional varying slots defined in brw_varying_slot.
+    */
+   GLbitfield64 slots_valid;
+
+   /**
+    * Is this VUE map for a separate shader pipeline?
+    *
+    * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched
+    * without the linker having a chance to dead code eliminate unused varyings.
+    *
+    * This means that we have to use a fixed slot layout, based on the output's
+    * location field, rather than assigning slots in a compact contiguous block.
+    */
+   bool separate;
+
+   /**
+    * Map from gl_varying_slot value to VUE slot.  For gl_varying_slots that are
+    * not stored in a slot (because they are not written, or because
+    * additional processing is applied before storing them in the VUE), the
+    * value is -1.
+    */
+   signed char varying_to_slot[BRW_VARYING_SLOT_COUNT];
+
+   /**
+    * Map from VUE slot to gl_varying_slot value.  For slots that do not
+    * directly correspond to a gl_varying_slot, the value comes from
+    * brw_varying_slot.
+    *
+    * For slots that are not in use, the value is BRW_VARYING_SLOT_COUNT (this
+    * simplifies code that uses the value stored in slot_to_varying to
+    * create a bit mask).
+    */
+   signed char slot_to_varying[BRW_VARYING_SLOT_COUNT];
+
+   /**
+    * Total number of VUE slots in use
+    */
+   int num_slots;
+};
+
+/**
+ * Convert a VUE slot number into a byte offset within the VUE.
+ */
+static inline GLuint brw_vue_slot_to_offset(GLuint slot)
+{
+   return 16*slot;
+}
+
+/**
+ * Convert a vertex output (brw_varying_slot) into a byte offset within the
+ * VUE.
+ */
+static inline GLuint brw_varying_to_offset(struct brw_vue_map *vue_map,
+                                           GLuint varying)
+{
+   return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]);
+}
+
+void brw_compute_vue_map(const struct brw_device_info *devinfo,
+                         struct brw_vue_map *vue_map,
+                         GLbitfield64 slots_valid,
+                         bool separate_shader);
+
+enum shader_dispatch_mode {
+   DISPATCH_MODE_4X1_SINGLE = 0,
+   DISPATCH_MODE_4X2_DUAL_INSTANCE = 1,
+   DISPATCH_MODE_4X2_DUAL_OBJECT = 2,
+   DISPATCH_MODE_SIMD8 = 3,
+};
+
+struct brw_vue_prog_data {
+   struct brw_stage_prog_data base;
+   struct brw_vue_map vue_map;
+
+   GLuint urb_read_length;
+   GLuint total_grf;
+
+   /* Used for calculating urb partitions.  In the VS, this is the size of the
+    * URB entry used for both input and output to the thread.  In the GS, this
+    * is the size of the URB entry used for output.
+    */
+   GLuint urb_entry_size;
+
+   enum shader_dispatch_mode dispatch_mode;
+};
+
+struct brw_vs_prog_data {
+   struct brw_vue_prog_data base;
+
+   GLbitfield64 inputs_read;
+
+   unsigned nr_attributes;
+
+   bool uses_vertexid;
+   bool uses_instanceid;
+};
+
+struct brw_gs_prog_data
+{
+   struct brw_vue_prog_data base;
+
+   /**
+    * Size of an output vertex, measured in HWORDS (32 bytes).
+    */
+   unsigned output_vertex_size_hwords;
+
+   unsigned output_topology;
+
+   /**
+    * Size of the control data (cut bits or StreamID bits), in hwords (32
+    * bytes).  0 if there is no control data.
+    */
+   unsigned control_data_header_size_hwords;
+
+   /**
+    * Format of the control data (either GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID
+    * if the control data is StreamID bits, or
+    * GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits).
+    * Ignored if control_data_header_size_hwords is 0.
+    */
+   unsigned control_data_format;
+
+   bool include_primitive_id;
+
+   /**
+    * The number of vertices emitted, if constant - otherwise -1.
+    */
+   int static_vertex_count;
+
+   int invocations;
+
+   /**
+    * Gen6 transform feedback enabled flag.
+    */
+   bool gen6_xfb_enabled;
+
+   /**
+    * Gen6: Provoking vertex convention for odd-numbered triangles
+    * in tristrips.
+    */
+   GLuint pv_first:1;
+
+   /**
+    * Gen6: Number of varyings that are output to transform feedback.
+    */
+   GLuint num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */
+
+   /**
+    * Gen6: Map from the index of a transform feedback binding table entry to the
+    * gl_varying_slot that should be streamed out through that binding table
+    * entry.
+    */
+   unsigned char transform_feedback_bindings[64 /* BRW_MAX_SOL_BINDINGS */];
+
+   /**
+    * Gen6: Map from the index of a transform feedback binding table entry to the
+    * swizzles that should be used when streaming out data through that
+    * binding table entry.
+    */
+   unsigned char transform_feedback_swizzles[64 /* BRW_MAX_SOL_BINDINGS */];
+};
+
+
+/** @} */
+
+/**
+ * Compile a vertex shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
+               const struct brw_vs_prog_key *key,
+               struct brw_vs_prog_data *prog_data,
+               const struct nir_shader *shader,
+               gl_clip_plane *clip_planes,
+               bool use_legacy_snorm_formula,
+               int shader_time_index,
+               unsigned *final_assembly_size,
+               char **error_str);
+
+/**
+ * Scratch data used when compiling a GLSL geometry shader.
+ */
+struct brw_gs_compile
+{
+   struct brw_gs_prog_key key;
+   struct brw_gs_prog_data prog_data;
+   struct brw_vue_map input_vue_map;
+
+   struct brw_geometry_program *gp;
+
+   unsigned control_data_bits_per_vertex;
+   unsigned control_data_header_size_bits;
+};
+
+/**
+ * Compile a geometry shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
+               struct brw_gs_compile *c,
+               const struct nir_shader *shader,
+               struct gl_shader_program *shader_prog,
+               void *mem_ctx,
+               int shader_time_index,
+               unsigned *final_assembly_size,
+               char **error_str);
+
+/**
+ * Compile a fragment shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
+               const struct brw_wm_prog_key *key,
+               struct brw_wm_prog_data *prog_data,
+               const struct nir_shader *shader,
+               struct gl_program *prog,
+               int shader_time_index8,
+               int shader_time_index16,
+               bool use_rep_send,
+               unsigned *final_assembly_size,
+               char **error_str);
+
+/**
+ * Compile a compute shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
+               const struct brw_cs_prog_key *key,
+               struct brw_cs_prog_data *prog_data,
+               const struct nir_shader *shader,
+               int shader_time_index,
+               unsigned *final_assembly_size,
+               char **error_str);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 6b2bbd21703..3b125448e14 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -51,7 +51,7 @@
 
 #include "brw_context.h"
 #include "brw_defines.h"
-#include "brw_shader.h"
+#include "brw_compiler.h"
 #include "brw_draw.h"
 #include "brw_state.h"
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index aa1284db3ce..4f503ae4869 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -40,6 +40,7 @@
 #include "main/mm.h"
 #include "main/mtypes.h"
 #include "brw_structs.h"
+#include "brw_compiler.h"
 #include "intel_aub.h"
 #include "program/prog_parameter.h"
 
@@ -340,260 +341,6 @@ struct brw_shader {
    bool compiled_once;
 };
 
-struct brw_stage_prog_data {
-   struct {
-      /** size of our binding table. */
-      uint32_t size_bytes;
-
-      /** @{
-       * surface indices for the various groups of surfaces
-       */
-      uint32_t pull_constants_start;
-      uint32_t texture_start;
-      uint32_t gather_texture_start;
-      uint32_t ubo_start;
-      uint32_t abo_start;
-      uint32_t image_start;
-      uint32_t shader_time_start;
-      /** @} */
-   } binding_table;
-
-   GLuint nr_params;       /**< number of float params/constants */
-   GLuint nr_pull_params;
-   unsigned nr_image_params;
-
-   unsigned curb_read_length;
-   unsigned total_scratch;
-
-   /**
-    * Register where the thread expects to find input data from the URB
-    * (typically uniforms, followed by vertex or fragment attributes).
-    */
-   unsigned dispatch_grf_start_reg;
-
-   bool use_alt_mode; /**< Use ALT floating point mode?  Otherwise, IEEE. */
-
-   /* Pointers to tracked values (only valid once
-    * _mesa_load_state_parameters has been called at runtime).
-    */
-   const gl_constant_value **param;
-   const gl_constant_value **pull_param;
-
-   /** Image metadata passed to the shader as uniforms. */
-   struct brw_image_param *image_param;
-};
-
-/*
- * Image metadata structure as laid out in the shader parameter
- * buffer.  Entries have to be 16B-aligned for the vec4 back-end to be
- * able to use them.  That's okay because the padding and any unused
- * entries [most of them except when we're doing untyped surface
- * access] will be removed by the uniform packing pass.
- */
-#define BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET      0
-#define BRW_IMAGE_PARAM_OFFSET_OFFSET           4
-#define BRW_IMAGE_PARAM_SIZE_OFFSET             8
-#define BRW_IMAGE_PARAM_STRIDE_OFFSET           12
-#define BRW_IMAGE_PARAM_TILING_OFFSET           16
-#define BRW_IMAGE_PARAM_SWIZZLING_OFFSET        20
-#define BRW_IMAGE_PARAM_SIZE                    24
-
-struct brw_image_param {
-   /** Surface binding table index. */
-   uint32_t surface_idx;
-
-   /** Offset applied to the X and Y surface coordinates. */
-   uint32_t offset[2];
-
-   /** Surface X, Y and Z dimensions. */
-   uint32_t size[3];
-
-   /** X-stride in bytes, Y-stride in pixels, horizontal slice stride in
-    * pixels, vertical slice stride in pixels.
-    */
-   uint32_t stride[4];
-
-   /** Log2 of the tiling modulus in the X, Y and Z dimension. */
-   uint32_t tiling[3];
-
-   /**
-    * Right shift to apply for bit 6 address swizzling.  Two different
-    * swizzles can be specified and will be applied one after the other.  The
-    * resulting address will be:
-    *
-    *  addr' = addr ^ ((1 << 6) & ((addr >> swizzling[0]) ^
-    *                              (addr >> swizzling[1])))
-    *
-    * Use \c 0xff if any of the swizzles is not required.
-    */
-   uint32_t swizzling[2];
-};
-
-/* Data about a particular attempt to compile a program.  Note that
- * there can be many of these, each in a different GL state
- * corresponding to a different brw_wm_prog_key struct, with different
- * compiled programs.
- */
-struct brw_wm_prog_data {
-   struct brw_stage_prog_data base;
-
-   GLuint num_varying_inputs;
-
-   GLuint dispatch_grf_start_reg_16;
-   GLuint reg_blocks;
-   GLuint reg_blocks_16;
-
-   struct {
-      /** @{
-       * surface indices the WM-specific surfaces
-       */
-      uint32_t render_target_start;
-      /** @} */
-   } binding_table;
-
-   uint8_t computed_depth_mode;
-
-   bool early_fragment_tests;
-   bool no_8;
-   bool dual_src_blend;
-   bool uses_pos_offset;
-   bool uses_omask;
-   bool uses_kill;
-   bool pulls_bary;
-   uint32_t prog_offset_16;
-
-   /**
-    * Mask of which interpolation modes are required by the fragment shader.
-    * Used in hardware setup on gen6+.
-    */
-   uint32_t barycentric_interp_modes;
-
-   /**
-    * Map from gl_varying_slot to the position within the FS setup data
-    * payload where the varying's attribute vertex deltas should be delivered.
-    * For varying slots that are not used by the FS, the value is -1.
-    */
-   int urb_setup[VARYING_SLOT_MAX];
-};
-
-struct brw_cs_prog_data {
-   struct brw_stage_prog_data base;
-
-   GLuint dispatch_grf_start_reg_16;
-   unsigned local_size[3];
-   unsigned simd_size;
-   bool uses_barrier;
-   bool uses_num_work_groups;
-   unsigned local_invocation_id_regs;
-
-   struct {
-      /** @{
-       * surface indices the CS-specific surfaces
-       */
-      uint32_t work_groups_start;
-      /** @} */
-   } binding_table;
-};
-
-/**
- * Enum representing the i965-specific vertex results that don't correspond
- * exactly to any element of gl_varying_slot.  The values of this enum are
- * assigned such that they don't conflict with gl_varying_slot.
- */
-typedef enum
-{
-   BRW_VARYING_SLOT_NDC = VARYING_SLOT_MAX,
-   BRW_VARYING_SLOT_PAD,
-   /**
-    * Technically this is not a varying but just a placeholder that
-    * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord
-    * builtin variable to be compiled correctly. see compile_sf_prog() for
-    * more info.
-    */
-   BRW_VARYING_SLOT_PNTC,
-   BRW_VARYING_SLOT_COUNT
-} brw_varying_slot;
-
-
-/**
- * Data structure recording the relationship between the gl_varying_slot enum
- * and "slots" within the vertex URB entry (VUE).  A "slot" is defined as a
- * single octaword within the VUE (128 bits).
- *
- * Note that each BRW register contains 256 bits (2 octawords), so when
- * accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two
- * consecutive VUE slots.  When accessing the VUE in URB_INTERLEAVED mode (as
- * in a vertex shader), each register corresponds to a single VUE slot, since
- * it contains data for two separate vertices.
- */
-struct brw_vue_map {
-   /**
-    * Bitfield representing all varying slots that are (a) stored in this VUE
-    * map, and (b) actually written by the shader.  Does not include any of
-    * the additional varying slots defined in brw_varying_slot.
-    */
-   GLbitfield64 slots_valid;
-
-   /**
-    * Is this VUE map for a separate shader pipeline?
-    *
-    * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched
-    * without the linker having a chance to dead code eliminate unused varyings.
-    *
-    * This means that we have to use a fixed slot layout, based on the output's
-    * location field, rather than assigning slots in a compact contiguous block.
-    */
-   bool separate;
-
-   /**
-    * Map from gl_varying_slot value to VUE slot.  For gl_varying_slots that are
-    * not stored in a slot (because they are not written, or because
-    * additional processing is applied before storing them in the VUE), the
-    * value is -1.
-    */
-   signed char varying_to_slot[BRW_VARYING_SLOT_COUNT];
-
-   /**
-    * Map from VUE slot to gl_varying_slot value.  For slots that do not
-    * directly correspond to a gl_varying_slot, the value comes from
-    * brw_varying_slot.
-    *
-    * For slots that are not in use, the value is BRW_VARYING_SLOT_COUNT (this
-    * simplifies code that uses the value stored in slot_to_varying to
-    * create a bit mask).
-    */
-   signed char slot_to_varying[BRW_VARYING_SLOT_COUNT];
-
-   /**
-    * Total number of VUE slots in use
-    */
-   int num_slots;
-};
-
-/**
- * Convert a VUE slot number into a byte offset within the VUE.
- */
-static inline GLuint brw_vue_slot_to_offset(GLuint slot)
-{
-   return 16*slot;
-}
-
-/**
- * Convert a vertex output (brw_varying_slot) into a byte offset within the
- * VUE.
- */
-static inline GLuint brw_varying_to_offset(struct brw_vue_map *vue_map,
-                                           GLuint varying)
-{
-   return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]);
-}
-
-void brw_compute_vue_map(const struct brw_device_info *devinfo,
-                         struct brw_vue_map *vue_map,
-                         GLbitfield64 slots_valid,
-                         bool separate_shader);
-
-
 /**
  * Bitmask indicating which fragment shader inputs represent varyings (and
  * hence have to be delivered to the fragment shader by the SF/SBE stage).
@@ -670,39 +417,6 @@ struct brw_ff_gs_prog_data {
    unsigned svbi_postincrement_value;
 };
 
-enum shader_dispatch_mode {
-   DISPATCH_MODE_4X1_SINGLE = 0,
-   DISPATCH_MODE_4X2_DUAL_INSTANCE = 1,
-   DISPATCH_MODE_4X2_DUAL_OBJECT = 2,
-   DISPATCH_MODE_SIMD8 = 3,
-};
-
-struct brw_vue_prog_data {
-   struct brw_stage_prog_data base;
-   struct brw_vue_map vue_map;
-
-   GLuint urb_read_length;
-   GLuint total_grf;
-
-   /* Used for calculating urb partitions.  In the VS, this is the size of the
-    * URB entry used for both input and output to the thread.  In the GS, this
-    * is the size of the URB entry used for output.
-    */
-   GLuint urb_entry_size;
-
-   enum shader_dispatch_mode dispatch_mode;
-};
-
-
-struct brw_vs_prog_data {
-   struct brw_vue_prog_data base;
-
-   GLbitfield64 inputs_read;
-
-   bool uses_vertexid;
-   bool uses_instanceid;
-};
-
 /** Number of texture sampler units */
 #define BRW_MAX_TEX_UNIT 32
 
@@ -715,9 +429,6 @@ struct brw_vs_prog_data {
 /** Max number of SSBOs in a shader */
 #define BRW_MAX_SSBO 12
 
-/** Max number of combined UBOs and SSBOs in a shader */
-#define BRW_MAX_COMBINED_UBO_SSBO (BRW_MAX_UBO + BRW_MAX_SSBO)
-
 /** Max number of atomic counter buffer objects in a shader */
 #define BRW_MAX_ABO 16
 
@@ -763,71 +474,6 @@ struct brw_vs_prog_data {
 
 #define SURF_INDEX_GEN6_SOL_BINDING(t) (t)
 
-struct brw_gs_prog_data
-{
-   struct brw_vue_prog_data base;
-
-   /**
-    * Size of an output vertex, measured in HWORDS (32 bytes).
-    */
-   unsigned output_vertex_size_hwords;
-
-   unsigned output_topology;
-
-   /**
-    * Size of the control data (cut bits or StreamID bits), in hwords (32
-    * bytes).  0 if there is no control data.
-    */
-   unsigned control_data_header_size_hwords;
-
-   /**
-    * Format of the control data (either GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID
-    * if the control data is StreamID bits, or
-    * GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits).
-    * Ignored if control_data_header_size is 0.
-    */
-   unsigned control_data_format;
-
-   bool include_primitive_id;
-
-   /**
-    * The number of vertices emitted, if constant - otherwise -1.
-    */
-   int static_vertex_count;
-
-   int invocations;
-
-   /**
-    * Gen6 transform feedback enabled flag.
-    */
-   bool gen6_xfb_enabled;
-
-   /**
-    * Gen6: Provoking vertex convention for odd-numbered triangles
-    * in tristrips.
-    */
-   GLuint pv_first:1;
-
-   /**
-    * Gen6: Number of varyings that are output to transform feedback.
-    */
-   GLuint num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */
-
-   /**
-    * Gen6: Map from the index of a transform feedback binding table entry to the
-    * gl_varying_slot that should be streamed out through that binding table
-    * entry.
-    */
-   unsigned char transform_feedback_bindings[BRW_MAX_SOL_BINDINGS];
-
-   /**
-    * Gen6: Map from the index of a transform feedback binding table entry to the
-    * swizzles that should be used when streaming out data through that
-    * binding table entry.
-    */
-   unsigned char transform_feedback_swizzles[BRW_MAX_SOL_BINDINGS];
-};
-
 /**
  * Stride in bytes between shader_time entries.
  *
@@ -953,6 +599,8 @@ struct intel_batchbuffer {
    } saved;
 };
 
+#define MAX_GS_INPUT_VERTICES 6
+
 #define BRW_MAX_XFB_STREAMS 4
 
 struct brw_transform_feedback_object {
diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c
index 45fb816c160..263d224e882 100644
--- a/src/mesa/drivers/dri/i965/brw_cs.c
+++ b/src/mesa/drivers/dri/i965/brw_cs.c
@@ -105,9 +105,20 @@ brw_codegen_cs_prog(struct brw_context *brw,
    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
       st_index = brw_get_shader_time_index(brw, prog, &cp->program.Base, ST_CS);
 
-   program = brw_cs_emit(brw, mem_ctx, key, &prog_data,
-                         &cp->program, prog, st_index, &program_size);
+   /* Start from NULL so that a failure path which does not fill in a message
+    * can never hand an indeterminate pointer to the logging calls below.
+    */
+   char *error_str = NULL;
+   program = brw_compile_cs(brw->intelScreen->compiler, brw, mem_ctx,
+                            key, &prog_data, cp->program.Base.nir,
+                            st_index, &program_size, &error_str);
    if (program == NULL) {
+      const char *msg = error_str ? error_str : "unknown error";
+
+      prog->LinkStatus = false;
+      ralloc_strcat(&prog->InfoLog, msg);
+      _mesa_problem(NULL, "Failed to compile compute shader: %s\n", msg);
+
       ralloc_free(mem_ctx);
       return false;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_cs.h b/src/mesa/drivers/dri/i965/brw_cs.h
index 17c2ff9871a..899e340f14e 100644
--- a/src/mesa/drivers/dri/i965/brw_cs.h
+++ b/src/mesa/drivers/dri/i965/brw_cs.h
@@ -27,11 +27,6 @@
 
 #include "brw_program.h"
 
-struct brw_cs_prog_key {
-   uint32_t program_string_id;
-   struct brw_sampler_prog_key_data tex;
-};
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -39,16 +34,6 @@ extern "C" {
 void
 brw_upload_cs_prog(struct brw_context *brw);
 
-const unsigned *
-brw_cs_emit(struct brw_context *brw,
-            void *mem_ctx,
-            const struct brw_cs_prog_key *key,
-            struct brw_cs_prog_data *prog_data,
-            struct gl_compute_program *cp,
-            struct gl_shader_program *prog,
-            int shader_time_index,
-            unsigned *final_assembly_size);
-
 void
 brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
                              void *buffer, uint32_t threads, uint32_t stride);
diff --git a/src/mesa/drivers/dri/i965/brw_cubemap_normalize.cpp b/src/mesa/drivers/dri/i965/brw_cubemap_normalize.cpp
index 33571292007..33d2048e657 100644
--- a/src/mesa/drivers/dri/i965/brw_cubemap_normalize.cpp
+++ b/src/mesa/drivers/dri/i965/brw_cubemap_normalize.cpp
@@ -30,7 +30,7 @@
  * \author Eric Anholt <[email protected]>
  */
 
-#include "glsl/glsl_types.h"
+#include "glsl/nir/glsl_types.h"
 #include "glsl/ir.h"
 #include "program/prog_instruction.h" /* For WRITEMASK_* */
 
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 761aa0ec5fa..0ac1ad9378b 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -461,7 +461,7 @@ brw_pixel_interpolator_query(struct brw_codegen *p,
                              struct brw_reg mrf,
                              bool noperspective,
                              unsigned mode,
-                             unsigned data,
+                             struct brw_reg data,
                              unsigned msg_length,
                              unsigned response_length);
 
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index dc699bb6321..bf2fee9ed48 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -3212,26 +3212,29 @@ brw_pixel_interpolator_query(struct brw_codegen *p,
                              struct brw_reg mrf,
                              bool noperspective,
                              unsigned mode,
-                             unsigned data,
+                             struct brw_reg data,
                              unsigned msg_length,
                              unsigned response_length)
 {
    const struct brw_device_info *devinfo = p->devinfo;
-   struct brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
-
-   brw_set_dest(p, insn, dest);
-   brw_set_src0(p, insn, mrf);
-   brw_set_message_descriptor(p, insn, GEN7_SFID_PIXEL_INTERPOLATOR,
-                              msg_length, response_length,
-                              false /* header is never present for PI */,
-                              false);
+   struct brw_inst *insn;
+   const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current);
 
-   brw_inst_set_pi_simd_mode(
-         devinfo, insn, brw_inst_exec_size(devinfo, insn) == BRW_EXECUTE_16);
+   /* brw_send_indirect_message will automatically use a direct send message
+    * if data is actually immediate.
+    */
+   insn = brw_send_indirect_message(p,
+                                    GEN7_SFID_PIXEL_INTERPOLATOR,
+                                    dest,
+                                    mrf,
+                                    vec1(data));
+   brw_inst_set_mlen(devinfo, insn, msg_length);
+   brw_inst_set_rlen(devinfo, insn, response_length);
+
+   brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
    brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
    brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
    brw_inst_set_pi_message_type(devinfo, insn, mode);
-   brw_inst_set_pi_message_data(devinfo, insn, data);
 }
 
 void
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 5049851c617..0562c5a9981 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -47,7 +47,7 @@
 #include "brw_dead_control_flow.h"
 #include "main/uniforms.h"
 #include "brw_fs_live_variables.h"
-#include "glsl/glsl_types.h"
+#include "glsl/nir/glsl_types.h"
 #include "program/sampler.h"
 
 using namespace brw;
@@ -338,6 +338,18 @@ fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
 }
 
 bool
+fs_inst::can_change_types() const
+{
+   return dst.type == src[0].type &&
+          !src[0].abs && !src[0].negate && !saturate &&
+          (opcode == BRW_OPCODE_MOV ||
+           (opcode == BRW_OPCODE_SEL &&
+            dst.type == src[1].type &&
+            predicate != BRW_PREDICATE_NONE &&
+            !src[1].abs && !src[1].negate));
+}
+
+bool
 fs_inst::has_side_effects() const
 {
    return this->eot || backend_instruction::has_side_effects();
@@ -1049,11 +1061,11 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
    unsigned int array_elements;
 
    if (type->is_array()) {
-      array_elements = type->length;
+      array_elements = type->arrays_of_arrays_size();
       if (array_elements == 0) {
          fail("dereferenced array '%s' has length 0\n", name);
       }
-      type = type->fields.array;
+      type = type->without_array();
    } else {
       array_elements = 1;
    }
@@ -1509,25 +1521,14 @@ void
 fs_visitor::assign_vs_urb_setup()
 {
    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
-   int grf, count, slot, channel, attr;
 
    assert(stage == MESA_SHADER_VERTEX);
-   count = _mesa_bitcount_64(vs_prog_data->inputs_read);
+   int count = _mesa_bitcount_64(vs_prog_data->inputs_read);
    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
       count++;
 
    /* Each attribute is 4 regs. */
-   this->first_non_payload_grf += count * 4;
-
-   unsigned vue_entries =
-      MAX2(count, vs_prog_data->base.vue_map.num_slots);
-
-   /* URB entry size is counted in units of 64 bytes (for the 3DSTATE_URB_VS
-    * command).  Each attribute is 16 bytes (4 floats/dwords), so each unit
-    * fits four attributes.
-    */
-   vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
-   vs_prog_data->base.urb_read_length = (count + 1) / 2;
+   this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes;
 
    assert(vs_prog_data->base.urb_read_length <= 15);
 
@@ -1535,25 +1536,10 @@ fs_visitor::assign_vs_urb_setup()
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
       for (int i = 0; i < inst->sources; i++) {
          if (inst->src[i].file == ATTR) {
-
-            if (inst->src[i].reg == VERT_ATTRIB_MAX) {
-               slot = count - 1;
-            } else {
-               /* Attributes come in in a contiguous block, ordered by their
-                * gl_vert_attrib value.  That means we can compute the slot
-                * number for an attribute by masking out the enabled
-                * attributes before it and counting the bits.
-                */
-               attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
-               slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
-                                        BITFIELD64_MASK(attr));
-            }
-
-            channel = inst->src[i].reg_offset & 3;
-
-            grf = payload.num_regs +
-               prog_data->curb_read_length +
-               slot * 4 + channel;
+            int grf = payload.num_regs +
+                      prog_data->curb_read_length +
+                      inst->src[i].reg +
+                      inst->src[i].reg_offset;
 
             inst->src[i].file = HW_REG;
             inst->src[i].fixed_hw_reg =
@@ -5134,41 +5120,140 @@ fs_visitor::run_cs()
    return !failed;
 }
 
+/**
+ * Return a bitfield where bit n is set if barycentric interpolation mode n
+ * (see enum brw_wm_barycentric_interp_mode) is needed by the fragment shader.
+ */
+static unsigned
+brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo,
+                                     bool shade_model_flat,
+                                     bool persample_shading,
+                                     const nir_shader *shader)
+{
+   unsigned barycentric_interp_modes = 0;
+
+   nir_foreach_variable(var, &shader->inputs) {
+      enum glsl_interp_qualifier interp_qualifier =
+         (enum glsl_interp_qualifier)var->data.interpolation;
+      bool is_centroid = var->data.centroid && !persample_shading;
+      bool is_sample = var->data.sample || persample_shading;
+      bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) ||
+                         (var->data.location == VARYING_SLOT_COL1);
+
+      /* Ignore WPOS and FACE, because they don't require interpolation. */
+      if (var->data.location == VARYING_SLOT_POS ||
+          var->data.location == VARYING_SLOT_FACE)
+         continue;
+
+      /* Determine the set (or sets) of barycentric coordinates needed to
+       * interpolate this variable.  Note that when
+       * devinfo->needs_unlit_centroid_workaround is set, centroid interpolation
+       * uses PIXEL interpolation for unlit pixels and CENTROID interpolation
+       * for lit pixels, so we need both sets of barycentric coordinates.
+       */
+      if (interp_qualifier == INTERP_QUALIFIER_NOPERSPECTIVE) {
+         if (is_centroid) {
+            barycentric_interp_modes |=
+               1 << BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
+         } else if (is_sample) {
+            barycentric_interp_modes |=
+               1 << BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
+         }
+         if ((!is_centroid && !is_sample) ||
+             devinfo->needs_unlit_centroid_workaround) {
+            barycentric_interp_modes |=
+               1 << BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
+         }
+      } else if (interp_qualifier == INTERP_QUALIFIER_SMOOTH ||
+                 (!(shade_model_flat && is_gl_Color) &&
+                  interp_qualifier == INTERP_QUALIFIER_NONE)) {
+         if (is_centroid) {
+            barycentric_interp_modes |=
+               1 << BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
+         } else if (is_sample) {
+            barycentric_interp_modes |=
+               1 << BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
+         }
+         if ((!is_centroid && !is_sample) ||
+             devinfo->needs_unlit_centroid_workaround) {
+            barycentric_interp_modes |=
+               1 << BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
+         }
+      }
+   }
+
+   return barycentric_interp_modes;
+}
+
+static uint8_t
+computed_depth_mode(const nir_shader *shader)
+{
+   if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
+      switch (shader->info.fs.depth_layout) {
+      case FRAG_DEPTH_LAYOUT_NONE:
+      case FRAG_DEPTH_LAYOUT_ANY:
+         return BRW_PSCDEPTH_ON;
+      case FRAG_DEPTH_LAYOUT_GREATER:
+         return BRW_PSCDEPTH_ON_GE;
+      case FRAG_DEPTH_LAYOUT_LESS:
+         return BRW_PSCDEPTH_ON_LE;
+      case FRAG_DEPTH_LAYOUT_UNCHANGED:
+         return BRW_PSCDEPTH_OFF;
+      }
+   }
+   return BRW_PSCDEPTH_OFF;
+}
+
 const unsigned *
-brw_wm_fs_emit(struct brw_context *brw,
+brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
                void *mem_ctx,
                const struct brw_wm_prog_key *key,
                struct brw_wm_prog_data *prog_data,
-               struct gl_fragment_program *fp,
-               struct gl_shader_program *prog,
+               const nir_shader *shader,
+               struct gl_program *prog,
                int shader_time_index8, int shader_time_index16,
-               unsigned *final_assembly_size)
+               bool use_rep_send,
+               unsigned *final_assembly_size,
+               char **error_str)
 {
-   /* Now the main event: Visit the shader IR and generate our FS IR for it.
+   /* key->alpha_test_func means simulating alpha testing via discards,
+    * so the shader definitely kills pixels.
     */
-   fs_visitor v(brw->intelScreen->compiler, brw, mem_ctx, key,
-                &prog_data->base, &fp->Base, fp->Base.nir, 8, shader_time_index8);
+   prog_data->uses_kill = shader->info.fs.uses_discard || key->alpha_test_func;
+   prog_data->uses_omask =
+      shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
+   prog_data->computed_depth_mode = computed_depth_mode(shader);
+
+   prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
+
+   prog_data->barycentric_interp_modes =
+      brw_compute_barycentric_interp_modes(compiler->devinfo,
+                                           key->flat_shade,
+                                           key->persample_shading,
+                                           shader);
+
+   fs_visitor v(compiler, log_data, mem_ctx, key,
+                &prog_data->base, prog, shader, 8,
+                shader_time_index8);
    if (!v.run_fs(false /* do_rep_send */)) {
-      if (prog) {
-         prog->LinkStatus = false;
-         ralloc_strcat(&prog->InfoLog, v.fail_msg);
-      }
-
-      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
-                    v.fail_msg);
+      if (error_str)
+         *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
 
       return NULL;
    }
 
    cfg_t *simd16_cfg = NULL;
-   fs_visitor v2(brw->intelScreen->compiler, brw, mem_ctx, key,
-                 &prog_data->base, &fp->Base, fp->Base.nir, 16, shader_time_index16);
-   if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
+   fs_visitor v2(compiler, log_data, mem_ctx, key,
+                 &prog_data->base, prog, shader, 16,
+                 shader_time_index16);
+   if (likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
       if (!v.simd16_unsupported) {
          /* Try a SIMD16 compile */
          v2.import_uniforms(&v);
-         if (!v2.run_fs(brw->use_rep_send)) {
-            perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg);
+         if (!v2.run_fs(use_rep_send)) {
+            compiler->shader_perf_log(log_data,
+                                      "SIMD16 shader failed to compile: %s",
+                                      v2.fail_msg);
          } else {
             simd16_cfg = v2.cfg;
          }
@@ -5176,8 +5261,8 @@ brw_wm_fs_emit(struct brw_context *brw,
    }
 
    cfg_t *simd8_cfg;
-   int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
-   if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
+   int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || use_rep_send;
+   if ((no_simd8 || compiler->devinfo->gen < 5) && simd16_cfg) {
       simd8_cfg = NULL;
       prog_data->no_8 = true;
    } else {
@@ -5185,20 +5270,14 @@ brw_wm_fs_emit(struct brw_context *brw,
       prog_data->no_8 = false;
    }
 
-   fs_generator g(brw->intelScreen->compiler, brw,
-                  mem_ctx, (void *) key, &prog_data->base,
-                  &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
+   fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
+                  v.promoted_constants, v.runtime_check_aads_emit, "FS");
 
    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-      char *name;
-      if (prog)
-         name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
-                                prog->Label ? prog->Label : "unnamed",
-                                prog->Name);
-      else
-         name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
-
-      g.enable_debug(name);
+      g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
+                                     shader->info.label ? shader->info.label :
+                                                          "unnamed",
+                                     shader->info.name));
    }
 
    if (simd8_cfg)
@@ -5283,29 +5362,32 @@ fs_visitor::emit_cs_work_group_id_setup()
 }
 
 const unsigned *
-brw_cs_emit(struct brw_context *brw,
-            void *mem_ctx,
-            const struct brw_cs_prog_key *key,
-            struct brw_cs_prog_data *prog_data,
-            struct gl_compute_program *cp,
-            struct gl_shader_program *prog,
-            int shader_time_index,
-            unsigned *final_assembly_size)
+brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
+               const struct brw_cs_prog_key *key,
+               struct brw_cs_prog_data *prog_data,
+               const nir_shader *shader,
+               int shader_time_index,
+               unsigned *final_assembly_size,
+               char **error_str)
 {
-   prog_data->local_size[0] = cp->LocalSize[0];
-   prog_data->local_size[1] = cp->LocalSize[1];
-   prog_data->local_size[2] = cp->LocalSize[2];
+   prog_data->local_size[0] = shader->info.cs.local_size[0];
+   prog_data->local_size[1] = shader->info.cs.local_size[1];
+   prog_data->local_size[2] = shader->info.cs.local_size[2];
    unsigned local_workgroup_size =
-      cp->LocalSize[0] * cp->LocalSize[1] * cp->LocalSize[2];
-   unsigned max_cs_threads = brw->intelScreen->compiler->devinfo->max_cs_threads;
+      shader->info.cs.local_size[0] * shader->info.cs.local_size[1] *
+      shader->info.cs.local_size[2];
+
+   unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
 
    cfg_t *cfg = NULL;
    const char *fail_msg = NULL;
 
    /* Now the main event: Visit the shader IR and generate our CS IR for it.
     */
-   fs_visitor v8(brw->intelScreen->compiler, brw, mem_ctx, key,
-                 &prog_data->base, &cp->Base, cp->Base.nir, 8, shader_time_index);
+   fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base,
+                 NULL, /* Never used in core profile */
+                 shader, 8, shader_time_index);
    if (!v8.run_cs()) {
       fail_msg = v8.fail_msg;
    } else if (local_workgroup_size <= 8 * max_cs_threads) {
@@ -5313,15 +5395,18 @@ brw_cs_emit(struct brw_context *brw,
       prog_data->simd_size = 8;
    }
 
-   fs_visitor v16(brw->intelScreen->compiler, brw, mem_ctx, key,
-                  &prog_data->base, &cp->Base, cp->Base.nir, 16, shader_time_index);
+   fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base,
+                 NULL, /* Never used in core profile */
+                 shader, 16, shader_time_index);
    if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
        !fail_msg && !v8.simd16_unsupported &&
        local_workgroup_size <= 16 * max_cs_threads) {
       /* Try a SIMD16 compile */
       v16.import_uniforms(&v8);
       if (!v16.run_cs()) {
-         perf_debug("SIMD16 shader failed to compile: %s", v16.fail_msg);
+         compiler->shader_perf_log(log_data,
+                                   "SIMD16 shader failed to compile: %s",
+                                   v16.fail_msg);
          if (!cfg) {
             fail_msg =
                "Couldn't generate SIMD16 program and not "
@@ -5335,20 +5420,19 @@ brw_cs_emit(struct brw_context *brw,
 
    if (unlikely(cfg == NULL)) {
       assert(fail_msg);
-      prog->LinkStatus = false;
-      ralloc_strcat(&prog->InfoLog, fail_msg);
-      _mesa_problem(NULL, "Failed to compile compute shader: %s\n",
-                    fail_msg);
+      if (error_str)
+         *error_str = ralloc_strdup(mem_ctx, fail_msg);
+
       return NULL;
    }
 
-   fs_generator g(brw->intelScreen->compiler, brw,
-                  mem_ctx, (void*) key, &prog_data->base, &cp->Base,
+   fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
                   v8.promoted_constants, v8.runtime_check_aads_emit, "CS");
    if (INTEL_DEBUG & DEBUG_CS) {
-      char *name = ralloc_asprintf(mem_ctx, "%s compute shader %d",
-                                   prog->Label ? prog->Label : "unnamed",
-                                   prog->Name);
+      char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
+                                   shader->info.label ? shader->info.label :
+                                                        "unnamed",
+                                   shader->info.name);
       g.enable_debug(name);
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index e8b511f9ce6..171338dcc0b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -48,7 +48,7 @@ extern "C" {
 #include "brw_wm.h"
 #include "intel_asm_annotation.h"
 }
-#include "glsl/glsl_types.h"
+#include "glsl/nir/glsl_types.h"
 #include "glsl/ir.h"
 #include "glsl/nir/nir.h"
 #include "program/sampler.h"
@@ -96,7 +96,7 @@ public:
               const void *key,
               struct brw_stage_prog_data *prog_data,
               struct gl_program *prog,
-              nir_shader *shader,
+              const nir_shader *shader,
               unsigned dispatch_width,
               int shader_time_index);
 
@@ -400,7 +400,6 @@ public:
                 void *mem_ctx,
                 const void *key,
                 struct brw_stage_prog_data *prog_data,
-                struct gl_program *fp,
                 unsigned promoted_constants,
                 bool runtime_check_aads_emit,
                 const char *stage_abbrev);
@@ -499,8 +498,6 @@ private:
    const void * const key;
    struct brw_stage_prog_data * const prog_data;
 
-   const struct gl_program *prog;
-
    unsigned dispatch_width; /**< 8 or 16 */
 
    exec_list discard_halt_patches;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
index 277b6cc3a60..a13d001291c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
@@ -45,7 +45,7 @@
 #include "brw_wm.h"
 #include "glsl/ir.h"
 #include "glsl/ir_expression_flattening.h"
-#include "glsl/glsl_types.h"
+#include "glsl/nir/glsl_types.h"
 
 class ir_channel_expressions_visitor : public ir_hierarchical_visitor {
 public:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index 230b0caec47..5589716239a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -275,17 +275,6 @@ is_logic_op(enum opcode opcode)
            opcode == BRW_OPCODE_NOT);
 }
 
-static bool
-can_change_source_types(fs_inst *inst)
-{
-   return !inst->src[0].abs && !inst->src[0].negate &&
-          inst->dst.type == inst->src[0].type &&
-          (inst->opcode == BRW_OPCODE_MOV ||
-           (inst->opcode == BRW_OPCODE_SEL &&
-            inst->predicate != BRW_PREDICATE_NONE &&
-            !inst->src[1].abs && !inst->src[1].negate));
-}
-
 bool
 fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
 {
@@ -368,7 +357,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
 
    if (has_source_modifiers &&
        entry->dst.type != inst->src[arg].type &&
-       !can_change_source_types(inst))
+       !inst->can_change_types())
       return false;
 
    if (devinfo->gen >= 8 && (entry->src.negate || entry->src.abs) &&
@@ -438,7 +427,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
           * type.  If we got here, then we can just change the source and
           * destination types of the instruction and keep going.
           */
-         assert(can_change_source_types(inst));
+         assert(inst->can_change_types());
          for (int i = 0; i < inst->sources; i++) {
             inst->src[i].type = entry->dst.type;
          }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 6f8b75e339f..13c495cd395 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -131,7 +131,6 @@ fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
                            void *mem_ctx,
                            const void *key,
                            struct brw_stage_prog_data *prog_data,
-                           struct gl_program *prog,
                            unsigned promoted_constants,
                            bool runtime_check_aads_emit,
                            const char *stage_abbrev)
@@ -139,7 +138,7 @@ fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
    : compiler(compiler), log_data(log_data),
      devinfo(compiler->devinfo), key(key),
      prog_data(prog_data),
-     prog(prog), promoted_constants(promoted_constants),
+     promoted_constants(promoted_constants),
      runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
      stage_abbrev(stage_abbrev), mem_ctx(mem_ctx)
 {
@@ -1377,15 +1376,14 @@ fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
                                                 struct brw_reg msg_data,
                                                 unsigned msg_type)
 {
-   assert(msg_data.file == BRW_IMMEDIATE_VALUE &&
-          msg_data.type == BRW_REGISTER_TYPE_UD);
+   assert(msg_data.type == BRW_REGISTER_TYPE_UD);
 
    brw_pixel_interpolator_query(p,
          retype(dst, BRW_REGISTER_TYPE_UW),
          src,
          inst->pi_noperspective,
          msg_type,
-         msg_data.dw1.ud,
+         msg_data,
          inst->mlen,
          inst->regs_written);
 }
@@ -2188,7 +2186,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
               100.0f * (before_size - after_size) / before_size);
 
       dump_assembly(p->store, annotation.ann_count, annotation.ann,
-                    p->devinfo, prog);
+                    p->devinfo);
       ralloc_free(annotation.ann);
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
index 19aec92fad1..ce066a9778e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
@@ -259,16 +259,15 @@ fs_live_variables::compute_start_end()
       struct block_data *bd = &block_data[block->num];
 
       for (int i = 0; i < num_vars; i++) {
-	 if (BITSET_TEST(bd->livein, i)) {
-	    start[i] = MIN2(start[i], block->start_ip);
-	    end[i] = MAX2(end[i], block->start_ip);
-	 }
-
-	 if (BITSET_TEST(bd->liveout, i)) {
-	    start[i] = MIN2(start[i], block->end_ip);
-	    end[i] = MAX2(end[i], block->end_ip);
-	 }
+         if (BITSET_TEST(bd->livein, i)) {
+            start[i] = MIN2(start[i], block->start_ip);
+            end[i] = MAX2(end[i], block->start_ip);
+         }
 
+         if (BITSET_TEST(bd->liveout, i)) {
+            start[i] = MIN2(start[i], block->end_ip);
+            end[i] = MAX2(end[i], block->end_ip);
+         }
       }
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 45c3f4ef3b4..feedbfbb2e3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -56,61 +56,25 @@ fs_visitor::emit_nir_code()
 void
 fs_visitor::nir_setup_inputs()
 {
+   if (stage != MESA_SHADER_FRAGMENT)
+      return;
+
    nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs);
 
    nir_foreach_variable(var, &nir->inputs) {
-      enum brw_reg_type type = brw_type_for_base_type(var->type);
       fs_reg input = offset(nir_inputs, bld, var->data.driver_location);
 
       fs_reg reg;
-      switch (stage) {
-      case MESA_SHADER_VERTEX: {
-         /* Our ATTR file is indexed by VERT_ATTRIB_*, which is the value
-          * stored in nir_variable::location.
-          *
-          * However, NIR's load_input intrinsics use a different index - an
-          * offset into a single contiguous array containing all inputs.
-          * This index corresponds to the nir_variable::driver_location field.
-          *
-          * So, we need to copy from fs_reg(ATTR, var->location) to
-          * offset(nir_inputs, var->data.driver_location).
-          */
-         const glsl_type *const t = var->type->without_array();
-         const unsigned components = t->components();
-         const unsigned cols = t->matrix_columns;
-         const unsigned elts = t->vector_elements;
-         unsigned array_length = var->type->is_array() ? var->type->length : 1;
-         for (unsigned i = 0; i < array_length; i++) {
-            for (unsigned j = 0; j < cols; j++) {
-               for (unsigned k = 0; k < elts; k++) {
-                  bld.MOV(offset(retype(input, type), bld,
-                                 components * i + elts * j + k),
-                          offset(fs_reg(ATTR, var->data.location + i, type),
-                                 bld, 4 * j + k));
-               }
-            }
-         }
-         break;
-      }
-      case MESA_SHADER_GEOMETRY:
-      case MESA_SHADER_COMPUTE:
-      case MESA_SHADER_TESS_CTRL:
-      case MESA_SHADER_TESS_EVAL:
-         unreachable("fs_visitor not used for these stages yet.");
-         break;
-      case MESA_SHADER_FRAGMENT:
-         if (var->data.location == VARYING_SLOT_POS) {
-            reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer,
-                                                var->data.origin_upper_left);
-            emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(),
-                                      input, reg), 0xF);
-         } else {
-            emit_general_interpolation(input, var->name, var->type,
-                                       (glsl_interp_qualifier) var->data.interpolation,
-                                       var->data.location, var->data.centroid,
-                                       var->data.sample);
-         }
-         break;
+      if (var->data.location == VARYING_SLOT_POS) {
+         reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer,
+                                             var->data.origin_upper_left);
+         emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(),
+                                   input, reg), 0xF);
+      } else {
+         emit_general_interpolation(input, var->name, var->type,
+                                    (glsl_interp_qualifier) var->data.interpolation,
+                                    var->data.location, var->data.centroid,
+                                    var->data.sample);
       }
    }
 }
@@ -125,9 +89,7 @@ fs_visitor::nir_setup_outputs()
    nir_foreach_variable(var, &nir->outputs) {
       fs_reg reg = offset(nir_outputs, bld, var->data.driver_location);
 
-      int vector_elements =
-         var->type->is_array() ? var->type->fields.array->vector_elements
-                               : var->type->vector_elements;
+      int vector_elements = var->type->without_array()->vector_elements;
 
       switch (stage) {
       case MESA_SHADER_VERTEX:
@@ -1180,6 +1142,36 @@ get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type)
    }
 }
 
+static fs_inst *
+emit_pixel_interpolater_send(const fs_builder &bld,
+                             enum opcode opcode,
+                             const fs_reg &dst,
+                             const fs_reg &src,
+                             const fs_reg &desc,
+                             glsl_interp_qualifier interpolation)
+{
+   fs_inst *inst;
+   fs_reg payload;
+   int mlen;
+
+   if (src.file == BAD_FILE) {
+      /* Dummy payload */
+      payload = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
+      mlen = 1;
+   } else {
+      payload = src;
+      mlen = 2 * bld.dispatch_width() / 8;
+   }
+
+   inst = bld.emit(opcode, dst, payload, desc);
+   inst->mlen = mlen;
+   /* 2 floats per slot returned */
+   inst->regs_written = 2 * bld.dispatch_width() / 8;
+   inst->pi_noperspective = interpolation == INTERP_QUALIFIER_NOPERSPECTIVE;
+
+   return inst;
+}
+
 void
 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
 {
@@ -1440,7 +1432,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
           */
          brw_mark_surface_used(prog_data,
                                stage_prog_data->binding_table.ubo_start +
-                               nir->info.num_ssbos - 1);
+                               nir->info.num_ubos - 1);
       }
 
       if (has_indirect) {
@@ -1488,21 +1480,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
       fs_reg surf_index;
       if (const_uniform_block) {
-         unsigned index = stage_prog_data->binding_table.ubo_start +
+         unsigned index = stage_prog_data->binding_table.ssbo_start +
                           const_uniform_block->u[0];
          surf_index = fs_reg(index);
          brw_mark_surface_used(prog_data, index);
       } else {
          surf_index = vgrf(glsl_type::uint_type);
          bld.ADD(surf_index, get_nir_src(instr->src[0]),
-                 fs_reg(stage_prog_data->binding_table.ubo_start));
+                 fs_reg(stage_prog_data->binding_table.ssbo_start));
          surf_index = bld.emit_uniformize(surf_index);
 
          /* Assume this may touch any UBO. It would be nice to provide
           * a tighter bound, but the array information is already lowered away.
           */
          brw_mark_surface_used(prog_data,
-                               stage_prog_data->binding_table.ubo_start +
+                               stage_prog_data->binding_table.ssbo_start +
                                nir->info.num_ssbos - 1);
       }
 
@@ -1545,8 +1537,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_load_input: {
       unsigned index = 0;
       for (unsigned j = 0; j < instr->num_components; j++) {
-         fs_reg src = offset(retype(nir_inputs, dest.type), bld,
-                             instr->const_index[0] + index);
+         fs_reg src;
+         if (stage == MESA_SHADER_VERTEX) {
+            src = offset(fs_reg(ATTR, instr->const_index[0], dest.type), bld, index);
+         } else {
+            src = offset(retype(nir_inputs, dest.type), bld,
+                         instr->const_index[0] + index);
+         }
          if (has_indirect)
             src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
          index++;
@@ -1583,28 +1580,81 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       ((struct brw_wm_prog_data *) prog_data)->pulls_bary = true;
 
       fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
-
-      /* For most messages, we need one reg of ignored data; the hardware
-       * requires mlen==1 even when there is no payload. in the per-slot
-       * offset case, we'll replace this with the proper source data.
-       */
-      fs_reg src = vgrf(glsl_type::float_type);
-      int mlen = 1;     /* one reg unless overriden */
-      fs_inst *inst;
+      const glsl_interp_qualifier interpolation =
+         (glsl_interp_qualifier) instr->variables[0]->var->data.interpolation;
 
       switch (instr->intrinsic) {
       case nir_intrinsic_interp_var_at_centroid:
-         inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_CENTROID,
-                         dst_xy, src, fs_reg(0u));
+         emit_pixel_interpolater_send(bld,
+                                      FS_OPCODE_INTERPOLATE_AT_CENTROID,
+                                      dst_xy,
+                                      fs_reg(), /* src */
+                                      fs_reg(0u),
+                                      interpolation);
          break;
 
       case nir_intrinsic_interp_var_at_sample: {
-         /* XXX: We should probably handle non-constant sample id's */
          nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
-         assert(const_sample);
-         unsigned msg_data = const_sample ? const_sample->i[0] << 4 : 0;
-         inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src,
-                         fs_reg(msg_data));
+
+         if (const_sample) {
+            unsigned msg_data = const_sample->i[0] << 4;
+
+            emit_pixel_interpolater_send(bld,
+                                         FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+                                         dst_xy,
+                                         fs_reg(), /* src */
+                                         fs_reg(msg_data),
+                                         interpolation);
+         } else {
+            const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
+                                             BRW_REGISTER_TYPE_UD);
+
+            if (nir_src_is_dynamically_uniform(instr->src[0])) {
+               const fs_reg sample_id = bld.emit_uniformize(sample_src);
+               const fs_reg msg_data = vgrf(glsl_type::uint_type);
+               bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u));
+               emit_pixel_interpolater_send(bld,
+                                            FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+                                            dst_xy,
+                                            fs_reg(), /* src */
+                                            msg_data,
+                                            interpolation);
+            } else {
+               /* Make a loop that sends a message to the pixel interpolater
+                * for the sample number in each live channel. If there are
+                * multiple channels with the same sample number then these
+                * will be handled simultaneously with a single iteration of
+                * the loop.
+                */
+               bld.emit(BRW_OPCODE_DO);
+
+               /* Get the next live sample number into sample_id */
+               const fs_reg sample_id = bld.emit_uniformize(sample_src);
+
+               /* Set the flag register so that we can perform the send
+                * message on all channels that have the same sample number
+                */
+               bld.CMP(bld.null_reg_ud(),
+                       sample_src, sample_id,
+                       BRW_CONDITIONAL_EQ);
+               const fs_reg msg_data = vgrf(glsl_type::uint_type);
+               bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u));
+               fs_inst *inst =
+                  emit_pixel_interpolater_send(bld,
+                                               FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+                                               dst_xy,
+                                               fs_reg(), /* src */
+                                               msg_data,
+                                               interpolation);
+               set_predicate(BRW_PREDICATE_NORMAL, inst);
+
+               /* Continue the loop if there are any live channels left */
+               set_predicate_inv(BRW_PREDICATE_NORMAL,
+                                 true, /* inverse */
+                                 bld.emit(BRW_OPCODE_WHILE));
+            }
+         }
+
          break;
       }
 
@@ -1615,10 +1665,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
             unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf;
             unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf;
 
-            inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
-                            fs_reg(off_x | (off_y << 4)));
+            emit_pixel_interpolater_send(bld,
+                                         FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
+                                         dst_xy,
+                                         fs_reg(), /* src */
+                                         fs_reg(off_x | (off_y << 4)),
+                                         interpolation);
          } else {
-            src = vgrf(glsl_type::ivec2_type);
+            fs_reg src = vgrf(glsl_type::ivec2_type);
             fs_reg offset_src = retype(get_nir_src(instr->src[0]),
                                        BRW_REGISTER_TYPE_F);
             for (int i = 0; i < 2; i++) {
@@ -1646,9 +1700,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
                            bld.SEL(offset(src, bld, i), itemp, fs_reg(7)));
             }
 
-            mlen = 2 * dispatch_width / 8;
-            inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
-                            fs_reg(0u));
+            const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
+            emit_pixel_interpolater_send(bld,
+                                         opcode,
+                                         dst_xy,
+                                         src,
+                                         fs_reg(0u),
+                                         interpolation);
          }
          break;
       }
@@ -1657,12 +1715,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          unreachable("Invalid intrinsic");
       }
 
-      inst->mlen = mlen;
-      /* 2 floats per slot returned */
-      inst->regs_written = 2 * dispatch_width / 8;
-      inst->pi_noperspective = instr->variables[0]->var->data.interpolation ==
-                               INTERP_QUALIFIER_NOPERSPECTIVE;
-
       for (unsigned j = 0; j < instr->num_components; j++) {
          fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
          src.type = dest.type;
@@ -1684,18 +1736,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       nir_const_value *const_uniform_block =
          nir_src_as_const_value(instr->src[1]);
       if (const_uniform_block) {
-         unsigned index = stage_prog_data->binding_table.ubo_start +
+         unsigned index = stage_prog_data->binding_table.ssbo_start +
                           const_uniform_block->u[0];
          surf_index = fs_reg(index);
          brw_mark_surface_used(prog_data, index);
       } else {
          surf_index = vgrf(glsl_type::uint_type);
          bld.ADD(surf_index, get_nir_src(instr->src[1]),
-                  fs_reg(stage_prog_data->binding_table.ubo_start));
+                  fs_reg(stage_prog_data->binding_table.ssbo_start));
          surf_index = bld.emit_uniformize(surf_index);
 
          brw_mark_surface_used(prog_data,
-                               stage_prog_data->binding_table.ubo_start +
+                               stage_prog_data->binding_table.ssbo_start +
                                nir->info.num_ssbos - 1);
       }
 
@@ -1780,17 +1832,17 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_ssbo_atomic_add:
       nir_emit_ssbo_atomic(bld, BRW_AOP_ADD, instr);
       break;
-   case nir_intrinsic_ssbo_atomic_min:
-      if (dest.type == BRW_REGISTER_TYPE_D)
-         nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
-      else
-         nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
+   case nir_intrinsic_ssbo_atomic_imin:
+      nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
       break;
-   case nir_intrinsic_ssbo_atomic_max:
-      if (dest.type == BRW_REGISTER_TYPE_D)
-         nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
-      else
-         nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
+   case nir_intrinsic_ssbo_atomic_umin:
+      nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_imax:
+      nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_umax:
+      nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
       break;
    case nir_intrinsic_ssbo_atomic_and:
       nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
@@ -1810,7 +1862,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
    case nir_intrinsic_get_buffer_size: {
       nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
-      unsigned ubo_index = const_uniform_block ? const_uniform_block->u[0] : 0;
+      unsigned ssbo_index = const_uniform_block ? const_uniform_block->u[0] : 0;
       int reg_width = dispatch_width / 8;
 
       /* Set LOD = 0 */
@@ -1821,7 +1873,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
                                   BRW_REGISTER_TYPE_UD);
       bld.LOAD_PAYLOAD(src_payload, &source, 1, 0);
 
-      fs_reg surf_index = fs_reg(prog_data->binding_table.ubo_start + ubo_index);
+      fs_reg surf_index = fs_reg(prog_data->binding_table.ssbo_start + ssbo_index);
       fs_inst *inst = bld.emit(FS_OPCODE_GET_BUFFER_SIZE, dest,
                                src_payload, surf_index);
       inst->header_size = 0;
@@ -1874,20 +1926,20 @@ fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
    fs_reg surface;
    nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
    if (const_surface) {
-      unsigned surf_index = stage_prog_data->binding_table.ubo_start +
+      unsigned surf_index = stage_prog_data->binding_table.ssbo_start +
                             const_surface->u[0];
       surface = fs_reg(surf_index);
       brw_mark_surface_used(prog_data, surf_index);
    } else {
       surface = vgrf(glsl_type::uint_type);
       bld.ADD(surface, get_nir_src(instr->src[0]),
-              fs_reg(stage_prog_data->binding_table.ubo_start));
+              fs_reg(stage_prog_data->binding_table.ssbo_start));
 
-      /* Assume this may touch any UBO. This is the same we do for other
+      /* Assume this may touch any SSBO. This is the same as we do for other
        * UBO/SSBO accesses with non-constant surface.
        */
       brw_mark_surface_used(prog_data,
-                            stage_prog_data->binding_table.ubo_start +
+                            stage_prog_data->binding_table.ssbo_start +
                             nir->info.num_ssbos - 1);
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index c3a037be4b1..36388fad98d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -27,7 +27,7 @@
 
 #include "brw_fs.h"
 #include "brw_cfg.h"
-#include "glsl/glsl_types.h"
+#include "glsl/nir/glsl_types.h"
 #include "glsl/ir_optimization.h"
 
 using namespace brw;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
index e406c2899e8..8792a8c7b1d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
@@ -52,11 +52,12 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
       ip--;
 
       if (inst->opcode != BRW_OPCODE_MOV ||
+          !inst->saturate ||
           inst->dst.file != GRF ||
+          inst->dst.type != inst->src[0].type ||
           inst->src[0].file != GRF ||
           inst->src[0].abs ||
-          inst->src[0].negate ||
-          !inst->saturate)
+          inst->src[0].negate)
          continue;
 
       int src_var = v->live_intervals->var_from_reg(inst->src[0]);
@@ -65,7 +66,9 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
       bool interfered = false;
       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
          if (scan_inst->overwrites_reg(inst->src[0])) {
-            if (scan_inst->is_partial_write())
+            if (scan_inst->is_partial_write() ||
+                (scan_inst->dst.type != inst->dst.type &&
+                 !scan_inst->can_change_types()))
                break;
 
             if (scan_inst->saturate) {
@@ -73,6 +76,12 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
                progress = true;
             } else if (src_end_ip <= ip || inst->dst.equals(inst->src[0])) {
                if (scan_inst->can_do_saturate()) {
+                  if (scan_inst->dst.type != inst->dst.type) {
+                     scan_inst->dst.type = inst->dst.type;
+                     for (int i = 0; i < scan_inst->sources; i++) {
+                        scan_inst->src[i].type = inst->dst.type;
+                     }
+                  }
                   scan_inst->saturate = true;
                   inst->saturate = false;
                   progress = true;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_validate.cpp b/src/mesa/drivers/dri/i965/brw_fs_validate.cpp
index d0e04f3bf47..814c551f1be 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_validate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_validate.cpp
@@ -32,7 +32,7 @@
 
 #define fsv_assert(cond) \
    if (!(cond)) { \
-      fprintf(stderr, "ASSERT: FS validation failed!\n"); \
+      fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", stage_abbrev); \
       dump_instruction(inst, stderr); \
       fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, #cond); \
       abort(); \
diff --git a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp b/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp
index 6000e35b9b9..cab5af318a2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp
@@ -42,7 +42,7 @@
 #include "glsl/ir.h"
 #include "glsl/ir_visitor.h"
 #include "glsl/ir_rvalue_visitor.h"
-#include "glsl/glsl_types.h"
+#include "glsl/nir/glsl_types.h"
 #include "util/hash_table.h"
 
 static bool debug = false;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index df1a7ed9b59..f825fed4daf 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -43,7 +43,7 @@
 #include "brw_vec4.h"
 #include "brw_fs.h"
 #include "main/uniforms.h"
-#include "glsl/glsl_types.h"
+#include "glsl/nir/glsl_types.h"
 #include "glsl/ir_optimization.h"
 #include "program/sampler.h"
 
@@ -53,7 +53,8 @@ fs_reg *
 fs_visitor::emit_vs_system_value(int location)
 {
    fs_reg *reg = new(this->mem_ctx)
-      fs_reg(ATTR, VERT_ATTRIB_MAX, BRW_REGISTER_TYPE_D);
+      fs_reg(ATTR, 4 * _mesa_bitcount_64(nir->info.inputs_read),
+             BRW_REGISTER_TYPE_D);
    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
 
    switch (location) {
@@ -903,12 +904,9 @@ fs_visitor::emit_urb_writes()
    urb_offset = 0;
    flush = false;
    for (slot = 0; slot < vue_map->num_slots; slot++) {
-      fs_reg reg, src, zero;
-
       int varying = vue_map->slot_to_varying[slot];
       switch (varying) {
-      case VARYING_SLOT_PSIZ:
-
+      case VARYING_SLOT_PSIZ: {
          /* The point size varying slot is the vue header and is always in the
           * vue map.  But often none of the special varyings that live there
           * are written and in that case we can skip writing to the vue
@@ -920,7 +918,7 @@ fs_visitor::emit_urb_writes()
             break;
          }
 
-         zero = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+         fs_reg zero(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
          bld.MOV(zero, fs_reg(0u));
 
          sources[length++] = zero;
@@ -939,7 +937,7 @@ fs_visitor::emit_urb_writes()
          else
             sources[length++] = zero;
          break;
-
+      }
       case BRW_VARYING_SLOT_NDC:
       case VARYING_SLOT_EDGE:
          unreachable("unexpected scalar vs output");
@@ -972,8 +970,8 @@ fs_visitor::emit_urb_writes()
              * temp register and use that for the payload.
              */
             for (int i = 0; i < 4; i++) {
-               reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
-               src = offset(this->outputs[varying], bld, i);
+               fs_reg reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
+               fs_reg src = offset(this->outputs[varying], bld, i);
                set_saturate(true, bld.MOV(reg, src));
                sources[length++] = reg;
             }
@@ -1069,7 +1067,7 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
                        const void *key,
                        struct brw_stage_prog_data *prog_data,
                        struct gl_program *prog,
-                       nir_shader *shader,
+                       const nir_shader *shader,
                        unsigned dispatch_width,
                        int shader_time_index)
    : backend_shader(compiler, log_data, mem_ctx, shader, prog_data),
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index e0165fb4a23..10a7f28fdab 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -57,6 +57,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
                     struct brw_geometry_program *gp,
                     struct brw_gs_prog_key *key)
 {
+   struct gl_shader *shader = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
    struct brw_stage_state *stage_state = &brw->gs.base;
    struct brw_gs_compile c;
    memset(&c, 0, sizeof(c));
@@ -300,8 +301,11 @@ brw_codegen_gs_prog(struct brw_context *brw,
 
    void *mem_ctx = ralloc_context(NULL);
    unsigned program_size;
+   char *error_str;
    const unsigned *program =
-      brw_gs_emit(brw, prog, &c, mem_ctx, st_index, &program_size);
+      brw_compile_gs(brw->intelScreen->compiler, brw, &c,
+                     shader->Program->nir, prog,
+                     mem_ctx, st_index, &program_size, &error_str);
    if (program == NULL) {
       ralloc_free(mem_ctx);
       return false;
diff --git a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
index 0bb307432d0..00125c0f405 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
@@ -129,7 +129,7 @@ brw_upload_gs_image_surfaces(struct brw_context *brw)
       ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY];
 
    if (prog) {
-      /* BRW_NEW_GS_PROG_DATA, BRW_NEW_IMAGE_UNITS */
+      /* BRW_NEW_GS_PROG_DATA, BRW_NEW_IMAGE_UNITS, _NEW_TEXTURE */
       brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_GEOMETRY],
                                 &brw->gs.base, &brw->gs.prog_data->base.base);
    }
@@ -137,6 +137,7 @@ brw_upload_gs_image_surfaces(struct brw_context *brw)
 
 const struct brw_tracked_state brw_gs_image_surfaces = {
    .dirty = {
+      .mesa = _NEW_TEXTURE,
       .brw = BRW_NEW_BATCH |
              BRW_NEW_GEOMETRY_PROGRAM |
              BRW_NEW_GS_PROG_DATA |
diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h
index 97c6f8b2500..7726e4b78a0 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h
@@ -204,6 +204,7 @@ public:
    unsigned components_read(unsigned i) const;
    int regs_read(int arg) const;
    bool can_do_source_mods(const struct brw_device_info *devinfo);
+   bool can_change_types() const;
    bool has_side_effects() const;
 
    bool reads_flag() const;
diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
index 96dd633e117..1b57b65db27 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
@@ -179,6 +179,7 @@ public:
                       int swizzle, int swizzle_mask);
    void reswizzle(int dst_writemask, int swizzle);
    bool can_do_source_mods(const struct brw_device_info *devinfo);
+   bool can_change_types() const;
 
    bool reads_flag()
    {
diff --git a/src/mesa/drivers/dri/i965/brw_lower_unnormalized_offset.cpp b/src/mesa/drivers/dri/i965/brw_lower_unnormalized_offset.cpp
index 8c59b9e415b..4219d471def 100644
--- a/src/mesa/drivers/dri/i965/brw_lower_unnormalized_offset.cpp
+++ b/src/mesa/drivers/dri/i965/brw_lower_unnormalized_offset.cpp
@@ -31,7 +31,7 @@
  * \author Chris Forbes <[email protected]>
  */
 
-#include "glsl/glsl_types.h"
+#include "glsl/nir/glsl_types.h"
 #include "glsl/ir.h"
 #include "glsl/ir_builder.h"
 
diff --git a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
index eb201736c6e..fbde3f04204 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
@@ -451,6 +451,11 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
       if (irb->mt->fast_clear_state == INTEL_FAST_CLEAR_STATE_NO_MCS)
          clear_type = REP_CLEAR;
 
+      if (brw->gen >= 9 && clear_type == FAST_CLEAR) {
+         perf_debug("fast MCS clears are disabled on gen9");
+         clear_type = REP_CLEAR;
+      }
+
       /* We can't do scissored fast clears because of the restrictions on the
        * fast clear rectangle size.
        */
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 0a9c09f1075..dc497770914 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -27,30 +27,112 @@
 #include "glsl/nir/glsl_to_nir.h"
 #include "program/prog_to_nir.h"
 
+static bool
+remap_vs_attrs(nir_block *block, void *closure)
+{
+   GLbitfield64 inputs_read = *((GLbitfield64 *) closure);
+
+   nir_foreach_instr(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+      /* We set EmitNoIndirect for VS inputs, so there are no indirects. */
+      assert(intrin->intrinsic != nir_intrinsic_load_input_indirect);
+
+      if (intrin->intrinsic == nir_intrinsic_load_input) {
+         /* Attributes come in a contiguous block, ordered by their
+          * gl_vert_attrib value.  That means we can compute the slot
+          * number for an attribute by masking out the enabled attributes
+          * before it and counting the bits.
+          */
+         int attr = intrin->const_index[0];
+         int slot = _mesa_bitcount_64(inputs_read & BITFIELD64_MASK(attr));
+         intrin->const_index[0] = 4 * slot;
+      }
+   }
+   return true;
+}
+
 static void
 brw_nir_lower_inputs(nir_shader *nir, bool is_scalar)
 {
    switch (nir->stage) {
+   case MESA_SHADER_VERTEX:
+      /* For now, leave the vec4 backend doing the old method. */
+      if (!is_scalar) {
+         nir_assign_var_locations(&nir->inputs, &nir->num_inputs,
+                                  type_size_vec4);
+         break;
+      }
+
+      /* Start with the location of the variable's base. */
+      foreach_list_typed(nir_variable, var, node, &nir->inputs) {
+         var->data.driver_location = var->data.location;
+      }
+
+      /* Now use nir_lower_io to walk dereference chains.  Attribute arrays
+       * are loaded as one vec4 per element (or matrix column), so we use
+       * type_size_vec4 here.
+       */
+      nir_lower_io(nir, nir_var_shader_in, type_size_vec4);
+
+      /* Finally, translate VERT_ATTRIB_* values into the actual registers.
+       *
+       * Note that we can use nir->info.inputs_read instead of key->inputs_read
+       * since the two are identical aside from Gen4-5 edge flag differences.
+       */
+      GLbitfield64 inputs_read = nir->info.inputs_read;
+      nir_foreach_overload(nir, overload) {
+         if (overload->impl) {
+            nir_foreach_block(overload->impl, remap_vs_attrs, &inputs_read);
+         }
+      }
+      break;
    case MESA_SHADER_GEOMETRY:
       foreach_list_typed(nir_variable, var, node, &nir->inputs) {
          var->data.driver_location = var->data.location;
       }
       break;
-   default:
+   case MESA_SHADER_FRAGMENT:
+      assert(is_scalar);
       nir_assign_var_locations(&nir->inputs, &nir->num_inputs,
-                               is_scalar ? type_size_scalar : type_size_vec4);
+                               type_size_scalar);
+      break;
+   case MESA_SHADER_COMPUTE:
+      /* Compute shaders have no inputs. */
+      assert(exec_list_is_empty(&nir->inputs));
       break;
+   default:
+      unreachable("unsupported shader stage");
    }
 }
 
 static void
 brw_nir_lower_outputs(nir_shader *nir, bool is_scalar)
 {
-   if (is_scalar) {
-      nir_assign_var_locations(&nir->outputs, &nir->num_outputs, type_size_scalar);
-   } else {
-      nir_foreach_variable(var, &nir->outputs)
-         var->data.driver_location = var->data.location;
+   switch (nir->stage) {
+   case MESA_SHADER_VERTEX:
+   case MESA_SHADER_GEOMETRY:
+      if (is_scalar) {
+         nir_assign_var_locations(&nir->outputs, &nir->num_outputs,
+                                  type_size_scalar);
+      } else {
+         nir_foreach_variable(var, &nir->outputs)
+            var->data.driver_location = var->data.location;
+      }
+      break;
+   case MESA_SHADER_FRAGMENT:
+      nir_assign_var_locations(&nir->outputs, &nir->num_outputs,
+                               type_size_scalar);
+      break;
+   case MESA_SHADER_COMPUTE:
+      /* Compute shaders have no outputs. */
+      assert(exec_list_is_empty(&nir->outputs));
+      break;
+   default:
+      unreachable("unsupported shader stage");
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index dbd0e50228b..22b0227756e 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -69,8 +69,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx,
       if (prog) {
 	 prog->id = get_new_program_id(brw->intelScreen);
 
-	 return _mesa_init_vertex_program( ctx, &prog->program,
-					     target, id );
+	 return _mesa_init_gl_program(&prog->program.Base, target, id);
       }
       else
 	 return NULL;
@@ -81,8 +80,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx,
       if (prog) {
 	 prog->id = get_new_program_id(brw->intelScreen);
 
-	 return _mesa_init_fragment_program( ctx, &prog->program,
-					     target, id );
+	 return _mesa_init_gl_program(&prog->program.Base, target, id);
       }
       else
 	 return NULL;
@@ -93,7 +91,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx,
       if (prog) {
          prog->id = get_new_program_id(brw->intelScreen);
 
-         return _mesa_init_geometry_program(ctx, &prog->program, target, id);
+         return _mesa_init_gl_program(&prog->program.Base, target, id);
       } else {
          return NULL;
       }
@@ -104,7 +102,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx,
       if (prog) {
          prog->id = get_new_program_id(brw->intelScreen);
 
-         return _mesa_init_compute_program(ctx, &prog->program, target, id);
+         return _mesa_init_gl_program(&prog->program.Base, target, id);
       } else {
          return NULL;
       }
diff --git a/src/mesa/drivers/dri/i965/brw_program.h b/src/mesa/drivers/dri/i965/brw_program.h
index cf0522a8b10..f8cf2b062c8 100644
--- a/src/mesa/drivers/dri/i965/brw_program.h
+++ b/src/mesa/drivers/dri/i965/brw_program.h
@@ -24,129 +24,7 @@
 #ifndef BRW_PROGRAM_H
 #define BRW_PROGRAM_H
 
-/**
- * Program key structures.
- *
- * When drawing, we look for the currently bound shaders in the program
- * cache.  This is essentially a hash table lookup, and these are the keys.
- *
- * Sometimes OpenGL features specified as state need to be simulated via
- * shader code, due to a mismatch between the API and the hardware.  This
- * is often referred to as "non-orthagonal state" or "NOS".  We store NOS
- * in the program key so it's considered when searching for a program.  If
- * we haven't seen a particular combination before, we have to recompile a
- * new specialized version.
- *
- * Shader compilation should not look up state in gl_context directly, but
- * instead use the copy in the program key.  This guarantees recompiles will
- * happen correctly.
- *
- *  @{
- */
-
-enum PACKED gen6_gather_sampler_wa {
-   WA_SIGN = 1,      /* whether we need to sign extend */
-   WA_8BIT = 2,      /* if we have an 8bit format needing wa */
-   WA_16BIT = 4,     /* if we have a 16bit format needing wa */
-};
-
-/**
- * Sampler information needed by VS, WM, and GS program cache keys.
- */
-struct brw_sampler_prog_key_data {
-   /**
-    * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles.
-    */
-   uint16_t swizzles[MAX_SAMPLERS];
-
-   uint32_t gl_clamp_mask[3];
-
-   /**
-    * For RG32F, gather4's channel select is broken.
-    */
-   uint32_t gather_channel_quirk_mask;
-
-   /**
-    * Whether this sampler uses the compressed multisample surface layout.
-    */
-   uint32_t compressed_multisample_layout_mask;
-
-   /**
-    * For Sandybridge, which shader w/a we need for gather quirks.
-    */
-   enum gen6_gather_sampler_wa gen6_gather_wa[MAX_SAMPLERS];
-};
-
-
-/** The program key for Vertex Shaders. */
-struct brw_vs_prog_key {
-   unsigned program_string_id;
-
-   /*
-    * Per-attribute workaround flags
-    */
-   uint8_t gl_attrib_wa_flags[VERT_ATTRIB_MAX];
-
-   bool copy_edgeflag:1;
-
-   bool clamp_vertex_color:1;
-
-   /**
-    * How many user clipping planes are being uploaded to the vertex shader as
-    * push constants.
-    *
-    * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
-    * clip distances.
-    */
-   unsigned nr_userclip_plane_consts:4;
-
-   /**
-    * For pre-Gen6 hardware, a bitfield indicating which texture coordinates
-    * are going to be replaced with point coordinates (as a consequence of a
-    * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)).  Because
-    * our SF thread requires exact matching between VS outputs and FS inputs,
-    * these texture coordinates will need to be unconditionally included in
-    * the VUE, even if they aren't written by the vertex shader.
-    */
-   uint8_t point_coord_replace;
-
-   struct brw_sampler_prog_key_data tex;
-};
-
-/** The program key for Geometry Shaders. */
-struct brw_gs_prog_key
-{
-   unsigned program_string_id;
-
-   struct brw_sampler_prog_key_data tex;
-};
-
-/** The program key for Fragment/Pixel Shaders. */
-struct brw_wm_prog_key {
-   uint8_t iz_lookup;
-   bool stats_wm:1;
-   bool flat_shade:1;
-   bool persample_shading:1;
-   bool persample_2x:1;
-   unsigned nr_color_regions:5;
-   bool replicate_alpha:1;
-   bool render_to_fbo:1;
-   bool clamp_fragment_color:1;
-   bool compute_pos_offset:1;
-   bool compute_sample_id:1;
-   unsigned line_aa:2;
-   bool high_quality_derivatives:1;
-
-   uint16_t drawable_height;
-   uint64_t input_slots_valid;
-   unsigned program_string_id;
-   GLenum alpha_test_func;          /* < For Gen4/5 MRT alpha test */
-   float alpha_test_ref;
-
-   struct brw_sampler_prog_key_data tex;
-};
-
-/** @} */
+#include "brw_compiler.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c
index c2db5f69560..6d73444dad0 100644
--- a/src/mesa/drivers/dri/i965/brw_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c
@@ -44,6 +44,7 @@
 
 #include "main/macros.h"
 #include "main/samplerobj.h"
+#include "util/half_float.h"
 
 /**
  * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet.
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 4e43e5ccdbd..b710c60148c 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -29,7 +29,7 @@
 #include "brw_vec4.h"
 #include "brw_cfg.h"
 #include "brw_shader.h"
-#include "glsl/glsl_types.h"
+#include "glsl/nir/glsl_types.h"
 #include "glsl/ir_optimization.h"
 
 using namespace brw;
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 3a58a58a00b..6be2a6e5b55 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -660,7 +660,7 @@ brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg)
 backend_shader::backend_shader(const struct brw_compiler *compiler,
                                void *log_data,
                                void *mem_ctx,
-                               nir_shader *shader,
+                               const nir_shader *shader,
                                struct brw_stage_prog_data *stage_prog_data)
    : compiler(compiler),
      log_data(log_data),
@@ -1131,11 +1131,16 @@ brw_assign_common_binding_table_offsets(gl_shader_stage stage,
    next_binding_table_offset += num_textures;
 
    if (shader) {
-      assert(shader->NumUniformBlocks <= BRW_MAX_COMBINED_UBO_SSBO);
+      assert(shader->NumUniformBlocks <= BRW_MAX_UBO);
       stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
       next_binding_table_offset += shader->NumUniformBlocks;
+
+      assert(shader->NumShaderStorageBlocks <= BRW_MAX_SSBO);
+      stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
+      next_binding_table_offset += shader->NumShaderStorageBlocks;
    } else {
       stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
+      stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
    }
 
    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index ad2de5eae2d..b33b08f40d7 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -38,64 +38,6 @@
 #define MAX_SAMPLER_MESSAGE_SIZE 11
 #define MAX_VGRF_SIZE 16
 
-struct brw_compiler {
-   const struct brw_device_info *devinfo;
-
-   struct {
-      struct ra_regs *regs;
-
-      /**
-       * Array of the ra classes for the unaligned contiguous register
-       * block sizes used.
-       */
-      int *classes;
-
-      /**
-       * Mapping for register-allocated objects in *regs to the first
-       * GRF for that object.
-       */
-      uint8_t *ra_reg_to_grf;
-   } vec4_reg_set;
-
-   struct {
-      struct ra_regs *regs;
-
-      /**
-       * Array of the ra classes for the unaligned contiguous register
-       * block sizes used, indexed by register size.
-       */
-      int classes[16];
-
-      /**
-       * Mapping from classes to ra_reg ranges.  Each of the per-size
-       * classes corresponds to a range of ra_reg nodes.  This array stores
-       * those ranges in the form of first ra_reg in each class and the
-       * total number of ra_reg elements in the last array element.  This
-       * way the range of the i'th class is given by:
-       * [ class_to_ra_reg_range[i], class_to_ra_reg_range[i+1] )
-       */
-      int class_to_ra_reg_range[17];
-
-      /**
-       * Mapping for register-allocated objects in *regs to the first
-       * GRF for that object.
-       */
-      uint8_t *ra_reg_to_grf;
-
-      /**
-       * ra class for the aligned pairs we use for PLN, which doesn't
-       * appear in *classes.
-       */
-      int aligned_pairs_class;
-   } fs_reg_sets[2];
-
-   void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
-   void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
-
-   bool scalar_vs;
-   struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];
-};
-
 enum PACKED register_file {
    BAD_FILE,
    GRF,
@@ -225,7 +167,7 @@ protected:
    backend_shader(const struct brw_compiler *compiler,
                   void *log_data,
                   void *mem_ctx,
-                  nir_shader *shader,
+                  const nir_shader *shader,
                   struct brw_stage_prog_data *stage_prog_data);
 
 public:
@@ -234,7 +176,7 @@ public:
    void *log_data; /* Passed to compiler->*_log functions */
 
    const struct brw_device_info * const devinfo;
-   nir_shader *nir;
+   const nir_shader *nir;
    struct brw_stage_prog_data * const stage_prog_data;
 
    /** ralloc context for temporary data used during compile */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index e966b96a5ca..befc92445d3 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -280,6 +280,18 @@ vec4_instruction::can_do_source_mods(const struct brw_device_info *devinfo)
    return true;
 }
 
+bool
+vec4_instruction::can_change_types() const
+{
+   return dst.type == src[0].type &&
+          !src[0].abs && !src[0].negate && !saturate &&
+          (opcode == BRW_OPCODE_MOV ||
+           (opcode == BRW_OPCODE_SEL &&
+            dst.type == src[1].type &&
+            predicate != BRW_PREDICATE_NONE &&
+            !src[1].abs && !src[1].negate));
+}
+
 /**
  * Returns how many MRFs an opcode will write over.
  *
@@ -1632,28 +1644,11 @@ vec4_vs_visitor::setup_attributes(int payload_reg)
     */
    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
       attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
-      nr_attributes++;
    }
 
    lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);
 
-   /* The BSpec says we always have to read at least one thing from
-    * the VF, and it appears that the hardware wedges otherwise.
-    */
-   if (nr_attributes == 0)
-      nr_attributes = 1;
-
-   prog_data->urb_read_length = (nr_attributes + 1) / 2;
-
-   unsigned vue_entries =
-      MAX2(nr_attributes, prog_data->vue_map.num_slots);
-
-   if (devinfo->gen == 6)
-      prog_data->urb_entry_size = ALIGN(vue_entries, 8) / 8;
-   else
-      prog_data->urb_entry_size = ALIGN(vue_entries, 4) / 4;
-
-   return payload_reg + nr_attributes;
+   return payload_reg + vs_prog_data->nr_attributes;
 }
 
 int
@@ -1937,51 +1932,76 @@ extern "C" {
  * Returns the final assembly and the program's size.
  */
 const unsigned *
-brw_vs_emit(struct brw_context *brw,
-            void *mem_ctx,
-            const struct brw_vs_prog_key *key,
-            struct brw_vs_prog_data *prog_data,
-            struct gl_vertex_program *vp,
-            struct gl_shader_program *prog,
-            int shader_time_index,
-            unsigned *final_assembly_size)
+brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
+               const struct brw_vs_prog_key *key,
+               struct brw_vs_prog_data *prog_data,
+               const nir_shader *shader,
+               gl_clip_plane *clip_planes,
+               bool use_legacy_snorm_formula,
+               int shader_time_index,
+               unsigned *final_assembly_size,
+               char **error_str)
 {
    const unsigned *assembly = NULL;
 
-   if (brw->intelScreen->compiler->scalar_vs) {
+   unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read);
+
+   /* gl_VertexID and gl_InstanceID are system values, but arrive via an
+    * incoming vertex attribute.  So, add an extra slot.
+    */
+   if (shader->info.system_values_read &
+       (BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
+        BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
+      nr_attributes++;
+   }
+
+   /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
+    * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
+    * vec4 mode, the hardware appears to wedge unless we read something.
+    */
+   if (compiler->scalar_vs)
+      prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2);
+   else
+      prog_data->base.urb_read_length = DIV_ROUND_UP(MAX2(nr_attributes, 1), 2);
+
+   prog_data->nr_attributes = nr_attributes;
+
+   /* Since vertex shaders reuse the same VUE entry for inputs and outputs
+    * (overwriting the original contents), we need to make sure the size is
+    * the larger of the two.
+    */
+   const unsigned vue_entries =
+      MAX2(nr_attributes, (unsigned)prog_data->base.vue_map.num_slots);
+
+   if (compiler->devinfo->gen == 6)
+      prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8);
+   else
+      prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
+
+   if (compiler->scalar_vs) {
       prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
 
-      fs_visitor v(brw->intelScreen->compiler, brw,
-                   mem_ctx, key, &prog_data->base.base,
+      fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base,
                    NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */
-                   vp->Base.nir, 8, shader_time_index);
-      if (!v.run_vs(brw_select_clip_planes(&brw->ctx))) {
-         if (prog) {
-            prog->LinkStatus = false;
-            ralloc_strcat(&prog->InfoLog, v.fail_msg);
-         }
-
-         _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
-                       v.fail_msg);
+                   shader, 8, shader_time_index);
+      if (!v.run_vs(clip_planes)) {
+         if (error_str)
+            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
 
          return NULL;
       }
 
-      fs_generator g(brw->intelScreen->compiler, brw,
-                     mem_ctx, (void *) key, &prog_data->base.base,
-                     &vp->Base, v.promoted_constants,
+      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
+                     &prog_data->base.base, v.promoted_constants,
                      v.runtime_check_aads_emit, "VS");
       if (INTEL_DEBUG & DEBUG_VS) {
-         char *name;
-         if (prog) {
-            name = ralloc_asprintf(mem_ctx, "%s vertex shader %d",
-                                   prog->Label ? prog->Label : "unnamed",
-                                   prog->Name);
-         } else {
-            name = ralloc_asprintf(mem_ctx, "vertex program %d",
-                                   vp->Base.Id);
-         }
-         g.enable_debug(name);
+         const char *debug_name =
+            ralloc_asprintf(mem_ctx, "%s vertex shader %s",
+                            shader->info.label ? shader->info.label : "unnamed",
+                            shader->info.name);
+
+         g.enable_debug(debug_name);
       }
       g.generate_code(v.cfg, 8);
       assembly = g.get_assembly(final_assembly_size);
@@ -1990,26 +2010,19 @@ brw_vs_emit(struct brw_context *brw,
    if (!assembly) {
       prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
 
-      vec4_vs_visitor v(brw->intelScreen->compiler, brw, key, prog_data,
-                        vp->Base.nir, brw_select_clip_planes(&brw->ctx),
-                        mem_ctx, shader_time_index,
-                        !_mesa_is_gles3(&brw->ctx));
+      vec4_vs_visitor v(compiler, log_data, key, prog_data,
+                        shader, clip_planes, mem_ctx,
+                        shader_time_index, use_legacy_snorm_formula);
       if (!v.run()) {
-         if (prog) {
-            prog->LinkStatus = false;
-            ralloc_strcat(&prog->InfoLog, v.fail_msg);
-         }
-
-         _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
-                       v.fail_msg);
+         if (error_str)
+            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
 
          return NULL;
       }
 
-      vec4_generator g(brw->intelScreen->compiler, brw,
-                       prog, &vp->Base, &prog_data->base,
+      vec4_generator g(compiler, log_data, &prog_data->base,
                        mem_ctx, INTEL_DEBUG & DEBUG_VS, "vertex", "VS");
-      assembly = g.generate_assembly(v.cfg, final_assembly_size);
+      assembly = g.generate_assembly(v.cfg, final_assembly_size, shader);
    }
 
    return assembly;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 5e3500c0c9a..d861b2e85df 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -72,7 +72,7 @@ public:
                 void *log_data,
                 const struct brw_sampler_prog_key_data *key,
                 struct brw_vue_prog_data *prog_data,
-                nir_shader *shader,
+                const nir_shader *shader,
 		void *mem_ctx,
                 bool no_spills,
                 int shader_time_index);
@@ -391,8 +391,6 @@ class vec4_generator
 {
 public:
    vec4_generator(const struct brw_compiler *compiler, void *log_data,
-                  struct gl_shader_program *shader_prog,
-                  struct gl_program *prog,
                   struct brw_vue_prog_data *prog_data,
                   void *mem_ctx,
                   bool debug_flag,
@@ -400,10 +398,11 @@ public:
                   const char *stage_abbrev);
    ~vec4_generator();
 
-   const unsigned *generate_assembly(const cfg_t *cfg, unsigned *asm_size);
+   const unsigned *generate_assembly(const cfg_t *cfg, unsigned *asm_size,
+                                     const nir_shader *nir);
 
 private:
-   void generate_code(const cfg_t *cfg);
+   void generate_code(const cfg_t *cfg, const nir_shader *nir);
 
    void generate_math1_gen4(vec4_instruction *inst,
 			    struct brw_reg dst,
@@ -485,9 +484,6 @@ private:
 
    struct brw_codegen *p;
 
-   struct gl_shader_program *shader_prog;
-   const struct gl_program *prog;
-
    struct brw_vue_prog_data *prog_data;
 
    void *mem_ctx;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
index 610caef7dce..db99ecba35a 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
@@ -256,18 +256,6 @@ try_constant_propagate(const struct brw_device_info *devinfo,
 }
 
 static bool
-can_change_source_types(vec4_instruction *inst)
-{
-   return inst->dst.type == inst->src[0].type &&
-      !inst->src[0].abs && !inst->src[0].negate && !inst->saturate &&
-      (inst->opcode == BRW_OPCODE_MOV ||
-       (inst->opcode == BRW_OPCODE_SEL &&
-        inst->dst.type == inst->src[1].type &&
-        inst->predicate != BRW_PREDICATE_NONE &&
-        !inst->src[1].abs && !inst->src[1].negate));
-}
-
-static bool
 try_copy_propagate(const struct brw_device_info *devinfo,
                    vec4_instruction *inst,
                    int arg, struct copy_entry *entry)
@@ -325,7 +313,7 @@ try_copy_propagate(const struct brw_device_info *devinfo,
 
    if (has_source_modifiers &&
        value.type != inst->src[arg].type &&
-       !can_change_source_types(inst))
+       !inst->can_change_types())
       return false;
 
    if (has_source_modifiers &&
@@ -394,7 +382,7 @@ try_copy_propagate(const struct brw_device_info *devinfo,
    value.swizzle = composed_swizzle;
    if (has_source_modifiers &&
        value.type != inst->src[arg].type) {
-      assert(can_change_source_types(inst));
+      assert(inst->can_change_types());
       for (int i = 0; i < 3; i++) {
          inst->src[i].type = value.type;
       }
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index dcacc900540..a84f6c47471 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -21,6 +21,7 @@
  */
 
 #include <ctype.h>
+#include "glsl/glsl_parser_extras.h"
 #include "brw_vec4.h"
 #include "brw_cfg.h"
 
@@ -137,15 +138,13 @@ vec4_instruction::get_src(const struct brw_vue_prog_data *prog_data, int i)
 
 vec4_generator::vec4_generator(const struct brw_compiler *compiler,
                                void *log_data,
-                               struct gl_shader_program *shader_prog,
-                               struct gl_program *prog,
                                struct brw_vue_prog_data *prog_data,
                                void *mem_ctx,
                                bool debug_flag,
                                const char *stage_name,
                                const char *stage_abbrev)
    : compiler(compiler), log_data(log_data), devinfo(compiler->devinfo),
-     shader_prog(shader_prog), prog(prog), prog_data(prog_data),
+     prog_data(prog_data),
      mem_ctx(mem_ctx), stage_name(stage_name), stage_abbrev(stage_abbrev),
      debug_flag(debug_flag)
 {
@@ -1142,7 +1141,7 @@ vec4_generator::generate_set_simd4x2_header_gen9(vec4_instruction *inst,
 }
 
 void
-vec4_generator::generate_code(const cfg_t *cfg)
+vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir)
 {
    struct annotation_info annotation;
    memset(&annotation, 0, sizeof(annotation));
@@ -1648,14 +1647,10 @@ vec4_generator::generate_code(const cfg_t *cfg)
    int after_size = p->next_insn_offset;
 
    if (unlikely(debug_flag)) {
-      if (shader_prog) {
-         fprintf(stderr, "Native code for %s %s shader %d:\n",
-                 shader_prog->Label ? shader_prog->Label : "unnamed",
-                 stage_name, shader_prog->Name);
-      } else {
-         fprintf(stderr, "Native code for %s program %d:\n", stage_name,
-                 prog->Id);
-      }
+      fprintf(stderr, "Native code for %s %s shader %s:\n",
+              nir->info.label ? nir->info.label : "unnamed",
+              _mesa_shader_stage_to_string(nir->stage), nir->info.name);
+
       fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. Compacted %d to %d"
                       " bytes (%.0f%%)\n",
               stage_abbrev,
@@ -1663,7 +1658,7 @@ vec4_generator::generate_code(const cfg_t *cfg)
               100.0f * (before_size - after_size) / before_size);
 
       dump_assembly(p->store, annotation.ann_count, annotation.ann,
-                    p->devinfo, prog);
+                    p->devinfo);
       ralloc_free(annotation.ann);
    }
 
@@ -1676,10 +1671,11 @@ vec4_generator::generate_code(const cfg_t *cfg)
 
 const unsigned *
 vec4_generator::generate_assembly(const cfg_t *cfg,
-                                  unsigned *assembly_size)
+                                  unsigned *assembly_size,
+                                  const nir_shader *nir)
 {
    brw_set_default_access_mode(p, BRW_ALIGN_16);
-   generate_code(cfg);
+   generate_code(cfg, nir);
 
    return brw_get_program(p, assembly_size);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 4ce471e0669..a715cf5a6cb 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -30,14 +30,12 @@
 #include "brw_vec4_gs_visitor.h"
 #include "gen6_gs_visitor.h"
 
-const unsigned MAX_GS_INPUT_VERTICES = 6;
-
 namespace brw {
 
 vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler,
                                  void *log_data,
                                  struct brw_gs_compile *c,
-                                 nir_shader *shader,
+                                 const nir_shader *shader,
                                  void *mem_ctx,
                                  bool no_spills,
                                  int shader_time_index)
@@ -598,32 +596,17 @@ vec4_gs_visitor::gs_end_primitive()
    emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
 }
 
-static const unsigned *
-generate_assembly(struct brw_context *brw,
-                  struct gl_shader_program *shader_prog,
-                  struct gl_program *prog,
-                  struct brw_vue_prog_data *prog_data,
-                  void *mem_ctx,
-                  const cfg_t *cfg,
-                  unsigned *final_assembly_size)
-{
-   vec4_generator g(brw->intelScreen->compiler, brw,
-                    shader_prog, prog, prog_data, mem_ctx,
-                    INTEL_DEBUG & DEBUG_GS, "geometry", "GS");
-   return g.generate_assembly(cfg, final_assembly_size);
-}
-
 extern "C" const unsigned *
-brw_gs_emit(struct brw_context *brw,
-            struct gl_shader_program *prog,
-            struct brw_gs_compile *c,
-            void *mem_ctx,
-            int shader_time_index,
-            unsigned *final_assembly_size)
+brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
+               struct brw_gs_compile *c,
+               const nir_shader *shader,
+               struct gl_shader_program *shader_prog,
+               void *mem_ctx,
+               int shader_time_index,
+               unsigned *final_assembly_size,
+               char **error_str)
 {
-   struct gl_shader *shader = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
-
-   if (brw->gen >= 7) {
+   if (compiler->devinfo->gen >= 7) {
       /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
        * so without spilling. If the GS invocations count > 1, then we can't use
        * dual object mode.
@@ -632,13 +615,12 @@ brw_gs_emit(struct brw_context *brw,
           likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
          c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
 
-         vec4_gs_visitor v(brw->intelScreen->compiler, brw,
-                           c, shader->Program->nir,
+         vec4_gs_visitor v(compiler, log_data, c, shader,
                            mem_ctx, true /* no_spills */, shader_time_index);
          if (v.run()) {
-            return generate_assembly(brw, prog, &c->gp->program.Base,
-                                     &c->prog_data.base, mem_ctx, v.cfg,
-                                     final_assembly_size);
+            vec4_generator g(compiler, log_data, &c->prog_data.base, mem_ctx,
+                             INTEL_DEBUG & DEBUG_GS, "geometry", "GS");
+            return g.generate_assembly(v.cfg, final_assembly_size, shader);
          }
       }
    }
@@ -666,7 +648,7 @@ brw_gs_emit(struct brw_context *brw,
     * mode is more performant when invocations > 1. Gen6 only supports
     * SINGLE mode.
     */
-   if (c->prog_data.invocations <= 1 || brw->gen < 7)
+   if (c->prog_data.invocations <= 1 || compiler->devinfo->gen < 7)
       c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X1_SINGLE;
    else
       c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_INSTANCE;
@@ -674,24 +656,22 @@ brw_gs_emit(struct brw_context *brw,
    vec4_gs_visitor *gs = NULL;
    const unsigned *ret = NULL;
 
-   if (brw->gen >= 7)
-      gs = new vec4_gs_visitor(brw->intelScreen->compiler, brw,
-                               c, shader->Program->nir,
+   if (compiler->devinfo->gen >= 7)
+      gs = new vec4_gs_visitor(compiler, log_data, c, shader,
                                mem_ctx, false /* no_spills */,
                                shader_time_index);
    else
-      gs = new gen6_gs_visitor(brw->intelScreen->compiler, brw,
-                               c, prog, shader->Program->nir,
+      gs = new gen6_gs_visitor(compiler, log_data, c, shader_prog, shader,
                                mem_ctx, false /* no_spills */,
                                shader_time_index);
 
    if (!gs->run()) {
-      prog->LinkStatus = false;
-      ralloc_strcat(&prog->InfoLog, gs->fail_msg);
+      if (error_str)
+         *error_str = ralloc_strdup(mem_ctx, gs->fail_msg);
    } else {
-      ret = generate_assembly(brw, prog, &c->gp->program.Base,
-                              &c->prog_data.base, mem_ctx, gs->cfg,
-                              final_assembly_size);
+      vec4_generator g(compiler, log_data, &c->prog_data.base, mem_ctx,
+                       INTEL_DEBUG & DEBUG_GS, "geometry", "GS");
+      ret = g.generate_assembly(gs->cfg, final_assembly_size, shader);
    }
 
    delete gs;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
index 3ff195c3e68..c52552768c8 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
@@ -32,36 +32,6 @@
 
 #include "brw_vec4.h"
 
-/**
- * Scratch data used when compiling a GLSL geometry shader.
- */
-struct brw_gs_compile
-{
-   struct brw_gs_prog_key key;
-   struct brw_gs_prog_data prog_data;
-   struct brw_vue_map input_vue_map;
-
-   struct brw_geometry_program *gp;
-
-   unsigned control_data_bits_per_vertex;
-   unsigned control_data_header_size_bits;
-};
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-const unsigned *brw_gs_emit(struct brw_context *brw,
-                            struct gl_shader_program *prog,
-                            struct brw_gs_compile *c,
-                            void *mem_ctx,
-                            int shader_time_index,
-                            unsigned *final_assembly_size);
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
-
 #ifdef __cplusplus
 namespace brw {
 
@@ -71,7 +41,7 @@ public:
    vec4_gs_visitor(const struct brw_compiler *compiler,
                    void *log_data,
                    struct brw_gs_compile *c,
-                   nir_shader *shader,
+                   const nir_shader *shader,
                    void *mem_ctx,
                    bool no_spills,
                    int shader_time_index);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
index cc688ef8083..678237901f2 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
@@ -291,15 +291,15 @@ vec4_visitor::calculate_live_intervals()
       struct block_data *bd = &live_intervals->block_data[block->num];
 
       for (int i = 0; i < live_intervals->num_vars; i++) {
-	 if (BITSET_TEST(bd->livein, i)) {
-	    start[i] = MIN2(start[i], block->start_ip);
-	    end[i] = MAX2(end[i], block->start_ip);
-	 }
+         if (BITSET_TEST(bd->livein, i)) {
+            start[i] = MIN2(start[i], block->start_ip);
+            end[i] = MAX2(end[i], block->start_ip);
+         }
 
-	 if (BITSET_TEST(bd->liveout, i)) {
-	    start[i] = MIN2(start[i], block->end_ip);
-	    end[i] = MAX2(end[i], block->end_ip);
-	 }
+         if (BITSET_TEST(bd->liveout, i)) {
+            start[i] = MIN2(start[i], block->end_ip);
+            end[i] = MAX2(end[i], block->end_ip);
+         }
       }
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 41bd80df377..ea1e3e7bbcf 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -423,10 +423,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
    case nir_intrinsic_get_buffer_size: {
       nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
-      unsigned ubo_index = const_uniform_block ? const_uniform_block->u[0] : 0;
+      unsigned ssbo_index = const_uniform_block ? const_uniform_block->u[0] : 0;
 
-      src_reg surf_index = src_reg(prog_data->base.binding_table.ubo_start +
-                                   ubo_index);
+      src_reg surf_index = src_reg(prog_data->base.binding_table.ssbo_start +
+                                   ssbo_index);
       dst_reg result_dst = get_nir_dest(instr->dest);
       vec4_instruction *inst = new(mem_ctx)
          vec4_instruction(VS_OPCODE_GET_BUFFER_SIZE, result_dst);
@@ -456,18 +456,18 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       nir_const_value *const_uniform_block =
          nir_src_as_const_value(instr->src[1]);
       if (const_uniform_block) {
-         unsigned index = prog_data->base.binding_table.ubo_start +
+         unsigned index = prog_data->base.binding_table.ssbo_start +
                           const_uniform_block->u[0];
          surf_index = src_reg(index);
          brw_mark_surface_used(&prog_data->base, index);
       } else {
          surf_index = src_reg(this, glsl_type::uint_type);
          emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[1], 1),
-                  src_reg(prog_data->base.binding_table.ubo_start)));
+                  src_reg(prog_data->base.binding_table.ssbo_start)));
          surf_index = emit_uniformize(surf_index);
 
          brw_mark_surface_used(&prog_data->base,
-                               prog_data->base.binding_table.ubo_start +
+                               prog_data->base.binding_table.ssbo_start +
                                nir->info.num_ssbos - 1);
       }
 
@@ -599,7 +599,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
       src_reg surf_index;
       if (const_uniform_block) {
-         unsigned index = prog_data->base.binding_table.ubo_start +
+         unsigned index = prog_data->base.binding_table.ssbo_start +
                           const_uniform_block->u[0];
          surf_index = src_reg(index);
 
@@ -607,14 +607,14 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       } else {
          surf_index = src_reg(this, glsl_type::uint_type);
          emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], 1),
-                  src_reg(prog_data->base.binding_table.ubo_start)));
+                  src_reg(prog_data->base.binding_table.ssbo_start)));
          surf_index = emit_uniformize(surf_index);
 
          /* Assume this may touch any UBO. It would be nice to provide
           * a tighter bound, but the array information is already lowered away.
           */
          brw_mark_surface_used(&prog_data->base,
-                               prog_data->base.binding_table.ubo_start +
+                               prog_data->base.binding_table.ssbo_start +
                                nir->info.num_ssbos - 1);
       }
 
@@ -645,17 +645,17 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    case nir_intrinsic_ssbo_atomic_add:
       nir_emit_ssbo_atomic(BRW_AOP_ADD, instr);
       break;
-   case nir_intrinsic_ssbo_atomic_min:
-      if (dest.type == BRW_REGISTER_TYPE_D)
-         nir_emit_ssbo_atomic(BRW_AOP_IMIN, instr);
-      else
-         nir_emit_ssbo_atomic(BRW_AOP_UMIN, instr);
+   case nir_intrinsic_ssbo_atomic_imin:
+      nir_emit_ssbo_atomic(BRW_AOP_IMIN, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_umin:
+      nir_emit_ssbo_atomic(BRW_AOP_UMIN, instr);
       break;
-   case nir_intrinsic_ssbo_atomic_max:
-      if (dest.type == BRW_REGISTER_TYPE_D)
-         nir_emit_ssbo_atomic(BRW_AOP_IMAX, instr);
-      else
-         nir_emit_ssbo_atomic(BRW_AOP_UMAX, instr);
+   case nir_intrinsic_ssbo_atomic_imax:
+      nir_emit_ssbo_atomic(BRW_AOP_IMAX, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_umax:
+      nir_emit_ssbo_atomic(BRW_AOP_UMAX, instr);
       break;
    case nir_intrinsic_ssbo_atomic_and:
       nir_emit_ssbo_atomic(BRW_AOP_AND, instr);
@@ -765,7 +765,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
           */
          brw_mark_surface_used(&prog_data->base,
                                prog_data->base.binding_table.ubo_start +
-                               nir->info.num_ssbos - 1);
+                               nir->info.num_ubos - 1);
       }
 
       unsigned const_offset = instr->const_index[0];
@@ -821,20 +821,20 @@ vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr)
    src_reg surface;
    nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
    if (const_surface) {
-      unsigned surf_index = prog_data->base.binding_table.ubo_start +
+      unsigned surf_index = prog_data->base.binding_table.ssbo_start +
                             const_surface->u[0];
       surface = src_reg(surf_index);
       brw_mark_surface_used(&prog_data->base, surf_index);
    } else {
       surface = src_reg(this, glsl_type::uint_type);
       emit(ADD(dst_reg(surface), get_nir_src(instr->src[0]),
-               src_reg(prog_data->base.binding_table.ubo_start)));
+               src_reg(prog_data->base.binding_table.ssbo_start)));
 
       /* Assume this may touch any UBO. This is the same we do for other
        * UBO/SSBO accesses with non-constant surface.
        */
       brw_mark_surface_used(&prog_data->base,
-                            prog_data->base.binding_table.ubo_start +
+                            prog_data->base.binding_table.ssbo_start +
                             nir->info.num_ssbos - 1);
    }
 
@@ -1237,14 +1237,8 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
 
    case nir_op_b2i:
-      emit(AND(dst, op[0], src_reg(1)));
-      break;
-
    case nir_op_b2f:
-      op[0].type = BRW_REGISTER_TYPE_D;
-      dst.type = BRW_REGISTER_TYPE_D;
-      emit(AND(dst, op[0], src_reg(0x3f800000u)));
-      dst.type = BRW_REGISTER_TYPE_F;
+      emit(MOV(dst, negate(op[0])));
       break;
 
    case nir_op_f2b:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 98ea9be6ee4..5be9c6a6b2d 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1815,7 +1815,7 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
                            void *log_data,
                            const struct brw_sampler_prog_key_data *key_tex,
                            struct brw_vue_prog_data *prog_data,
-                           nir_shader *shader,
+                           const nir_shader *shader,
 			   void *mem_ctx,
                            bool no_spills,
                            int shader_time_index)
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
index b6e1971c2ee..485a80ee2fc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
@@ -301,7 +301,7 @@ vec4_vs_visitor::vec4_vs_visitor(const struct brw_compiler *compiler,
                                  void *log_data,
                                  const struct brw_vs_prog_key *key,
                                  struct brw_vs_prog_data *vs_prog_data,
-                                 nir_shader *shader,
+                                 const nir_shader *shader,
                                  gl_clip_plane *clip_planes,
                                  void *mem_ctx,
                                  int shader_time_index,
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 38de98fab86..ba680a98f7e 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -31,6 +31,7 @@
 
 
 #include "main/compiler.h"
+#include "main/context.h"
 #include "brw_context.h"
 #include "brw_vs.h"
 #include "brw_util.h"
@@ -57,18 +58,6 @@ brw_codegen_vs_prog(struct brw_context *brw,
    bool start_busy = false;
    double start_time = 0;
 
-   if (!vp->program.Base.nir) {
-      /* Normally we generate NIR in LinkShader() or
-       * ProgramStringNotify(), but Mesa's fixed-function vertex program
-       * handling doesn't notify the driver at all.  Just do it here, at
-       * the last minute, even though it's lame.
-       */
-      assert(vp->program.Base.Id == 0 && prog == NULL);
-      vp->program.Base.nir =
-         brw_create_nir(brw, NULL, &vp->program.Base, MESA_SHADER_VERTEX,
-                        brw->intelScreen->compiler->scalar_vs);
-   }
-
    if (prog)
       vs = (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX];
 
@@ -171,7 +160,7 @@ brw_codegen_vs_prog(struct brw_context *brw,
    }
 
    if (unlikely(INTEL_DEBUG & DEBUG_VS))
-      brw_dump_ir("vertex", prog, &vs->base, &vp->program.Base);
+      brw_dump_ir("vertex", prog, vs ? &vs->base : NULL, &vp->program.Base);
 
    int st_index = -1;
    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
@@ -179,9 +168,20 @@ brw_codegen_vs_prog(struct brw_context *brw,
 
    /* Emit GEN4 code.
     */
-   program = brw_vs_emit(brw, mem_ctx, key, &prog_data,
-                         &vp->program, prog, st_index, &program_size);
+   char *error_str = NULL; /* never hand an indeterminate pointer to the log calls below */
+   program = brw_compile_vs(brw->intelScreen->compiler, brw, mem_ctx, key,
+                            &prog_data, vp->program.Base.nir,
+                            brw_select_clip_planes(&brw->ctx),
+                            !_mesa_is_gles3(&brw->ctx),
+                            st_index, &program_size, &error_str);
    if (program == NULL) {
+      if (prog) {
+         prog->LinkStatus = false;
+         ralloc_strcat(&prog->InfoLog, error_str);
+      }
+
+      _mesa_problem(NULL, "Failed to compile vertex shader: %s\n", error_str);
+
       ralloc_free(mem_ctx);
       return false;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index f1242f61b33..bcb5e7b0b2a 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -54,14 +54,6 @@
 extern "C" {
 #endif
 
-const unsigned *brw_vs_emit(struct brw_context *brw,
-                            void *mem_ctx,
-                            const struct brw_vs_prog_key *key,
-                            struct brw_vs_prog_data *prog_data,
-                            struct gl_vertex_program *vp,
-                            struct gl_shader_program *shader_prog,
-                            int shader_time_index,
-                            unsigned *program_size);
 void brw_vs_debug_recompile(struct brw_context *brw,
                             struct gl_shader_program *prog,
                             const struct brw_vs_prog_key *key);
@@ -88,7 +80,7 @@ public:
                    void *log_data,
                    const struct brw_vs_prog_key *key,
                    struct brw_vs_prog_data *vs_prog_data,
-                   nir_shader *shader,
+                   const nir_shader *shader,
                    gl_clip_plane *clip_planes,
                    void *mem_ctx,
                    int shader_time_index,
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index 9bb48eb2e27..f65258a52a5 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -201,7 +201,7 @@ brw_upload_vs_image_surfaces(struct brw_context *brw)
       ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX];
 
    if (prog) {
-      /* BRW_NEW_VS_PROG_DATA, BRW_NEW_IMAGE_UNITS */
+      /* BRW_NEW_VS_PROG_DATA, BRW_NEW_IMAGE_UNITS, _NEW_TEXTURE */
       brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_VERTEX],
                                 &brw->vs.base, &brw->vs.prog_data->base.base);
    }
@@ -209,6 +209,7 @@ brw_upload_vs_image_surfaces(struct brw_context *brw)
 
 const struct brw_tracked_state brw_vs_image_surfaces = {
    .dirty = {
+      .mesa = _NEW_TEXTURE,
       .brw = BRW_NEW_BATCH |
              BRW_NEW_IMAGE_UNITS |
              BRW_NEW_VERTEX_PROGRAM |
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 4d5e7f67bd6..5c49db9e63e 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -39,89 +39,6 @@
 
 #include "util/ralloc.h"
 
-/**
- * Return a bitfield where bit n is set if barycentric interpolation mode n
- * (see enum brw_wm_barycentric_interp_mode) is needed by the fragment shader.
- */
-static unsigned
-brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo,
-                                     bool shade_model_flat,
-                                     bool persample_shading,
-                                     nir_shader *shader)
-{
-   unsigned barycentric_interp_modes = 0;
-
-   nir_foreach_variable(var, &shader->inputs) {
-      enum glsl_interp_qualifier interp_qualifier = var->data.interpolation;
-      bool is_centroid = var->data.centroid && !persample_shading;
-      bool is_sample = var->data.sample || persample_shading;
-      bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) ||
-                         (var->data.location == VARYING_SLOT_COL1);
-
-      /* Ignore WPOS and FACE, because they don't require interpolation. */
-      if (var->data.location == VARYING_SLOT_POS ||
-          var->data.location == VARYING_SLOT_FACE)
-         continue;
-
-      /* Determine the set (or sets) of barycentric coordinates needed to
-       * interpolate this variable.  Note that when
-       * brw->needs_unlit_centroid_workaround is set, centroid interpolation
-       * uses PIXEL interpolation for unlit pixels and CENTROID interpolation
-       * for lit pixels, so we need both sets of barycentric coordinates.
-       */
-      if (interp_qualifier == INTERP_QUALIFIER_NOPERSPECTIVE) {
-         if (is_centroid) {
-            barycentric_interp_modes |=
-               1 << BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
-         } else if (is_sample) {
-            barycentric_interp_modes |=
-               1 << BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
-         }
-         if ((!is_centroid && !is_sample) ||
-             devinfo->needs_unlit_centroid_workaround) {
-            barycentric_interp_modes |=
-               1 << BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
-         }
-      } else if (interp_qualifier == INTERP_QUALIFIER_SMOOTH ||
-                 (!(shade_model_flat && is_gl_Color) &&
-                  interp_qualifier == INTERP_QUALIFIER_NONE)) {
-         if (is_centroid) {
-            barycentric_interp_modes |=
-               1 << BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
-         } else if (is_sample) {
-            barycentric_interp_modes |=
-               1 << BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
-         }
-         if ((!is_centroid && !is_sample) ||
-             devinfo->needs_unlit_centroid_workaround) {
-            barycentric_interp_modes |=
-               1 << BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
-         }
-      }
-   }
-
-   return barycentric_interp_modes;
-}
-
-static uint8_t
-computed_depth_mode(struct gl_fragment_program *fp)
-{
-   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
-      switch (fp->FragDepthLayout) {
-      case FRAG_DEPTH_LAYOUT_NONE:
-      case FRAG_DEPTH_LAYOUT_ANY:
-         return BRW_PSCDEPTH_ON;
-      case FRAG_DEPTH_LAYOUT_GREATER:
-         return BRW_PSCDEPTH_ON_GE;
-      case FRAG_DEPTH_LAYOUT_LESS:
-         return BRW_PSCDEPTH_ON_LE;
-      case FRAG_DEPTH_LAYOUT_UNCHANGED:
-         return BRW_PSCDEPTH_OFF;
-      }
-   }
-   return BRW_PSCDEPTH_OFF;
-}
-
 static void
 assign_fs_binding_table_offsets(const struct brw_device_info *devinfo,
                                 const struct gl_shader_program *shader_prog,
@@ -166,15 +83,6 @@ brw_codegen_wm_prog(struct brw_context *brw,
       fs = (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
 
    memset(&prog_data, 0, sizeof(prog_data));
-   /* key->alpha_test_func means simulating alpha testing via discards,
-    * so the shader definitely kills pixels.
-    */
-   prog_data.uses_kill = fp->program.UsesKill || key->alpha_test_func;
-   prog_data.uses_omask =
-      fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
-   prog_data.computed_depth_mode = computed_depth_mode(&fp->program);
-
-   prog_data.early_fragment_tests = fs && fs->base.EarlyFragmentTests;
 
    /* Use ALT floating point mode for ARB programs so that 0^0 == 1. */
    if (!prog)
@@ -209,12 +117,6 @@ brw_codegen_wm_prog(struct brw_context *brw,
                                  &prog_data.base);
    }
 
-   prog_data.barycentric_interp_modes =
-      brw_compute_barycentric_interp_modes(brw->intelScreen->devinfo,
-                                           key->flat_shade,
-                                           key->persample_shading,
-                                           fp->program.Base.nir);
-
    if (unlikely(brw->perf_debug)) {
       start_busy = (brw->batch.last_bo &&
                     drm_intel_bo_busy(brw->batch.last_bo));
@@ -222,7 +124,7 @@ brw_codegen_wm_prog(struct brw_context *brw,
    }
 
    if (unlikely(INTEL_DEBUG & DEBUG_WM))
-      brw_dump_ir("fragment", prog, &fs->base, &fp->program.Base);
+      brw_dump_ir("fragment", prog, fs ? &fs->base : NULL, &fp->program.Base);
 
    int st_index8 = -1, st_index16 = -1;
    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
@@ -230,9 +132,19 @@ brw_codegen_wm_prog(struct brw_context *brw,
       st_index16 = brw_get_shader_time_index(brw, prog, &fp->program.Base, ST_FS16);
    }
 
-   program = brw_wm_fs_emit(brw, mem_ctx, key, &prog_data,
-                            &fp->program, prog, st_index8, st_index16, &program_size);
+   char *error_str = NULL;
+   program = brw_compile_fs(brw->intelScreen->compiler, brw, mem_ctx,
+                            key, &prog_data, fp->program.Base.nir,
+                            &fp->program.Base, st_index8, st_index16,
+                            brw->use_rep_send, &program_size, &error_str);
    if (program == NULL) {
+      if (prog) {
+         prog->LinkStatus = false;
+         ralloc_strcat(&prog->InfoLog, error_str);
+      }
+
+      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n", error_str);
+
       ralloc_free(mem_ctx);
       return false;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 6ee22b2f907..53a642ee8bb 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -61,21 +61,6 @@
 extern "C" {
 #endif
 
-/**
- * Compile a fragment shader.
- *
- * Returns the final assembly and the program's size.
- */
-const unsigned *brw_wm_fs_emit(struct brw_context *brw,
-                               void *mem_ctx,
-                               const struct brw_wm_prog_key *key,
-                               struct brw_wm_prog_data *prog_data,
-                               struct gl_fragment_program *fp,
-                               struct gl_shader_program *prog,
-                               int shader_time_index8,
-                               int shader_time_index16,
-                               unsigned *final_assembly_size);
-
 GLboolean brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog);
 struct gl_shader *brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type);
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index c671e23827e..6ebe6481c32 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -34,6 +34,7 @@
 #include "main/blend.h"
 #include "main/mtypes.h"
 #include "main/samplerobj.h"
+#include "main/shaderimage.h"
 #include "program/prog_parameter.h"
 #include "main/framebuffer.h"
 
@@ -925,54 +926,53 @@ brw_upload_ubo_surfaces(struct brw_context *brw,
    if (!shader)
       return;
 
-   uint32_t *surf_offsets =
+   uint32_t *ubo_surf_offsets =
       &stage_state->surf_offset[prog_data->binding_table.ubo_start];
 
    for (int i = 0; i < shader->NumUniformBlocks; i++) {
-      struct intel_buffer_object *intel_bo;
+      struct gl_uniform_buffer_binding *binding =
+         &ctx->UniformBufferBindings[shader->UniformBlocks[i]->Binding];
 
-      /* Because behavior for referencing outside of the binding's size in the
-       * glBindBufferRange case is undefined, we can just bind the whole buffer
-       * glBindBufferBase wants and be a correct implementation.
-       */
-      if (!shader->UniformBlocks[i].IsShaderStorage) {
-         struct gl_uniform_buffer_binding *binding;
-         binding =
-            &ctx->UniformBufferBindings[shader->UniformBlocks[i].Binding];
-         if (binding->BufferObject == ctx->Shared->NullBufferObj) {
-            brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, &surf_offsets[i]);
-         } else {
-            intel_bo = intel_buffer_object(binding->BufferObject);
-            drm_intel_bo *bo =
-               intel_bufferobj_buffer(brw, intel_bo,
-                                      binding->Offset,
-                                      binding->BufferObject->Size - binding->Offset);
-            brw_create_constant_surface(brw, bo, binding->Offset,
-                                        binding->BufferObject->Size - binding->Offset,
-                                        &surf_offsets[i],
-                                        dword_pitch);
-         }
+      if (binding->BufferObject == ctx->Shared->NullBufferObj) {
+         brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, &ubo_surf_offsets[i]);
       } else {
-         struct gl_shader_storage_buffer_binding *binding;
-         binding =
-            &ctx->ShaderStorageBufferBindings[shader->UniformBlocks[i].Binding];
-         if (binding->BufferObject == ctx->Shared->NullBufferObj) {
-            brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, &surf_offsets[i]);
-         } else {
-            intel_bo = intel_buffer_object(binding->BufferObject);
-            drm_intel_bo *bo =
-               intel_bufferobj_buffer(brw, intel_bo,
-                                      binding->Offset,
-                                      binding->BufferObject->Size - binding->Offset);
-            brw_create_buffer_surface(brw, bo, binding->Offset,
-                                      binding->BufferObject->Size - binding->Offset,
-                                      &surf_offsets[i],
-                                      dword_pitch);
-         }
+         struct intel_buffer_object *intel_bo =
+            intel_buffer_object(binding->BufferObject);
+         drm_intel_bo *bo =
+            intel_bufferobj_buffer(brw, intel_bo,
+                                   binding->Offset,
+                                   binding->BufferObject->Size - binding->Offset);
+         brw_create_constant_surface(brw, bo, binding->Offset,
+                                     binding->BufferObject->Size - binding->Offset,
+                                     &ubo_surf_offsets[i],
+                                     dword_pitch);
+      }
+   }
+
+   uint32_t *ssbo_surf_offsets =
+      &stage_state->surf_offset[prog_data->binding_table.ssbo_start];
+
+   for (int i = 0; i < shader->NumShaderStorageBlocks; i++) {
+      struct gl_shader_storage_buffer_binding *binding =
+         &ctx->ShaderStorageBufferBindings[shader->ShaderStorageBlocks[i]->Binding];
+
+      if (binding->BufferObject == ctx->Shared->NullBufferObj) {
+         brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, &ssbo_surf_offsets[i]);
+      } else {
+         struct intel_buffer_object *intel_bo =
+            intel_buffer_object(binding->BufferObject);
+         drm_intel_bo *bo =
+            intel_bufferobj_buffer(brw, intel_bo,
+                                   binding->Offset,
+                                   binding->BufferObject->Size - binding->Offset);
+         brw_create_buffer_surface(brw, bo, binding->Offset,
+                                   binding->BufferObject->Size - binding->Offset,
+                                   &ssbo_surf_offsets[i],
+                                   dword_pitch);
       }
    }
 
-   if (shader->NumUniformBlocks)
+   if (shader->NumUniformBlocks || shader->NumShaderStorageBlocks)
       brw->ctx.NewDriverState |= BRW_NEW_SURFACES;
 }
 
@@ -1112,7 +1112,7 @@ brw_upload_cs_image_surfaces(struct brw_context *brw)
       ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE];
 
    if (prog) {
-      /* BRW_NEW_CS_PROG_DATA, BRW_NEW_IMAGE_UNITS */
+      /* BRW_NEW_CS_PROG_DATA, BRW_NEW_IMAGE_UNITS, _NEW_TEXTURE */
       brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_COMPUTE],
                                 &brw->cs.base, &brw->cs.prog_data->base);
    }
@@ -1120,7 +1120,7 @@ brw_upload_cs_image_surfaces(struct brw_context *brw)
 
 const struct brw_tracked_state brw_cs_image_surfaces = {
    .dirty = {
-      .mesa = _NEW_PROGRAM,
+      .mesa = _NEW_TEXTURE | _NEW_PROGRAM,
       .brw = BRW_NEW_BATCH |
              BRW_NEW_CS_PROG_DATA |
              BRW_NEW_IMAGE_UNITS
@@ -1253,7 +1253,7 @@ update_image_surface(struct brw_context *brw,
                      uint32_t *surf_offset,
                      struct brw_image_param *param)
 {
-   if (u->_Valid) {
+   if (_mesa_is_image_unit_valid(&brw->ctx, u)) {
       struct gl_texture_object *obj = u->TexObj;
       const unsigned format = get_image_format(brw, u->_ActualFormat, access);
 
@@ -1338,7 +1338,7 @@ brw_upload_wm_image_surfaces(struct brw_context *brw)
    struct gl_shader_program *prog = ctx->Shader._CurrentFragmentProgram;
 
    if (prog) {
-      /* BRW_NEW_FS_PROG_DATA, BRW_NEW_IMAGE_UNITS */
+      /* BRW_NEW_FS_PROG_DATA, BRW_NEW_IMAGE_UNITS, _NEW_TEXTURE */
       brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_FRAGMENT],
                                 &brw->wm.base, &brw->wm.prog_data->base);
    }
@@ -1346,6 +1346,7 @@ brw_upload_wm_image_surfaces(struct brw_context *brw)
 
 const struct brw_tracked_state brw_wm_image_surfaces = {
    .dirty = {
+      .mesa = _NEW_TEXTURE,
       .brw = BRW_NEW_BATCH |
              BRW_NEW_FRAGMENT_PROGRAM |
              BRW_NEW_FS_PROG_DATA |
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
index 59a76559103..671a535a5bd 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
@@ -31,8 +31,6 @@
 
 #include "gen6_gs_visitor.h"
 
-const unsigned MAX_GS_INPUT_VERTICES = 6;
-
 namespace brw {
 
 void
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
index e75d6aa10b8..d02c67d8a74 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
@@ -39,7 +39,7 @@ public:
                    void *log_data,
                    struct brw_gs_compile *c,
                    struct gl_shader_program *prog,
-                   nir_shader *shader,
+                   const nir_shader *shader,
                    void *mem_ctx,
                    bool no_spills,
                    int shader_time_index) :
diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c b/src/mesa/drivers/dri/i965/gen7_gs_state.c
index 497ecec8e45..8d6d3fe1d34 100644
--- a/src/mesa/drivers/dri/i965/gen7_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c
@@ -59,9 +59,7 @@ upload_gs_state(struct brw_context *brw)
       OUT_BATCH(((ALIGN(stage_state->sampler_count, 4)/4) <<
                  GEN6_GS_SAMPLER_COUNT_SHIFT) |
                 ((brw->gs.prog_data->base.base.binding_table.size_bytes / 4) <<
-                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
-                (brw->is_haswell && prog_data->base.nr_image_params ?
-                 HSW_GS_UAV_ACCESS_ENABLE : 0));
+                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
 
       if (brw->gs.prog_data->base.base.total_scratch) {
          OUT_RELOC(stage_state->scratch_bo,
diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
index b7e48585482..a18dc697651 100644
--- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
@@ -126,9 +126,7 @@ upload_vs_state(struct brw_context *brw)
 	     ((ALIGN(stage_state->sampler_count, 4)/4) <<
               GEN6_VS_SAMPLER_COUNT_SHIFT) |
              ((brw->vs.prog_data->base.base.binding_table.size_bytes / 4) <<
-              GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
-             (brw->is_haswell && prog_data->base.nr_image_params ?
-              HSW_VS_UAV_ACCESS_ENABLE : 0));
+              GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
 
    if (prog_data->base.total_scratch) {
       OUT_RELOC(stage_state->scratch_bo,
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index fd6dab5be8b..06d5e65786b 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -113,7 +113,14 @@ upload_wm_state(struct brw_context *brw)
    else if (prog_data->base.nr_image_params)
       dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC;
 
-   /* _NEW_BUFFERS | _NEW_COLOR */
+   /* The "UAV access enable" bits are unnecessary on HSW because they only
+    * seem to have an effect on the HW-assisted coherency mechanism which we
+    * don't need, and the rasterization-related UAV_ONLY flag and the
+    * DISPATCH_ENABLE bit can be set independently from it.
+    * Cf. gen8_upload_ps_extra().
+    *
+    * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS | _NEW_COLOR
+    */
    if (brw->is_haswell &&
        !(brw_color_buffer_write_enabled(brw) || writes_depth) &&
        prog_data->base.nr_image_params)
@@ -221,9 +228,6 @@ gen7_upload_ps_state(struct brw_context *brw,
       _mesa_get_min_invocations_per_fragment(ctx, fp, false);
    assert(min_inv_per_frag >= 1);
 
-   if (brw->is_haswell && prog_data->base.nr_image_params)
-      dw4 |= HSW_PS_UAV_ACCESS_ENABLE;
-
    if (prog_data->prog_offset_16 || prog_data->no_8) {
       dw4 |= GEN7_PS_16_DISPATCH_ENABLE;
       if (!prog_data->no_8 && min_inv_per_frag == 1) {
diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c b/src/mesa/drivers/dri/i965/gen8_gs_state.c
index 4195f4cf4a7..d766ca7bebf 100644
--- a/src/mesa/drivers/dri/i965/gen8_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c
@@ -52,9 +52,7 @@ gen8_upload_gs_state(struct brw_context *brw)
                 ((ALIGN(stage_state->sampler_count, 4)/4) <<
                  GEN6_GS_SAMPLER_COUNT_SHIFT) |
                 ((prog_data->base.binding_table.size_bytes / 4) <<
-                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
-                (prog_data->base.nr_image_params ?
-                 HSW_GS_UAV_ACCESS_ENABLE : 0));
+                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
 
       if (brw->gs.prog_data->base.base.total_scratch) {
          OUT_RELOC64(stage_state->scratch_bo,
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index a686fed704f..8f0507413a7 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -25,6 +25,7 @@
 #include "program/program.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "brw_wm.h"
 #include "intel_batchbuffer.h"
 
 void
@@ -65,8 +66,33 @@ gen8_upload_ps_extra(struct brw_context *brw,
    if (brw->gen >= 9 && prog_data->pulls_bary)
       dw1 |= GEN9_PSX_SHADER_PULLS_BARY;
 
-   if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx) ||
-       prog_data->base.nr_image_params)
+   /* The stricter cross-primitive coherency guarantees that the hardware
+    * gives us with the "Accesses UAV" bit set for at least one shader stage
+    * and the "UAV coherency required" bit set on the 3DPRIMITIVE command are
+    * redundant within the current image, atomic counter and SSBO GL APIs,
+    * which all have very loose ordering and coherency requirements and
+    * generally rely on the application to insert explicit barriers when a
+    * shader invocation is expected to see the memory writes performed by the
+    * invocations of some previous primitive.  Regardless of the value of "UAV
+    * coherency required", the "Accesses UAV" bits will implicitly cause a DC
+    * flush (useless in most cases) when the lowermost stage with the bit set
+    * finishes execution.
+    *
+    * It would be nice to disable it, but in some cases we can't because on
+    * Gen8+ it also has an influence on rasterization via the PS UAV-only
+    * signal (which could be set independently from the coherency mechanism in
+    * the 3DSTATE_WM command on Gen7), and because in some cases it will
+    * determine whether the hardware skips execution of the fragment shader or
+    * not via the ThreadDispatchEnable signal.  However if we know that
+    * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
+    * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
+    * difference so we may just disable it here.
+    *
+    * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR
+    */
+   if ((_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx) ||
+        prog_data->base.nr_image_params) &&
+       !brw_color_buffer_write_enabled(brw))
       dw1 |= GEN8_PSX_SHADER_HAS_UAV;
 
    BEGIN_BATCH(2);
@@ -91,7 +117,7 @@ upload_ps_extra(struct brw_context *brw)
 
 const struct brw_tracked_state gen8_ps_extra = {
    .dirty = {
-      .mesa  = 0,
+      .mesa  = _NEW_BUFFERS | _NEW_COLOR,
       .brw   = BRW_NEW_CONTEXT |
                BRW_NEW_FRAGMENT_PROGRAM |
                BRW_NEW_FS_PROG_DATA |
diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c
index e1e7704655d..18b86652fd2 100644
--- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
@@ -221,8 +221,8 @@ gen8_emit_texture_surface_state(struct brw_context *brw,
        * "When Auxiliary Surface Mode is set to AUX_CCS_D or AUX_CCS_E, HALIGN
        *  16 must be used."
        */
-      assert(brw->gen < 9 || mt->halign == 16);
-      assert(brw->gen < 8 || mt->num_samples > 1 || mt->halign == 16);
+      if (brw->gen >= 9 || mt->num_samples == 1)
+         assert(mt->halign == 16);
    }
 
    const uint32_t surf_type = translate_tex_target(target);
@@ -470,8 +470,8 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
        * "When Auxiliary Surface Mode is set to AUX_CCS_D or AUX_CCS_E, HALIGN
        *  16 must be used."
        */
-      assert(brw->gen < 9 || mt->halign == 16);
-      assert(brw->gen < 8 || mt->num_samples > 1 || mt->halign == 16);
+      if (brw->gen >= 9 || mt->num_samples == 1)
+         assert(mt->halign == 16);
    }
 
    uint32_t *surf = allocate_surface_state(brw, &offset, surf_index);
diff --git a/src/mesa/drivers/dri/i965/gen8_vs_state.c b/src/mesa/drivers/dri/i965/gen8_vs_state.c
index 8b5048bee7e..28f5adddf14 100644
--- a/src/mesa/drivers/dri/i965/gen8_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_vs_state.c
@@ -53,9 +53,7 @@ upload_vs_state(struct brw_context *brw)
              ((ALIGN(stage_state->sampler_count, 4) / 4) <<
                GEN6_VS_SAMPLER_COUNT_SHIFT) |
              ((prog_data->base.binding_table.size_bytes / 4) <<
-               GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
-             (prog_data->base.nr_image_params ?
-              HSW_VS_UAV_ACCESS_ENABLE : 0));
+               GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
 
    if (prog_data->base.total_scratch) {
       OUT_RELOC64(stage_state->scratch_bo,
diff --git a/src/mesa/drivers/dri/i965/intel_asm_annotation.c b/src/mesa/drivers/dri/i965/intel_asm_annotation.c
index bb8bb8d38c9..b3d6324a5fe 100644
--- a/src/mesa/drivers/dri/i965/intel_asm_annotation.c
+++ b/src/mesa/drivers/dri/i965/intel_asm_annotation.c
@@ -33,8 +33,7 @@
 
 void
 dump_assembly(void *assembly, int num_annotations, struct annotation *annotation,
-              const struct brw_device_info *devinfo,
-              const struct gl_program *prog)
+              const struct brw_device_info *devinfo)
 {
    const char *last_annotation_string = NULL;
    const void *last_annotation_ir = NULL;
@@ -57,19 +56,7 @@ dump_assembly(void *assembly, int num_annotations, struct annotation *annotation
          last_annotation_ir = annotation[i].ir;
          if (last_annotation_ir) {
             fprintf(stderr, "   ");
-            if (prog->nir)
-               nir_print_instr(annotation[i].ir, stderr);
-            else if (!prog->Instructions)
-               fprint_ir(stderr, annotation[i].ir);
-            else {
-               const struct prog_instruction *pi =
-                  (const struct prog_instruction *)annotation[i].ir;
-               fprintf(stderr, "%d: ",
-                       (int)(pi - prog->Instructions));
-               _mesa_fprint_instruction_opt(stderr,
-                                            pi,
-                                            0, PROG_PRINT_DEBUG, NULL);
-            }
+            nir_print_instr(annotation[i].ir, stderr);
             fprintf(stderr, "\n");
          }
       }
diff --git a/src/mesa/drivers/dri/i965/intel_asm_annotation.h b/src/mesa/drivers/dri/i965/intel_asm_annotation.h
index d9c69bc41b0..6c72326f058 100644
--- a/src/mesa/drivers/dri/i965/intel_asm_annotation.h
+++ b/src/mesa/drivers/dri/i965/intel_asm_annotation.h
@@ -60,8 +60,7 @@ struct annotation_info {
 
 void
 dump_assembly(void *assembly, int num_annotations, struct annotation *annotation,
-              const struct brw_device_info *devinfo,
-              const struct gl_program *prog);
+              const struct brw_device_info *devinfo);
 
 void
 annotate(const struct brw_device_info *devinfo,
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index a169c41790e..b6e35205727 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -201,6 +201,14 @@ intel_miptree_supports_non_msrt_fast_clear(struct brw_context *brw,
    if (brw->gen < 7)
       return false;
 
+   if (brw->gen >= 9) {
+      /* FINISHME: Enable singlesample fast MCS clears on SKL after all GPU
+       * FINISHME: hangs are resolved.
+       */
+      perf_debug("singlesample fast MCS clears disabled on gen9");
+      return false;
+   }
+
    if (mt->disable_aux_buffers)
       return false;
 
diff --git a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
index 8adb626d420..5f80f90a91d 100644
--- a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
@@ -66,7 +66,7 @@ void cmod_propagation_test::SetUp()
 
    v = new cmod_propagation_fs_visitor(compiler, prog_data, shader);
 
-   _mesa_init_fragment_program(ctx, &fp->program, GL_FRAGMENT_SHADER, 0);
+   _mesa_init_gl_program(&fp->program.Base, GL_FRAGMENT_SHADER, 0);
 
    devinfo->gen = 4;
 }
diff --git a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
index f77b18e7db8..32e8b8f8867 100644
--- a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
@@ -66,7 +66,7 @@ void saturate_propagation_test::SetUp()
 
    v = new saturate_propagation_fs_visitor(compiler, prog_data, shader);
 
-   _mesa_init_fragment_program(ctx, &fp->program, GL_FRAGMENT_SHADER, 0);
+   _mesa_init_gl_program(&fp->program.Base, GL_FRAGMENT_SHADER, 0);
 
    devinfo->gen = 4;
 }
diff --git a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
index 40253961a65..e80b71b558d 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
@@ -98,7 +98,7 @@ void copy_propagation_test::SetUp()
 
    v = new copy_propagation_vec4_visitor(compiler, shader);
 
-   _mesa_init_vertex_program(ctx, &vp->program, GL_VERTEX_SHADER, 0);
+   _mesa_init_gl_program(&vp->program.Base, GL_VERTEX_SHADER, 0);
 
    devinfo->gen = 4;
 }
diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
index 76028d36311..2f824617454 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
@@ -101,7 +101,7 @@ void register_coalesce_test::SetUp()
 
    v = new register_coalesce_vec4_visitor(compiler, shader);
 
-   _mesa_init_vertex_program(ctx, &vp->program, GL_VERTEX_SHADER, 0);
+   _mesa_init_gl_program(&vp->program.Base, GL_VERTEX_SHADER, 0);
 
    devinfo->gen = 4;
 }
diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.c b/src/mesa/drivers/dri/r200/r200_vertprog.c
index d43eaf977fc..628c5708090 100644
--- a/src/mesa/drivers/dri/r200/r200_vertprog.c
+++ b/src/mesa/drivers/dri/r200/r200_vertprog.c
@@ -1200,18 +1200,19 @@ r200BindProgram(struct gl_context *ctx, GLenum target, struct gl_program *prog)
 static struct gl_program *
 r200NewProgram(struct gl_context *ctx, GLenum target, GLuint id)
 {
-   struct r200_vertex_program *vp;
-
    switch(target){
-   case GL_VERTEX_PROGRAM_ARB:
-      vp = CALLOC_STRUCT(r200_vertex_program);
-      return _mesa_init_vertex_program(ctx, &vp->mesa_program, target, id);
-   case GL_FRAGMENT_PROGRAM_ARB:
-      return _mesa_init_fragment_program( ctx, CALLOC_STRUCT(gl_fragment_program), target, id );
+   case GL_VERTEX_PROGRAM_ARB: { /* allocated as the r200 wrapper struct */
+      struct r200_vertex_program *vp = CALLOC_STRUCT(r200_vertex_program);
+      return _mesa_init_gl_program(&vp->mesa_program.Base, target, id);
+   }
+   case GL_FRAGMENT_PROGRAM_ARB: { /* no driver-private wrapper here */
+      struct gl_fragment_program *prog = CALLOC_STRUCT(gl_fragment_program);
+      return _mesa_init_gl_program(&prog->Base, target, id);
+   }
    default:
       _mesa_problem(ctx, "Bad target in r200NewProgram");
+      return NULL; /* bad target: no program object was allocated */
    }
-   return NULL;	
 }
 
 
diff --git a/src/mesa/drivers/x11/SConscript b/src/mesa/drivers/x11/SConscript
index d29f9874f44..cd5cccda0d1 100644
--- a/src/mesa/drivers/x11/SConscript
+++ b/src/mesa/drivers/x11/SConscript
@@ -4,6 +4,8 @@ env = env.Clone()
 
 env.Append(CPPPATH = [
     '#/src',
+    '#/src/glsl',
+    '#/src/glsl/nir',
     '#/src/mapi',
     '#/src/mesa',
     '#/src/mesa/main',
diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c
index dee5e29d5b8..20aa4980935 100644
--- a/src/mesa/main/blend.c
+++ b/src/mesa/main/blend.c
@@ -190,6 +190,19 @@ update_uses_dual_src(struct gl_context *ctx, int buf)
        blend_factor_is_dual_src(ctx->Color.Blend[buf].DstA));
 }
 
+
+/**
+ * Count of per-buffer blend states that the non-indexed blend entry points
+ * update: Const.MaxDrawBuffers with ARB_draw_buffers_blend, otherwise 1.
+ */
+static inline unsigned
+num_buffers(const struct gl_context *ctx)
+{
+   const bool per_buffer = ctx->Extensions.ARB_draw_buffers_blend;
+   return per_buffer ? ctx->Const.MaxDrawBuffers : 1;
+}
+
+
 /**
  * Set the separate blend source/dest factors for all draw buffers.
  *
@@ -202,9 +215,10 @@ void GLAPIENTRY
 _mesa_BlendFuncSeparate( GLenum sfactorRGB, GLenum dfactorRGB,
                             GLenum sfactorA, GLenum dfactorA )
 {
-   GLuint buf, numBuffers;
-   GLboolean changed;
    GET_CURRENT_CONTEXT(ctx);
+   const unsigned numBuffers = num_buffers(ctx);
+   unsigned buf;
+   bool changed = false;
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glBlendFuncSeparate %s %s %s %s\n",
@@ -213,28 +227,38 @@ _mesa_BlendFuncSeparate( GLenum sfactorRGB, GLenum dfactorRGB,
                   _mesa_enum_to_string(sfactorA),
                   _mesa_enum_to_string(dfactorA));
 
-   if (!validate_blend_factors(ctx, "glBlendFuncSeparate",
-                               sfactorRGB, dfactorRGB,
-                               sfactorA, dfactorA)) {
-      return;
+   /* Check if we're really changing any state.  If not, return early. */
+   if (ctx->Color._BlendFuncPerBuffer) {
+      /* Check all per-buffer states */
+      for (buf = 0; buf < numBuffers; buf++) {
+         if (ctx->Color.Blend[buf].SrcRGB != sfactorRGB ||
+             ctx->Color.Blend[buf].DstRGB != dfactorRGB ||
+             ctx->Color.Blend[buf].SrcA != sfactorA ||
+             ctx->Color.Blend[buf].DstA != dfactorA) {
+            changed = true;
+            break;
+         }
+      }
    }
-
-   numBuffers = ctx->Extensions.ARB_draw_buffers_blend
-      ? ctx->Const.MaxDrawBuffers : 1;
-
-   changed = GL_FALSE;
-   for (buf = 0; buf < numBuffers; buf++) {
-      if (ctx->Color.Blend[buf].SrcRGB != sfactorRGB ||
-          ctx->Color.Blend[buf].DstRGB != dfactorRGB ||
-          ctx->Color.Blend[buf].SrcA != sfactorA ||
-          ctx->Color.Blend[buf].DstA != dfactorA) {
-         changed = GL_TRUE;
-         break;
+   else {
+      /* only need to check 0th per-buffer state */
+      if (ctx->Color.Blend[0].SrcRGB != sfactorRGB ||
+          ctx->Color.Blend[0].DstRGB != dfactorRGB ||
+          ctx->Color.Blend[0].SrcA != sfactorA ||
+          ctx->Color.Blend[0].DstA != dfactorA) {
+         changed = true;
       }
    }
+
    if (!changed)
       return;
 
+   if (!validate_blend_factors(ctx, "glBlendFuncSeparate",
+                               sfactorRGB, dfactorRGB,
+                               sfactorA, dfactorA)) {
+      return;
+   }
+
    FLUSH_VERTICES(ctx, _NEW_COLOR);
 
    for (buf = 0; buf < numBuffers; buf++) {
@@ -242,8 +266,13 @@ _mesa_BlendFuncSeparate( GLenum sfactorRGB, GLenum dfactorRGB,
       ctx->Color.Blend[buf].DstRGB = dfactorRGB;
       ctx->Color.Blend[buf].SrcA = sfactorA;
       ctx->Color.Blend[buf].DstA = dfactorA;
-      update_uses_dual_src(ctx, buf);
    }
+
+   update_uses_dual_src(ctx, 0);
+   for (buf = 1; buf < numBuffers; buf++) {
+      ctx->Color.Blend[buf]._UsesDualSrc = ctx->Color.Blend[0]._UsesDualSrc;
+   }
+
    ctx->Color._BlendFuncPerBuffer = GL_FALSE;
 
    if (ctx->Driver.BlendFuncSeparate) {
@@ -283,18 +312,18 @@ _mesa_BlendFuncSeparateiARB(GLuint buf, GLenum sfactorRGB, GLenum dfactorRGB,
       return;
    }
 
-   if (!validate_blend_factors(ctx, "glBlendFuncSeparatei",
-                               sfactorRGB, dfactorRGB,
-                               sfactorA, dfactorA)) {
-      return;
-   }
-
    if (ctx->Color.Blend[buf].SrcRGB == sfactorRGB &&
        ctx->Color.Blend[buf].DstRGB == dfactorRGB &&
        ctx->Color.Blend[buf].SrcA == sfactorA &&
        ctx->Color.Blend[buf].DstA == dfactorA)
       return; /* no change */
 
+   if (!validate_blend_factors(ctx, "glBlendFuncSeparatei",
+                               sfactorRGB, dfactorRGB,
+                               sfactorA, dfactorA)) {
+      return;
+   }
+
    FLUSH_VERTICES(ctx, _NEW_COLOR);
 
    ctx->Color.Blend[buf].SrcRGB = sfactorRGB;
@@ -331,34 +360,43 @@ legal_blend_equation(const struct gl_context *ctx, GLenum mode)
 void GLAPIENTRY
 _mesa_BlendEquation( GLenum mode )
 {
-   GLuint buf, numBuffers;
-   GLboolean changed;
    GET_CURRENT_CONTEXT(ctx);
+   const unsigned numBuffers = num_buffers(ctx);
+   unsigned buf;
+   bool changed = false;
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glBlendEquation(%s)\n",
                   _mesa_enum_to_string(mode));
 
-   if (!legal_blend_equation(ctx, mode)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glBlendEquation");
-      return;
+   if (ctx->Color._BlendEquationPerBuffer) {
+      /* Check all per-buffer states */
+      for (buf = 0; buf < numBuffers; buf++) {
+         if (ctx->Color.Blend[buf].EquationRGB != mode ||
+             ctx->Color.Blend[buf].EquationA != mode) {
+            changed = true;
+            break;
+         }
+      }
    }
-
-   numBuffers = ctx->Extensions.ARB_draw_buffers_blend
-      ? ctx->Const.MaxDrawBuffers : 1;
-
-   changed = GL_FALSE;
-   for (buf = 0; buf < numBuffers; buf++) {
-      if (ctx->Color.Blend[buf].EquationRGB != mode ||
-          ctx->Color.Blend[buf].EquationA != mode) {
-         changed = GL_TRUE;
-         break;
+   else {
+      /* only need to check 0th per-buffer state */
+      if (ctx->Color.Blend[0].EquationRGB != mode ||
+          ctx->Color.Blend[0].EquationA != mode) {
+         changed = true;
       }
    }
+
    if (!changed)
       return;
 
+   if (!legal_blend_equation(ctx, mode)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glBlendEquation");
+      return;
+   }
+
    FLUSH_VERTICES(ctx, _NEW_COLOR);
+
    for (buf = 0; buf < numBuffers; buf++) {
       ctx->Color.Blend[buf].EquationRGB = mode;
       ctx->Color.Blend[buf].EquationA = mode;
@@ -383,7 +421,7 @@ _mesa_BlendEquationiARB(GLuint buf, GLenum mode)
                   buf, _mesa_enum_to_string(mode));
 
    if (buf >= ctx->Const.MaxDrawBuffers) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glBlendFuncSeparatei(buffer=%u)",
+      _mesa_error(ctx, GL_INVALID_VALUE, "glBlendEquationi(buffer=%u)",
                   buf);
       return;
    }
@@ -407,15 +445,37 @@ _mesa_BlendEquationiARB(GLuint buf, GLenum mode)
 void GLAPIENTRY
 _mesa_BlendEquationSeparate( GLenum modeRGB, GLenum modeA )
 {
-   GLuint buf, numBuffers;
-   GLboolean changed;
    GET_CURRENT_CONTEXT(ctx);
+   const unsigned numBuffers = num_buffers(ctx);
+   unsigned buf;
+   bool changed = false;
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glBlendEquationSeparateEXT(%s %s)\n",
                   _mesa_enum_to_string(modeRGB),
                   _mesa_enum_to_string(modeA));
 
+   if (ctx->Color._BlendEquationPerBuffer) {
+      /* Check all per-buffer states */
+      for (buf = 0; buf < numBuffers; buf++) {
+         if (ctx->Color.Blend[buf].EquationRGB != modeRGB ||
+             ctx->Color.Blend[buf].EquationA != modeA) {
+            changed = true;
+            break;
+         }
+      }
+   }
+   else {
+      /* only need to check 0th per-buffer state */
+      if (ctx->Color.Blend[0].EquationRGB != modeRGB ||
+          ctx->Color.Blend[0].EquationA != modeA) {
+         changed = true;
+      }
+   }
+
+   if (!changed)
+      return;
+
    if ( (modeRGB != modeA) && !ctx->Extensions.EXT_blend_equation_separate ) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
 		  "glBlendEquationSeparateEXT not supported by driver");
@@ -432,21 +492,8 @@ _mesa_BlendEquationSeparate( GLenum modeRGB, GLenum modeA )
       return;
    }
 
-   numBuffers = ctx->Extensions.ARB_draw_buffers_blend
-      ? ctx->Const.MaxDrawBuffers : 1;
-
-   changed = GL_FALSE;
-   for (buf = 0; buf < numBuffers; buf++) {
-      if (ctx->Color.Blend[buf].EquationRGB != modeRGB ||
-          ctx->Color.Blend[buf].EquationA != modeA) {
-         changed = GL_TRUE;
-         break;
-      }
-   }
-   if (!changed)
-      return;
-
    FLUSH_VERTICES(ctx, _NEW_COLOR);
+
    for (buf = 0; buf < numBuffers; buf++) {
       ctx->Color.Blend[buf].EquationRGB = modeRGB;
       ctx->Color.Blend[buf].EquationA = modeA;
diff --git a/src/mesa/main/es1_conversion.c b/src/mesa/main/es1_conversion.c
index b254a6ef1c7..1dfe8278e71 100644
--- a/src/mesa/main/es1_conversion.c
+++ b/src/mesa/main/es1_conversion.c
@@ -1,3 +1,4 @@
+
 #include <stdbool.h>
 
 #include "api_loopback.h"
@@ -326,7 +327,24 @@ _mesa_GetTexEnvxv(GLenum target, GLenum pname, GLfixed *params)
       }
       break;
    case GL_TEXTURE_ENV:
-      if (pname != GL_TEXTURE_ENV_COLOR && pname != GL_RGB_SCALE && pname != GL_ALPHA_SCALE && pname != GL_TEXTURE_ENV_MODE && pname != GL_COMBINE_RGB && pname != GL_COMBINE_ALPHA && pname != GL_SRC0_RGB && pname != GL_SRC1_RGB && pname != GL_SRC2_RGB && pname != GL_SRC0_ALPHA && pname != GL_SRC1_ALPHA && pname != GL_SRC2_ALPHA && pname != GL_OPERAND0_RGB && pname != GL_OPERAND1_RGB && pname != GL_OPERAND2_RGB && pname != GL_OPERAND0_ALPHA && pname != GL_OPERAND1_ALPHA && pname != GL_OPERAND2_ALPHA) {
+      if (pname != GL_TEXTURE_ENV_COLOR &&
+          pname != GL_RGB_SCALE &&
+          pname != GL_ALPHA_SCALE &&
+          pname != GL_TEXTURE_ENV_MODE &&
+          pname != GL_COMBINE_RGB &&
+          pname != GL_COMBINE_ALPHA &&
+          pname != GL_SRC0_RGB &&
+          pname != GL_SRC1_RGB &&
+          pname != GL_SRC2_RGB &&
+          pname != GL_SRC0_ALPHA &&
+          pname != GL_SRC1_ALPHA &&
+          pname != GL_SRC2_ALPHA &&
+          pname != GL_OPERAND0_RGB &&
+          pname != GL_OPERAND1_RGB &&
+          pname != GL_OPERAND2_RGB &&
+          pname != GL_OPERAND0_ALPHA &&
+          pname != GL_OPERAND1_ALPHA &&
+          pname != GL_OPERAND2_ALPHA) {
          _mesa_error(_mesa_get_current_context(), GL_INVALID_ENUM,
                      "glGetTexEnvxv(target=0x%x)", target);
          return;
diff --git a/src/mesa/main/ff_fragment_shader.cpp b/src/mesa/main/ff_fragment_shader.cpp
index e4e2a18c1da..e63d0f1ec55 100644
--- a/src/mesa/main/ff_fragment_shader.cpp
+++ b/src/mesa/main/ff_fragment_shader.cpp
@@ -40,7 +40,7 @@
 #include "glsl/ir_optimization.h"
 #include "glsl/glsl_parser_extras.h"
 #include "glsl/glsl_symbol_table.h"
-#include "glsl/glsl_types.h"
+#include "glsl/nir/glsl_types.h"
 #include "program/ir_to_mesa.h"
 #include "program/program.h"
 #include "program/programopt.h"
@@ -975,13 +975,11 @@ static void load_texture( texenv_fragment_program *p, GLuint unit )
 						      ir_var_uniform);
    p->top_instructions->push_head(sampler);
 
-   /* Set the texture unit for this sampler.  The linker will pick this value
-    * up and do-the-right-thing.
-    *
-    * NOTE: The cast to int is important.  Without it, the constant will have
-    * type uint, and things later on may get confused.
+   /* Set the texture unit for this sampler in the same way that
+    * layout(binding=X) would.
     */
-   sampler->constant_value = new(p->mem_ctx) ir_constant(int(unit));
+   sampler->data.explicit_binding = true;
+   sampler->data.binding = unit;
 
    deref = new(p->mem_ctx) ir_dereference_variable(sampler);
    tex->set_sampler(deref, glsl_type::vec4_type);
diff --git a/src/mesa/main/ffvertex_prog.c b/src/mesa/main/ffvertex_prog.c
index a6183b47e2e..34cc9218add 100644
--- a/src/mesa/main/ffvertex_prog.c
+++ b/src/mesa/main/ffvertex_prog.c
@@ -1690,11 +1690,10 @@ _mesa_get_fixed_func_vertex_program(struct gl_context *ctx)
                           ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].OptimizeForAOS,
                           ctx->Const.Program[MESA_SHADER_VERTEX].MaxTemps );
 
-#if 0
       if (ctx->Driver.ProgramStringNotify)
          ctx->Driver.ProgramStringNotify( ctx, GL_VERTEX_PROGRAM_ARB,
                                           &prog->Base );
-#endif
+
       _mesa_program_cache_insert(ctx, ctx->VertexProgram.Cache,
                                  &key, sizeof(key), &prog->Base);
    }
diff --git a/src/mesa/main/format_utils.h b/src/mesa/main/format_utils.h
index 618f43d0aaa..378997b38b2 100644
--- a/src/mesa/main/format_utils.h
+++ b/src/mesa/main/format_utils.h
@@ -34,6 +34,7 @@
 #include "imports.h"
 #include "macros.h"
 #include "util/rounding.h"
+#include "util/half_float.h"
 
 extern const mesa_array_format RGBA32_FLOAT;
 extern const mesa_array_format RGBA8_UBYTE;
diff --git a/src/mesa/main/imports.c b/src/mesa/main/imports.c
index 350e6752c8b..230ebbc67f4 100644
--- a/src/mesa/main/imports.c
+++ b/src/mesa/main/imports.c
@@ -307,154 +307,6 @@ _mesa_bitcount_64(uint64_t n)
 }
 #endif
 
-
-/**
- * Convert a 4-byte float to a 2-byte half float.
- *
- * Not all float32 values can be represented exactly as a float16 value. We
- * round such intermediate float32 values to the nearest float16. When the
- * float32 lies exactly between to float16 values, we round to the one with
- * an even mantissa.
- *
- * This rounding behavior has several benefits:
- *   - It has no sign bias.
- *
- *   - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
- *     GPU ISA.
- *
- *   - By reproducing the behavior of the GPU (at least on Intel hardware),
- *     compile-time evaluation of constant packHalf2x16 GLSL expressions will
- *     result in the same value as if the expression were executed on the GPU.
- */
-GLhalfARB
-_mesa_float_to_half(float val)
-{
-   const fi_type fi = {val};
-   const int flt_m = fi.i & 0x7fffff;
-   const int flt_e = (fi.i >> 23) & 0xff;
-   const int flt_s = (fi.i >> 31) & 0x1;
-   int s, e, m = 0;
-   GLhalfARB result;
-   
-   /* sign bit */
-   s = flt_s;
-
-   /* handle special cases */
-   if ((flt_e == 0) && (flt_m == 0)) {
-      /* zero */
-      /* m = 0; - already set */
-      e = 0;
-   }
-   else if ((flt_e == 0) && (flt_m != 0)) {
-      /* denorm -- denorm float maps to 0 half */
-      /* m = 0; - already set */
-      e = 0;
-   }
-   else if ((flt_e == 0xff) && (flt_m == 0)) {
-      /* infinity */
-      /* m = 0; - already set */
-      e = 31;
-   }
-   else if ((flt_e == 0xff) && (flt_m != 0)) {
-      /* NaN */
-      m = 1;
-      e = 31;
-   }
-   else {
-      /* regular number */
-      const int new_exp = flt_e - 127;
-      if (new_exp < -14) {
-         /* The float32 lies in the range (0.0, min_normal16) and is rounded
-          * to a nearby float16 value. The result will be either zero, subnormal,
-          * or normal.
-          */
-         e = 0;
-         m = _mesa_lroundevenf((1 << 24) * fabsf(fi.f));
-      }
-      else if (new_exp > 15) {
-         /* map this value to infinity */
-         /* m = 0; - already set */
-         e = 31;
-      }
-      else {
-         /* The float32 lies in the range
-          *   [min_normal16, max_normal16 + max_step16)
-          * and is rounded to a nearby float16 value. The result will be
-          * either normal or infinite.
-          */
-         e = new_exp + 15;
-         m = _mesa_lroundevenf(flt_m / (float) (1 << 13));
-      }
-   }
-
-   assert(0 <= m && m <= 1024);
-   if (m == 1024) {
-      /* The float32 was rounded upwards into the range of the next exponent,
-       * so bump the exponent. This correctly handles the case where f32
-       * should be rounded up to float16 infinity.
-       */
-      ++e;
-      m = 0;
-   }
-
-   result = (s << 15) | (e << 10) | m;
-   return result;
-}
-
-
-/**
- * Convert a 2-byte half float to a 4-byte float.
- * Based on code from:
- * http://www.opengl.org/discussion_boards/ubb/Forum3/HTML/008786.html
- */
-float
-_mesa_half_to_float(GLhalfARB val)
-{
-   /* XXX could also use a 64K-entry lookup table */
-   const int m = val & 0x3ff;
-   const int e = (val >> 10) & 0x1f;
-   const int s = (val >> 15) & 0x1;
-   int flt_m, flt_e, flt_s;
-   fi_type fi;
-   float result;
-
-   /* sign bit */
-   flt_s = s;
-
-   /* handle special cases */
-   if ((e == 0) && (m == 0)) {
-      /* zero */
-      flt_m = 0;
-      flt_e = 0;
-   }
-   else if ((e == 0) && (m != 0)) {
-      /* denorm -- denorm half will fit in non-denorm single */
-      const float half_denorm = 1.0f / 16384.0f; /* 2^-14 */
-      float mantissa = ((float) (m)) / 1024.0f;
-      float sign = s ? -1.0f : 1.0f;
-      return sign * mantissa * half_denorm;
-   }
-   else if ((e == 31) && (m == 0)) {
-      /* infinity */
-      flt_e = 0xff;
-      flt_m = 0;
-   }
-   else if ((e == 31) && (m != 0)) {
-      /* NaN */
-      flt_e = 0xff;
-      flt_m = 1;
-   }
-   else {
-      /* regular */
-      flt_e = e + 112;
-      flt_m = m << 13;
-   }
-
-   fi.i = (flt_s << 31) | (flt_e << 23) | flt_m;
-   result = fi.f;
-   return result;
-}
-
 /*@}*/
 
 
diff --git a/src/mesa/main/imports.h b/src/mesa/main/imports.h
index 90247587be3..042147fd8bb 100644
--- a/src/mesa/main/imports.h
+++ b/src/mesa/main/imports.h
@@ -396,13 +396,6 @@ _mesa_flsll(uint64_t n)
 #endif
 }
 
-
-extern GLhalfARB
-_mesa_float_to_half(float f);
-
-extern float
-_mesa_half_to_float(GLhalfARB h);
-
 static inline bool
 _mesa_half_is_negative(GLhalfARB h)
 {
diff --git a/src/mesa/main/matrix.c b/src/mesa/main/matrix.c
index 2b8016a4a72..5ff5ac5bfe1 100644
--- a/src/mesa/main/matrix.c
+++ b/src/mesa/main/matrix.c
@@ -151,7 +151,6 @@ _mesa_MatrixMode( GLenum mode )
 
    if (ctx->Transform.MatrixMode == mode && mode != GL_TEXTURE)
       return;
-   FLUSH_VERTICES(ctx, _NEW_TRANSFORM);
 
    switch (mode) {
    case GL_MODELVIEW:
diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index ab16c2854a8..50469956c6e 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -37,6 +37,7 @@
 #include "texstore.h"
 #include "image.h"
 #include "macros.h"
+#include "util/half_float.h"
 #include "../../gallium/auxiliary/util/u_format_rgb9e5.h"
 #include "../../gallium/auxiliary/util/u_format_r11g11b10f.h"
 
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index cbfb15522f0..e57b98a412d 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -42,7 +42,7 @@
 #include "main/config.h"
 #include "glapi/glapi.h"
 #include "math/m_matrix.h"	/* GLmatrix */
-#include "glsl/shader_enums.h"
+#include "glsl/nir/shader_enums.h"
 #include "main/formats.h"       /* MESA_FORMAT_COUNT */
 
 
@@ -94,11 +94,6 @@ struct vbo_context;
 #define PRIM_OUTSIDE_BEGIN_END   (PRIM_MAX + 1)
 #define PRIM_UNKNOWN             (PRIM_MAX + 2)
 
-#define VARYING_SLOT_MAX	(VARYING_SLOT_VAR0 + MAX_VARYING)
-#define VARYING_SLOT_PATCH0	(VARYING_SLOT_MAX)
-#define VARYING_SLOT_TESS_MAX	(VARYING_SLOT_PATCH0 + MAX_VARYING)
-#define FRAG_RESULT_MAX		(FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS)
-
 /**
  * Determine if the given gl_varying_slot appears in the fragment shader.
  */
@@ -487,26 +482,24 @@ struct gl_colorbuffer_attrib
 struct gl_current_attrib
 {
    /**
-    * \name Current vertex attributes.
+    * \name Current vertex attributes (color, texcoords, etc).
     * \note Values are valid only after FLUSH_VERTICES has been called.
     * \note Index and Edgeflag current values are stored as floats in the 
     * SIX and SEVEN attribute slots.
+    * \note We need double storage for 64-bit vertex attributes
     */
-   /* we need double storage for this for vertex attrib 64bit */
-   GLfloat Attrib[VERT_ATTRIB_MAX][4*2];	/**< Position, color, texcoords, etc */
+   GLfloat Attrib[VERT_ATTRIB_MAX][4*2];
 
    /**
-    * \name Current raster position attributes (always valid).
-    * \note This set of attributes is very similar to the SWvertex struct.
+    * \name Current raster position attributes (always up to date after a
+    * glRasterPos call).
     */
-   /*@{*/
    GLfloat RasterPos[4];
    GLfloat RasterDistance;
    GLfloat RasterColor[4];
    GLfloat RasterSecondaryColor[4];
    GLfloat RasterTexCoords[MAX_TEXTURE_COORD_UNITS][4];
    GLboolean RasterPosValid;
-   /*@}*/
 };
 
 
@@ -1866,24 +1859,6 @@ typedef enum
 
 
 /**
- * \brief Layout qualifiers for gl_FragDepth.
- *
- * Extension AMD_conservative_depth allows gl_FragDepth to be redeclared with
- * a layout qualifier.
- *
- * \see enum ir_depth_layout
- */
-enum gl_frag_depth_layout
-{
-   FRAG_DEPTH_LAYOUT_NONE, /**< No layout is specified. */
-   FRAG_DEPTH_LAYOUT_ANY,
-   FRAG_DEPTH_LAYOUT_GREATER,
-   FRAG_DEPTH_LAYOUT_LESS,
-   FRAG_DEPTH_LAYOUT_UNCHANGED
-};
-
-
-/**
  * Base class for any kind of program object
  */
 struct gl_program
@@ -2286,12 +2261,34 @@ struct gl_shader
    unsigned num_combined_uniform_components;
 
    /**
-    * This shader's uniform block information.
+    * This shader's uniform/ssbo block information.
     *
     * These fields are only set post-linking.
+    *
+    * BufferInterfaceBlocks is a list containing both UBOs and SSBOs. This is
+    * useful during the linking process so that we don't have to handle SSBOs
+    * specifically.
+    *
+    * UniformBlocks is a list of UBOs. This is useful for backends that need
+    * or prefer to see separate index spaces for UBOs and SSBOs like the GL
+    * API specifies.
+    *
+    * ShaderStorageBlocks is a list of SSBOs. This is useful for backends that
+    * need or prefer to see separate index spaces for UBOs and SSBOs like the
+    * GL API specifies.
+    *
+    * UniformBlocks and ShaderStorageBlocks only have pointers into
+    * BufferInterfaceBlocks so the actual resource information is not
+    * duplicated.
     */
+   unsigned NumBufferInterfaceBlocks;
+   struct gl_uniform_block *BufferInterfaceBlocks;
+
    unsigned NumUniformBlocks;
-   struct gl_uniform_block *UniformBlocks;
+   struct gl_uniform_block **UniformBlocks;
+
+   unsigned NumShaderStorageBlocks;
+   struct gl_uniform_block **ShaderStorageBlocks;
 
    struct exec_list *ir;
    struct exec_list *packed_varyings;
@@ -2694,8 +2691,33 @@ struct gl_shader_program
     */
    unsigned LastClipDistanceArraySize;
 
+   /**
+    * This shader's uniform/ssbo block information.
+    *
+    * BufferInterfaceBlocks is a list containing both UBOs and SSBOs. This is
+    * useful during the linking process so that we don't have to handle SSBOs
+    * specifically.
+    *
+    * UniformBlocks is a list of UBOs. This is useful for backends that need
+    * or prefer to see separate index spaces for UBOs and SSBOs like the GL
+    * API specifies.
+    *
+    * ShaderStorageBlocks is a list of SSBOs. This is useful for backends that
+    * need or prefer to see separate index spaces for UBOs and SSBOs like the
+    * GL API specifies.
+    *
+    * UniformBlocks and ShaderStorageBlocks only have pointers into
+    * BufferInterfaceBlocks so the actual resource information is not
+    * duplicated; both are only set after linking.
+    */
    unsigned NumBufferInterfaceBlocks;
-   struct gl_uniform_block *UniformBlocks;
+   struct gl_uniform_block *BufferInterfaceBlocks;
+
+   unsigned NumUniformBlocks;
+   struct gl_uniform_block **UniformBlocks;
+
+   unsigned NumShaderStorageBlocks;
+   struct gl_uniform_block **ShaderStorageBlocks;
 
    /**
     * Indices into the _LinkedShaders's UniformBlocks[] array for each stage
@@ -4076,13 +4098,6 @@ struct gl_image_unit
    GLboolean Layered;
 
    /**
-    * GL_TRUE if the state of this image unit is valid and access from
-    * the shader is allowed.  Otherwise loads from this unit should
-    * return zero and stores should have no effect.
-    */
-   GLboolean _Valid;
-
-   /**
     * Layer of the texture object bound to this unit as specified by the
     * application.
     */
diff --git a/src/mesa/main/pack.c b/src/mesa/main/pack.c
index 00e31b05c99..89faf515443 100644
--- a/src/mesa/main/pack.c
+++ b/src/mesa/main/pack.c
@@ -1073,6 +1073,21 @@ _mesa_pack_depth_span( struct gl_context *ctx, GLuint n, GLvoid *dest,
          }
       }
       break;
+   case GL_UNSIGNED_INT_24_8:
+      {
+         const GLdouble scale = (GLdouble) 0xffffff;
+         GLuint *dst = (GLuint *) dest;
+         GLuint i;
+         for (i = 0; i < n; i++) {
+            GLuint z = (GLuint) (depthSpan[i] * scale);
+            assert(z <= 0xffffff);
+            dst[i] = (z << 8);
+         }
+         if (dstPacking->SwapBytes) {
+            _mesa_swap4( (GLuint *) dst, n );
+         }
+         break;
+      }
    case GL_UNSIGNED_INT:
       {
          GLuint *dst = (GLuint *) dest;
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index 6d73e3bdcf2..8182d3dcc04 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -485,8 +485,14 @@ _mesa_program_resource_array_size(struct gl_program_resource *res)
    case GL_COMPUTE_SUBROUTINE_UNIFORM:
    case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
    case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
-   case GL_BUFFER_VARIABLE:
       return RESOURCE_UNI(res)->array_elements;
+   case GL_BUFFER_VARIABLE:
+      /* Unsized arrays */
+      if (RESOURCE_UNI(res)->array_stride > 0 &&
+          RESOURCE_UNI(res)->array_elements == 0)
+         return 1;
+      else
+         return RESOURCE_UNI(res)->array_elements;
    case GL_VERTEX_SUBROUTINE:
    case GL_GEOMETRY_SUBROUTINE:
    case GL_FRAGMENT_SUBROUTINE:
@@ -833,193 +839,6 @@ program_resource_location(struct gl_shader_program *shProg,
    }
 }
 
-static char*
-get_top_level_name(const char *name)
-{
-   const char *first_dot = strchr(name, '.');
-   const char *first_square_bracket = strchr(name, '[');
-   int name_size = 0;
-   /* From ARB_program_interface_query spec:
-    *
-    * "For the property TOP_LEVEL_ARRAY_SIZE, a single integer identifying the
-    *  number of active array elements of the top-level shader storage block
-    *  member containing to the active variable is written to <params>.  If the
-    *  top-level block member is not declared as an array, the value one is
-    *  written to <params>.  If the top-level block member is an array with no
-    *  declared size, the value zero is written to <params>.
-    */
-
-   /* The buffer variable is on top level.*/
-   if (!first_square_bracket && !first_dot)
-      name_size = strlen(name);
-   else if ((!first_square_bracket ||
-            (first_dot && first_dot < first_square_bracket)))
-      name_size = first_dot - name;
-   else
-      name_size = first_square_bracket - name;
-
-   return strndup(name, name_size);
-}
-
-static char*
-get_var_name(const char *name)
-{
-   const char *first_dot = strchr(name, '.');
-
-   if (!first_dot)
-      return strdup(name);
-
-   return strndup(first_dot+1, strlen(first_dot) - 1);
-}
-
-static GLint
-program_resource_top_level_array_size(struct gl_shader_program *shProg,
-                                      struct gl_program_resource *res,
-                                      const char *name)
-{
-   int block_index = RESOURCE_UNI(res)->block_index;
-   int array_size = -1;
-   char *var_name = get_top_level_name(name);
-   char *interface_name =
-      get_top_level_name(shProg->UniformBlocks[block_index].Name);
-
-   if (strcmp(var_name, interface_name) == 0) {
-      /* Deal with instanced array of SSBOs */
-      char *temp_name = get_var_name(name);
-      free(var_name);
-      var_name = get_top_level_name(temp_name);
-      free(temp_name);
-   }
-
-   for (unsigned i = 0; i < shProg->NumShaders; i++) {
-      if (shProg->Shaders[i] == NULL)
-         continue;
-
-      const gl_shader *stage = shProg->Shaders[i];
-      foreach_in_list(ir_instruction, node, stage->ir) {
-         ir_variable *var = node->as_variable();
-         if (!var || !var->get_interface_type() ||
-             var->data.mode != ir_var_shader_storage)
-            continue;
-
-         const glsl_type *interface = var->get_interface_type();
-
-         if (strcmp(interface_name, interface->name) != 0)
-            continue;
-
-         for (unsigned i = 0; i < interface->length; i++) {
-            const glsl_struct_field *field = &interface->fields.structure[i];
-            if (strcmp(field->name, var_name) != 0)
-               continue;
-            /* From GL_ARB_program_interface_query spec:
-             *
-             * "For the property TOP_LEVEL_ARRAY_SIZE, a single integer
-             * identifying the number of active array elements of the top-level
-             * shader storage block member containing to the active variable is
-             * written to <params>.  If the top-level block member is not
-             * declared as an array, the value one is written to <params>.  If
-             * the top-level block member is an array with no declared size,
-             * the value zero is written to <params>.
-             */
-            if (field->type->is_unsized_array())
-               array_size = 0;
-            else if (field->type->is_array())
-               array_size = field->type->length;
-            else
-               array_size = 1;
-            goto found_top_level_array_size;
-         }
-      }
-   }
-found_top_level_array_size:
-   free(interface_name);
-   free(var_name);
-   return array_size;
-}
-
-static GLint
-program_resource_top_level_array_stride(struct gl_shader_program *shProg,
-                                        struct gl_program_resource *res,
-                                        const char *name)
-{
-   int block_index = RESOURCE_UNI(res)->block_index;
-   int array_stride = -1;
-   char *var_name = get_top_level_name(name);
-   char *interface_name =
-      get_top_level_name(shProg->UniformBlocks[block_index].Name);
-
-   if (strcmp(var_name, interface_name) == 0) {
-      /* Deal with instanced array of SSBOs */
-      char *temp_name = get_var_name(name);
-      free(var_name);
-      var_name = get_top_level_name(temp_name);
-      free(temp_name);
-   }
-
-   for (unsigned i = 0; i < shProg->NumShaders; i++) {
-      if (shProg->Shaders[i] == NULL)
-         continue;
-
-      const gl_shader *stage = shProg->Shaders[i];
-      foreach_in_list(ir_instruction, node, stage->ir) {
-         ir_variable *var = node->as_variable();
-         if (!var || !var->get_interface_type() ||
-             var->data.mode != ir_var_shader_storage)
-            continue;
-
-         const glsl_type *interface = var->get_interface_type();
-
-         if (strcmp(interface_name, interface->name) != 0) {
-            continue;
-         }
-
-         for (unsigned i = 0; i < interface->length; i++) {
-            const glsl_struct_field *field = &interface->fields.structure[i];
-            if (strcmp(field->name, var_name) != 0)
-               continue;
-            /* From GL_ARB_program_interface_query:
-             *
-             * "For the property TOP_LEVEL_ARRAY_STRIDE, a single integer
-             *  identifying the stride between array elements of the top-level
-             *  shader storage block member containing the active variable is
-             *  written to <params>.  For top-level block members declared as
-             *  arrays, the value written is the difference, in basic machine
-             *  units, between the offsets of the active variable for
-             *  consecutive elements in the top-level array.  For top-level
-             *  block members not declared as an array, zero is written to
-             *  <params>."
-             */
-            if (field->type->is_array()) {
-               const enum glsl_matrix_layout matrix_layout =
-                  glsl_matrix_layout(field->matrix_layout);
-               bool row_major = matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR;
-               const glsl_type *array_type = field->type->fields.array;
-
-               if (interface->interface_packing != GLSL_INTERFACE_PACKING_STD430) {
-                  if (array_type->is_record() || array_type->is_array()) {
-                     array_stride = array_type->std140_size(row_major);
-                     array_stride = glsl_align(array_stride, 16);
-                  } else {
-                     unsigned element_base_align = 0;
-                     element_base_align = array_type->std140_base_alignment(row_major);
-                     array_stride = MAX2(element_base_align, 16);
-                  }
-               } else {
-                  array_stride = array_type->std430_array_stride(row_major);
-               }
-            } else {
-               array_stride = 0;
-            }
-            goto found_top_level_array_size;
-         }
-      }
-   }
-found_top_level_array_size:
-   free(interface_name);
-   free(var_name);
-   return array_stride;
-}
-
 /**
  * Function implements following location queries:
  *    glGetUniformLocation
@@ -1133,7 +952,8 @@ get_buffer_property(struct gl_shader_program *shProg,
             (*val)++;
          }
          return 1;
-      case GL_ACTIVE_VARIABLES:
+      case GL_ACTIVE_VARIABLES: {
+         unsigned num_values = 0;
          for (unsigned i = 0; i < RESOURCE_UBO(res)->NumUniforms; i++) {
             const char *iname = RESOURCE_UBO(res)->Uniforms[i].IndexName;
             struct gl_program_resource *uni =
@@ -1143,8 +963,10 @@ get_buffer_property(struct gl_shader_program *shProg,
                continue;
             *val++ =
                _mesa_program_resource_index(shProg, uni);
+            num_values++;
          }
-         return RESOURCE_UBO(res)->NumUniforms;
+         return num_values;
+      }
       }
    } else if (res->Type == GL_SHADER_STORAGE_BLOCK) {
       switch (prop) {
@@ -1166,7 +988,8 @@ get_buffer_property(struct gl_shader_program *shProg,
             (*val)++;
          }
          return 1;
-      case GL_ACTIVE_VARIABLES:
+      case GL_ACTIVE_VARIABLES: {
+         unsigned num_values = 0;
          for (unsigned i = 0; i < RESOURCE_UBO(res)->NumUniforms; i++) {
             const char *iname = RESOURCE_UBO(res)->Uniforms[i].IndexName;
             struct gl_program_resource *uni =
@@ -1176,8 +999,10 @@ get_buffer_property(struct gl_shader_program *shProg,
                continue;
             *val++ =
                _mesa_program_resource_index(shProg, uni);
+            num_values++;
          }
-         return RESOURCE_UBO(res)->NumUniforms;
+         return num_values;
+      }
       }
    } else if (res->Type == GL_ATOMIC_COUNTER_BUFFER) {
       switch (prop) {
@@ -1251,8 +1076,15 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
       switch (res->Type) {
       case GL_UNIFORM:
       case GL_BUFFER_VARIABLE:
+         /* Test if a buffer variable is an array or an unsized array.
+          * Unsized arrays return zero as array size.
+          */
+         if (RESOURCE_UNI(res)->is_shader_storage &&
+             RESOURCE_UNI(res)->array_stride > 0)
+            *val = RESOURCE_UNI(res)->array_elements;
+         else
             *val = MAX2(RESOURCE_UNI(res)->array_elements, 1);
-            return 1;
+         return 1;
       case GL_PROGRAM_INPUT:
       case GL_PROGRAM_OUTPUT:
          *val = MAX2(_mesa_program_resource_array_size(res), 1);
@@ -1374,14 +1206,12 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
 
    case GL_TOP_LEVEL_ARRAY_SIZE:
       VALIDATE_TYPE(GL_BUFFER_VARIABLE);
-      *val = program_resource_top_level_array_size(shProg, res,
-                                                   _mesa_program_resource_name(res));
+      *val = RESOURCE_UNI(res)->top_level_array_size;
       return 1;
 
    case GL_TOP_LEVEL_ARRAY_STRIDE:
       VALIDATE_TYPE(GL_BUFFER_VARIABLE);
-      *val = program_resource_top_level_array_stride(shProg, res,
-                                                     _mesa_program_resource_name(res));
+      *val = RESOURCE_UNI(res)->top_level_array_stride;
       return 1;
 
    /* GL_ARB_tessellation_shader */
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 9dd1054c8ee..18e463d4ccc 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -713,10 +713,10 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname,
       if (!has_ubo)
          break;
 
-      for (i = 0; i < shProg->NumBufferInterfaceBlocks; i++) {
+      for (i = 0; i < shProg->NumUniformBlocks; i++) {
 	 /* Add one for the terminating NUL character.
 	  */
-	 const GLint len = strlen(shProg->UniformBlocks[i].Name) + 1;
+	 const GLint len = strlen(shProg->UniformBlocks[i]->Name) + 1;
 
 	 if (len > max_len)
 	    max_len = len;
@@ -729,11 +729,7 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname,
       if (!has_ubo)
          break;
 
-      *params = 0;
-      for (unsigned i = 0; i < shProg->NumBufferInterfaceBlocks; i++) {
-         if (!shProg->UniformBlocks[i].IsShaderStorage)
-            (*params)++;
-      }
+      *params = shProg->NumUniformBlocks;
       return;
    case GL_PROGRAM_BINARY_RETRIEVABLE_HINT:
       /* This enum isn't part of the OES extension for OpenGL ES 2.0.  It is
diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c
index bd4b7c7be3b..c4ebf4201fb 100644
--- a/src/mesa/main/shaderimage.c
+++ b/src/mesa/main/shaderimage.c
@@ -415,8 +415,8 @@ _mesa_init_image_units(struct gl_context *ctx)
       ctx->ImageUnits[i] = _mesa_default_image_unit(ctx);
 }
 
-static GLboolean
-validate_image_unit(struct gl_context *ctx, struct gl_image_unit *u)
+GLboolean
+_mesa_is_image_unit_valid(struct gl_context *ctx, struct gl_image_unit *u)
 {
    struct gl_texture_object *t = u->TexObj;
    mesa_format tex_format;
@@ -424,7 +424,8 @@ validate_image_unit(struct gl_context *ctx, struct gl_image_unit *u)
    if (!t)
       return GL_FALSE;
 
-   _mesa_test_texobj_completeness(ctx, t);
+   if (!t->_BaseComplete && !t->_MipmapComplete)
+       _mesa_test_texobj_completeness(ctx, t);
 
    if (u->Level < t->BaseLevel ||
        u->Level > t->_MaxLevel ||
@@ -473,17 +474,6 @@ validate_image_unit(struct gl_context *ctx, struct gl_image_unit *u)
    return GL_TRUE;
 }
 
-void
-_mesa_validate_image_units(struct gl_context *ctx)
-{
-   unsigned i;
-
-   for (i = 0; i < ctx->Const.MaxImageUnits; ++i) {
-      struct gl_image_unit *u = &ctx->ImageUnits[i];
-      u->_Valid = validate_image_unit(ctx, u);
-   }
-}
-
 static GLboolean
 validate_bind_image_texture(struct gl_context *ctx, GLuint unit,
                             GLuint texture, GLint level, GLboolean layered,
@@ -567,7 +557,6 @@ _mesa_BindImageTexture(GLuint unit, GLuint texture, GLint level,
    u->Access = access;
    u->Format = format;
    u->_ActualFormat = _mesa_get_shader_image_format(format);
-   u->_Valid = validate_image_unit(ctx, u);
 
    if (u->TexObj && _mesa_tex_target_is_layered(u->TexObj->Target)) {
       u->Layered = layered;
@@ -703,7 +692,6 @@ _mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures)
          u->Access = GL_READ_WRITE;
          u->Format = tex_format;
          u->_ActualFormat = _mesa_get_shader_image_format(tex_format);
-         u->_Valid = validate_image_unit(ctx, u);
       } else {
          /* Unbind the texture from the unit */
          _mesa_reference_texobj(&u->TexObj, NULL);
@@ -713,7 +701,6 @@ _mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures)
          u->Access = GL_READ_ONLY;
          u->Format = GL_R8;
          u->_ActualFormat = MESA_FORMAT_R_UNORM8;
-         u->_Valid = GL_FALSE;
       }
    }
 
diff --git a/src/mesa/main/shaderimage.h b/src/mesa/main/shaderimage.h
index bbe088a2459..94ee814a716 100644
--- a/src/mesa/main/shaderimage.h
+++ b/src/mesa/main/shaderimage.h
@@ -55,13 +55,15 @@ void
 _mesa_init_image_units(struct gl_context *ctx);
 
 /**
- * Recalculate the \c _Valid flag of a context's shader image units.
+ * Return GL_TRUE if the state of the image unit passed as argument is valid
+ * and access from the shader is allowed.  Otherwise loads from this unit
+ * should return zero and stores should have no effect.
  *
- * To be called when the state of any texture bound to an image unit
- * changes.
+ * The result depends on context state other than the passed image unit, part
+ * of the _NEW_TEXTURE set.
  */
-void
-_mesa_validate_image_units(struct gl_context *ctx);
+GLboolean
+_mesa_is_image_unit_valid(struct gl_context *ctx, struct gl_image_unit *u);
 
 void GLAPIENTRY
 _mesa_BindImageTexture(GLuint unit, GLuint texture, GLint level,
diff --git a/src/mesa/main/shaderobj.c b/src/mesa/main/shaderobj.c
index 4e85fda24b4..ffc71931fec 100644
--- a/src/mesa/main/shaderobj.c
+++ b/src/mesa/main/shaderobj.c
@@ -290,8 +290,8 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
    ralloc_free(shProg->InfoLog);
    shProg->InfoLog = ralloc_strdup(shProg, "");
 
-   ralloc_free(shProg->UniformBlocks);
-   shProg->UniformBlocks = NULL;
+   ralloc_free(shProg->BufferInterfaceBlocks);
+   shProg->BufferInterfaceBlocks = NULL;
    shProg->NumBufferInterfaceBlocks = 0;
    for (i = 0; i < MESA_SHADER_STAGES; i++) {
       ralloc_free(shProg->UniformBlockStageIndex[i]);
diff --git a/src/mesa/main/shared.c b/src/mesa/main/shared.c
index 1acaf59f432..c37b31d1753 100644
--- a/src/mesa/main/shared.c
+++ b/src/mesa/main/shared.c
@@ -107,6 +107,11 @@ _mesa_alloc_shared_state(struct gl_context *ctx)
       };
       STATIC_ASSERT(ARRAY_SIZE(targets) == NUM_TEXTURE_TARGETS);
       shared->DefaultTex[i] = ctx->Driver.NewTextureObject(ctx, 0, targets[i]);
+      /* Need to explicitly set/overwrite the TargetIndex field here since
+       * the call to _mesa_tex_target_to_index() in NewTextureObject() may
+       * fail if the texture target is not supported.
+       */
+      shared->DefaultTex[i]->TargetIndex = i;
    }
 
    /* sanity check */
diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index d3b1c72b08d..4043c4f2057 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -391,8 +391,12 @@ _mesa_update_state_locked( struct gl_context *ctx )
    GLbitfield new_state = ctx->NewState;
    GLbitfield prog_flags = _NEW_PROGRAM;
    GLbitfield new_prog_state = 0x0;
+   const GLbitfield computed_states = ~(_NEW_CURRENT_ATTRIB | _NEW_LINE);
 
-   if (new_state == _NEW_CURRENT_ATTRIB) 
+   /* we can skip a bunch of state validation checks if the dirty
+    * state does not match any of the bits in 'computed_states'.
+    */
+   if ((new_state & computed_states) == 0)
       goto out;
 
    if (MESA_VERBOSE & VERBOSE_STATE)
diff --git a/src/mesa/main/texcompress_bptc.c b/src/mesa/main/texcompress_bptc.c
index f0f6553a01b..26e59158007 100644
--- a/src/mesa/main/texcompress_bptc.c
+++ b/src/mesa/main/texcompress_bptc.c
@@ -30,6 +30,7 @@
 #include "texcompress.h"
 #include "texcompress_bptc.h"
 #include "util/format_srgb.h"
+#include "util/half_float.h"
 #include "texstore.h"
 #include "macros.h"
 #include "image.h"
diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c
index 173e43c817c..547055ecf39 100644
--- a/src/mesa/main/texobj.c
+++ b/src/mesa/main/texobj.c
@@ -286,6 +286,12 @@ _mesa_initialize_texture_object( struct gl_context *ctx,
    obj->RefCount = 1;
    obj->Name = name;
    obj->Target = target;
+   if (target != 0) {
+      obj->TargetIndex = _mesa_tex_target_to_index(ctx, target);
+   }
+   else {
+      obj->TargetIndex = NUM_TEXTURE_TARGETS; /* invalid/error value */
+   }
    obj->Priority = 1.0F;
    obj->BaseLevel = 0;
    obj->MaxLevel = 1000;
@@ -340,6 +346,10 @@ finish_texture_init(struct gl_context *ctx, GLenum target,
    GLenum filter = GL_LINEAR;
    assert(obj->Target == 0);
 
+   obj->Target = target;
+   obj->TargetIndex = _mesa_tex_target_to_index(ctx, target);
+   assert(obj->TargetIndex < NUM_TEXTURE_TARGETS);
+
    switch (target) {
       case GL_TEXTURE_2D_MULTISAMPLE:
       case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
@@ -1185,46 +1195,26 @@ invalidate_tex_image_error_check(struct gl_context *ctx, GLuint texture,
    return t;
 }
 
-/**
- * Wrapper for the driver function. Need this because _mesa_new_texture_object
- * permits a target of 0 and does not initialize targetIndex.
- */
-struct gl_texture_object *
-_mesa_create_nameless_texture(struct gl_context *ctx, GLenum target)
-{
-   struct gl_texture_object *texObj = NULL;
-   GLint targetIndex;
-
-   if (target == 0)
-      return texObj;
-
-   texObj = ctx->Driver.NewTextureObject(ctx, 0, target);
-   targetIndex = _mesa_tex_target_to_index(ctx, texObj->Target);
-   assert(targetIndex < NUM_TEXTURE_TARGETS);
-   texObj->TargetIndex = targetIndex;
-
-   return texObj;
-}
 
 /**
  * Helper function for glCreateTextures and glGenTextures. Need this because
  * glCreateTextures should throw errors if target = 0. This is not exposed to
  * the rest of Mesa to encourage Mesa internals to use nameless textures,
  * which do not require expensive hash lookups.
+ * \param target  either 0 or a valid / error-checked texture target enum
  */
 static void
 create_textures(struct gl_context *ctx, GLenum target,
-                GLsizei n, GLuint *textures, bool dsa)
+                GLsizei n, GLuint *textures, const char *caller)
 {
    GLuint first;
    GLint i;
-   const char *func = dsa ? "Create" : "Gen";
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "gl%sTextures %d\n", func, n);
+      _mesa_debug(ctx, "%s %d\n", caller, n);
 
    if (n < 0) {
-      _mesa_error( ctx, GL_INVALID_VALUE, "gl%sTextures(n < 0)", func );
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(n < 0)", caller);
       return;
    }
 
@@ -1241,28 +1231,14 @@ create_textures(struct gl_context *ctx, GLenum target,
    /* Allocate new, empty texture objects */
    for (i = 0; i < n; i++) {
       struct gl_texture_object *texObj;
-      GLint targetIndex;
       GLuint name = first + i;
       texObj = ctx->Driver.NewTextureObject(ctx, name, target);
       if (!texObj) {
          mtx_unlock(&ctx->Shared->Mutex);
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "gl%sTextures", func);
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "%s", caller);
          return;
       }
 
-      /* Initialize the target index if target is non-zero. */
-      if (target != 0) {
-         targetIndex = _mesa_tex_target_to_index(ctx, texObj->Target);
-         if (targetIndex < 0) { /* Bad Target */
-            mtx_unlock(&ctx->Shared->Mutex);
-            _mesa_error(ctx, GL_INVALID_ENUM, "gl%sTextures(target = %s)",
-                        func, _mesa_enum_to_string(texObj->Target));
-            return;
-         }
-         assert(targetIndex < NUM_TEXTURE_TARGETS);
-         texObj->TargetIndex = targetIndex;
-      }
-
       /* insert into hash table */
       _mesa_HashInsert(ctx->Shared->TexObjects, texObj->Name, texObj);
 
@@ -1296,7 +1272,7 @@ void GLAPIENTRY
 _mesa_GenTextures(GLsizei n, GLuint *textures)
 {
    GET_CURRENT_CONTEXT(ctx);
-   create_textures(ctx, 0, n, textures, false);
+   create_textures(ctx, 0, n, textures, "glGenTextures");
 }
 
 /**
@@ -1329,7 +1305,7 @@ _mesa_CreateTextures(GLenum target, GLsizei n, GLuint *textures)
       return;
    }
 
-   create_textures(ctx, target, n, textures, true);
+   create_textures(ctx, target, n, textures, "glCreateTextures");
 }
 
 /**
@@ -1383,8 +1359,12 @@ unbind_texobj_from_texunits(struct gl_context *ctx,
    const gl_texture_index index = texObj->TargetIndex;
    GLuint u;
 
-   if (texObj->Target == 0)
+   if (texObj->Target == 0) {
+      /* texture was never bound */
       return;
+   }
+
+   assert(index < NUM_TEXTURE_TARGETS);
 
    for (u = 0; u < ctx->Texture.NumCurrentTexUsed; u++) {
       struct gl_texture_unit *unit = &ctx->Texture.Unit[u];
@@ -1752,10 +1732,11 @@ _mesa_BindTexture( GLenum target, GLuint texName )
          _mesa_HashInsert(ctx->Shared->TexObjects, texName, newTexObj);
          mtx_unlock(&ctx->Shared->Mutex);
       }
-      newTexObj->Target = target;
-      newTexObj->TargetIndex = targetIndex;
    }
 
+   assert(newTexObj->Target == target);
+   assert(newTexObj->TargetIndex == targetIndex);
+
    bind_texture(ctx, ctx->Texture.CurrentUnit, newTexObj);
 }
 
@@ -1778,19 +1759,12 @@ _mesa_BindTextureUnit(GLuint unit, GLuint texture)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct gl_texture_object *texObj;
-   struct gl_texture_unit *texUnit;
 
    if (unit >= _mesa_max_tex_unit(ctx)) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glBindTextureUnit(unit=%u)", unit);
       return;
    }
 
-   texUnit = _mesa_get_tex_unit(ctx, unit);
-   assert(texUnit);
-   if (!texUnit) {
-      return;
-   }
-
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glBindTextureUnit %s %d\n",
                   _mesa_enum_to_string(GL_TEXTURE0+unit), (GLint) texture);
@@ -1812,7 +1786,7 @@ _mesa_BindTextureUnit(GLuint unit, GLuint texture)
    /* Error checking */
    if (!texObj) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-         "glBindTextureUnit(non-gen name)");
+                  "glBindTextureUnit(non-gen name)");
       return;
    }
    if (texObj->Target == 0) {
diff --git a/src/mesa/main/texobj.h b/src/mesa/main/texobj.h
index 690878c85fc..8421337de4d 100644
--- a/src/mesa/main/texobj.h
+++ b/src/mesa/main/texobj.h
@@ -202,9 +202,6 @@ _mesa_unlock_context_textures( struct gl_context *ctx );
 extern void
 _mesa_lock_context_textures( struct gl_context *ctx );
 
-extern struct gl_texture_object *
-_mesa_create_nameless_texture(struct gl_context *ctx, GLenum target);
-
 extern void
 _mesa_delete_nameless_texture(struct gl_context *ctx,
                               struct gl_texture_object *texObj);
diff --git a/src/mesa/main/texstate.c b/src/mesa/main/texstate.c
index 9b5928c4306..cb147fac476 100644
--- a/src/mesa/main/texstate.c
+++ b/src/mesa/main/texstate.c
@@ -34,7 +34,6 @@
 #include "context.h"
 #include "enums.h"
 #include "macros.h"
-#include "shaderimage.h"
 #include "texobj.h"
 #include "teximage.h"
 #include "texstate.h"
@@ -741,8 +740,6 @@ update_texture_state( struct gl_context *ctx )
 
    if (!prog[MESA_SHADER_FRAGMENT] || !prog[MESA_SHADER_VERTEX])
       update_texgen(ctx);
-
-   _mesa_validate_image_units(ctx);
 }
 
 
diff --git a/src/mesa/main/textureview.c b/src/mesa/main/textureview.c
index 5a3282a40c1..04b7d73da5c 100644
--- a/src/mesa/main/textureview.c
+++ b/src/mesa/main/textureview.c
@@ -681,6 +681,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
    texObj->Immutable = GL_TRUE;
    texObj->ImmutableLevels = origTexObj->ImmutableLevels;
    texObj->Target = target;
+   texObj->TargetIndex = _mesa_tex_target_to_index(ctx, target);
+   assert(texObj->TargetIndex < NUM_TEXTURE_TARGETS);
 
    if (ctx->Driver.TextureView != NULL &&
        !ctx->Driver.TextureView(ctx, texObj, origTexObj)) {
diff --git a/src/mesa/main/uniform_query.cpp b/src/mesa/main/uniform_query.cpp
index d48729778ae..083087d6baa 100644
--- a/src/mesa/main/uniform_query.cpp
+++ b/src/mesa/main/uniform_query.cpp
@@ -318,19 +318,12 @@ _mesa_get_uniform(struct gl_context *ctx, GLuint program, GLint location,
 
       return;
    }
-   if ((uni->type->base_type == GLSL_TYPE_DOUBLE &&
-        returnType != GLSL_TYPE_DOUBLE) ||
-       (uni->type->base_type != GLSL_TYPE_DOUBLE &&
-        returnType == GLSL_TYPE_DOUBLE)) {
-	 _mesa_error( ctx, GL_INVALID_OPERATION,
-	             "glGetnUniform*vARB(incompatible uniform types)");
-	return;
-   }
 
    {
       unsigned elements = (uni->type->is_sampler())
 	 ? 1 : uni->type->components();
       const int dmul = uni->type->base_type == GLSL_TYPE_DOUBLE ? 2 : 1;
+      const int rmul = returnType == GLSL_TYPE_DOUBLE ? 2 : 1;
 
       /* Calculate the source base address *BEFORE* modifying elements to
        * account for the size of the user's buffer.
@@ -342,7 +335,7 @@ _mesa_get_uniform(struct gl_context *ctx, GLuint program, GLint location,
              returnType == GLSL_TYPE_UINT || returnType == GLSL_TYPE_DOUBLE);
 
       /* doubles have a different size than the other 3 types */
-      unsigned bytes = sizeof(src[0]) * elements * dmul;
+      unsigned bytes = sizeof(src[0]) * elements * rmul;
       if (bufSize < 0 || bytes > (unsigned) bufSize) {
 	 _mesa_error( ctx, GL_INVALID_OPERATION,
 	             "glGetnUniform*vARB(out of bounds: bufSize is %d,"
@@ -366,32 +359,57 @@ _mesa_get_uniform(struct gl_context *ctx, GLuint program, GLint location,
       } else {
 	 union gl_constant_value *const dst =
 	    (union gl_constant_value *) paramsOut;
-
 	 /* This code could be optimized by putting the loop inside the switch
 	  * statements.  However, this is not expected to be
 	  * performance-critical code.
 	  */
 	 for (unsigned i = 0; i < elements; i++) {
+	   int sidx = i * dmul;
+	   int didx = i * rmul;
+
 	    switch (returnType) {
 	    case GLSL_TYPE_FLOAT:
 	       switch (uni->type->base_type) {
 	       case GLSL_TYPE_UINT:
-		  dst[i].f = (float) src[i].u;
+		  dst[didx].f = (float) src[sidx].u;
 		  break;
 	       case GLSL_TYPE_INT:
 	       case GLSL_TYPE_SAMPLER:
                case GLSL_TYPE_IMAGE:
-		  dst[i].f = (float) src[i].i;
+		  dst[didx].f = (float) src[sidx].i;
 		  break;
 	       case GLSL_TYPE_BOOL:
-		  dst[i].f = src[i].i ? 1.0f : 0.0f;
+		  dst[didx].f = src[sidx].i ? 1.0f : 0.0f;
+		  break;
+	       case GLSL_TYPE_DOUBLE:
+		  dst[didx].f = *(double *)&src[sidx].f;
+		  break;
+	       default:
+		  assert(!"Should not get here.");
+		  break;
+	       }
+	       break;
+	    case GLSL_TYPE_DOUBLE:
+	       switch (uni->type->base_type) {
+	       case GLSL_TYPE_UINT:
+		  *(double *)&dst[didx].f = (double) src[sidx].u;
+		  break;
+	       case GLSL_TYPE_INT:
+	       case GLSL_TYPE_SAMPLER:
+	       case GLSL_TYPE_IMAGE:
+		  *(double *)&dst[didx].f = (double) src[sidx].i;
+		  break;
+	       case GLSL_TYPE_BOOL:
+		  *(double *)&dst[didx].f = src[sidx].i ? 1.0f : 0.0f;
+		  break;
+	       case GLSL_TYPE_FLOAT:
+		  *(double *)&dst[didx].f = (double) src[sidx].f;
 		  break;
 	       default:
 		  assert(!"Should not get here.");
 		  break;
 	       }
 	       break;
-
 	    case GLSL_TYPE_INT:
 	    case GLSL_TYPE_UINT:
 	       switch (uni->type->base_type) {
@@ -413,10 +431,13 @@ _mesa_get_uniform(struct gl_context *ctx, GLuint program, GLint location,
 		   *      a floating-point value is rounded to the
 		   *      nearest integer..."
 		   */
-		  dst[i].i = IROUND(src[i].f);
+		  dst[didx].i = IROUND(src[sidx].f);
 		  break;
 	       case GLSL_TYPE_BOOL:
-		  dst[i].i = src[i].i ? 1 : 0;
+		  dst[didx].i = src[sidx].i ? 1 : 0;
+		  break;
+	       case GLSL_TYPE_DOUBLE:
+		  dst[didx].i = *(double *)&src[sidx].f;
 		  break;
 	       default:
 		  assert(!"Should not get here.");
diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c
index 04cc81f9809..bc235380d97 100644
--- a/src/mesa/main/uniforms.c
+++ b/src/mesa/main/uniforms.c
@@ -1016,21 +1016,21 @@ _mesa_UniformBlockBinding(GLuint program,
       return;
    }
 
-   if (shProg->UniformBlocks[uniformBlockIndex].Binding !=
+   if (shProg->BufferInterfaceBlocks[uniformBlockIndex].Binding !=
        uniformBlockBinding) {
       int i;
 
       FLUSH_VERTICES(ctx, 0);
       ctx->NewDriverState |= ctx->DriverFlags.NewUniformBuffer;
 
-      shProg->UniformBlocks[uniformBlockIndex].Binding = uniformBlockBinding;
+      shProg->BufferInterfaceBlocks[uniformBlockIndex].Binding = uniformBlockBinding;
 
       for (i = 0; i < MESA_SHADER_STAGES; i++) {
 	 int stage_index = shProg->UniformBlockStageIndex[i][uniformBlockIndex];
 
 	 if (stage_index != -1) {
 	    struct gl_shader *sh = shProg->_LinkedShaders[i];
-	    sh->UniformBlocks[stage_index].Binding = uniformBlockBinding;
+	    sh->BufferInterfaceBlocks[stage_index].Binding = uniformBlockBinding;
 	 }
       }
    }
@@ -1069,21 +1069,21 @@ _mesa_ShaderStorageBlockBinding(GLuint program,
       return;
    }
 
-   if (shProg->UniformBlocks[shaderStorageBlockIndex].Binding !=
+   if (shProg->BufferInterfaceBlocks[shaderStorageBlockIndex].Binding !=
        shaderStorageBlockBinding) {
       int i;
 
       FLUSH_VERTICES(ctx, 0);
       ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
 
-      shProg->UniformBlocks[shaderStorageBlockIndex].Binding = shaderStorageBlockBinding;
+      shProg->BufferInterfaceBlocks[shaderStorageBlockIndex].Binding = shaderStorageBlockBinding;
 
       for (i = 0; i < MESA_SHADER_STAGES; i++) {
 	 int stage_index = shProg->UniformBlockStageIndex[i][shaderStorageBlockIndex];
 
 	 if (stage_index != -1) {
 	    struct gl_shader *sh = shProg->_LinkedShaders[i];
-	    sh->UniformBlocks[stage_index].Binding = shaderStorageBlockBinding;
+	    sh->BufferInterfaceBlocks[stage_index].Binding = shaderStorageBlockBinding;
 	 }
       }
    }
diff --git a/src/mesa/main/uniforms.h b/src/mesa/main/uniforms.h
index bec035cdc97..2f88b65043d 100644
--- a/src/mesa/main/uniforms.h
+++ b/src/mesa/main/uniforms.h
@@ -27,7 +27,7 @@
 #define UNIFORMS_H
 
 #include "main/glheader.h"
-#include "glsl/glsl_types.h"
+#include "glsl/nir/glsl_types.h"
 #include "glsl/ir_uniform.h"
 #include "program/prog_parameter.h"
 
diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c
index 498b2f867d0..5635a643200 100644
--- a/src/mesa/main/version.c
+++ b/src/mesa/main/version.c
@@ -24,6 +24,7 @@
 
 
 #include <stdio.h>
+#include "context.h"
 #include "imports.h"
 #include "mtypes.h"
 #include "version.h"
@@ -181,7 +182,23 @@ _mesa_override_gl_version(struct gl_context *ctx)
 {
    if (_mesa_override_gl_version_contextless(&ctx->Const, &ctx->API,
                                              &ctx->Version)) {
-      create_version_string(ctx, "");
+      /* We need to include the API in the version string for OpenGL ES,
+       * otherwise applications cannot detect GLES via glGetString(GL_VERSION).
+       *
+       * From OpenGL ES 3.2 spec, Page 436:
+       *
+       *     "The VERSION string is laid out as follows:
+       *
+       *     OpenGL ES N.M vendor-specific information"
+       *
+       * From OpenGL 4.5 spec, Page 538:
+       *
+       *     "The VERSION and SHADING_LANGUAGE_VERSION strings are laid out as
+       *     follows:
+       *
+       *     <version number><space><vendor-specific information>"
+       */
+      create_version_string(ctx, _mesa_is_gles(ctx) ? "OpenGL ES " : "");
    }
 }
 
diff --git a/src/mesa/program/Android.mk b/src/mesa/program/Android.mk
index ccb0fa5f32b..cc67f8aeadd 100644
--- a/src/mesa/program/Android.mk
+++ b/src/mesa/program/Android.mk
@@ -75,6 +75,7 @@ LOCAL_C_INCLUDES := \
 	$(MESA_TOP)/src/mapi \
 	$(MESA_TOP)/src/mesa \
 	$(MESA_TOP)/src/glsl \
+	$(MESA_TOP)/src/glsl/nir \
 	$(MESA_TOP)/src/gallium/auxiliary \
 	$(MESA_TOP)/src/gallium/include
 
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 0214b8e684c..1099d79d834 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -42,7 +42,7 @@
 #include "glsl/ir_optimization.h"
 #include "glsl/ir_uniform.h"
 #include "glsl/glsl_parser_extras.h"
-#include "glsl/glsl_types.h"
+#include "glsl/nir/glsl_types.h"
 #include "glsl/linker.h"
 #include "glsl/program.h"
 #include "program/hash_table.h"
diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c
index fc00534028f..539e3c05312 100644
--- a/src/mesa/program/prog_to_nir.c
+++ b/src/mesa/program/prog_to_nir.c
@@ -923,7 +923,7 @@ ptn_add_output_stores(struct ptn_compile *c)
 {
    nir_builder *b = &c->build;
 
-   foreach_list_typed(nir_variable, var, node, &b->shader->outputs) {
+   nir_foreach_variable(var, &b->shader->outputs) {
       nir_intrinsic_instr *store =
          nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_var);
       store->num_components = glsl_get_vector_elements(var->type);
@@ -958,11 +958,10 @@ setup_registers_and_variables(struct ptn_compile *c)
    for (int i = 0; i < num_inputs; i++) {
       if (!(c->prog->InputsRead & BITFIELD64_BIT(i)))
          continue;
-      nir_variable *var = rzalloc(shader, nir_variable);
-      var->type = glsl_vec4_type();
-      var->data.read_only = true;
-      var->data.mode = nir_var_shader_in;
-      var->name = ralloc_asprintf(var, "in_%d", i);
+
+      nir_variable *var =
+         nir_variable_create(shader, nir_var_shader_in, glsl_vec4_type(),
+                             ralloc_asprintf(shader, "in_%d", i));
       var->data.location = i;
       var->data.index = 0;
 
@@ -992,12 +991,9 @@ setup_registers_and_variables(struct ptn_compile *c)
             nir_ssa_def *f001 = nir_vec4(b, &load_x->dest.ssa, nir_imm_float(b, 0.0),
                                          nir_imm_float(b, 0.0), nir_imm_float(b, 1.0));
 
-            nir_variable *fullvar = rzalloc(shader, nir_variable);
-            fullvar->type = glsl_vec4_type();
-            fullvar->data.mode = nir_var_local;
-            fullvar->name = "fogcoord_tmp";
-            exec_list_push_tail(&b->impl->locals, &fullvar->node);
-
+            nir_variable *fullvar =
+               nir_local_variable_create(b->impl, glsl_vec4_type(),
+                                         "fogcoord_tmp");
             nir_intrinsic_instr *store =
                nir_intrinsic_instr_create(shader, nir_intrinsic_store_var);
             store->num_components = 4;
@@ -1005,17 +1001,15 @@ setup_registers_and_variables(struct ptn_compile *c)
             store->src[0] = nir_src_for_ssa(f001);
             nir_builder_instr_insert(b, &store->instr);
 
-            /* Insert the real input into the list so the driver has real
-             * inputs, but set c->input_vars[i] to the temporary so we use
+            /* We inserted the real input into the list so the driver has real
+             * inputs, but we set c->input_vars[i] to the temporary so we use
              * the splatted value.
              */
-            exec_list_push_tail(&shader->inputs, &var->node);
             c->input_vars[i] = fullvar;
             continue;
          }
       }
 
-      exec_list_push_tail(&shader->inputs, &var->node);
       c->input_vars[i] = var;
    }
 
@@ -1135,6 +1129,12 @@ prog_to_nir(const struct gl_program *prog,
    s->info.uses_clip_distance_out = false;
    s->info.separate_shader = false;
 
+   if (stage == MESA_SHADER_FRAGMENT) {
+      struct gl_fragment_program *fp = (struct gl_fragment_program *)prog;
+
+      s->info.fs.uses_discard = fp->UsesKill;
+   }
+
 fail:
    if (c->error) {
       ralloc_free(s);
diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c
index e94c1021258..0e78e6ab25d 100644
--- a/src/mesa/program/program.c
+++ b/src/mesa/program/program.c
@@ -173,57 +173,15 @@ _mesa_set_program_error(struct gl_context *ctx, GLint pos, const char *string)
 
 
 /**
- * Find the line number and column for 'pos' within 'string'.
- * Return a copy of the line which contains 'pos'.  Free the line with
- * free().
- * \param string  the program string
- * \param pos     the position within the string
- * \param line    returns the line number corresponding to 'pos'.
- * \param col     returns the column number corresponding to 'pos'.
- * \return copy of the line containing 'pos'.
- */
-const GLubyte *
-_mesa_find_line_column(const GLubyte *string, const GLubyte *pos,
-                       GLint *line, GLint *col)
-{
-   const GLubyte *lineStart = string;
-   const GLubyte *p = string;
-   GLubyte *s;
-   int len;
-
-   *line = 1;
-
-   while (p != pos) {
-      if (*p == (GLubyte) '\n') {
-         (*line)++;
-         lineStart = p + 1;
-      }
-      p++;
-   }
-
-   *col = (pos - lineStart) + 1;
-
-   /* return copy of this line */
-   while (*p != 0 && *p != '\n')
-      p++;
-   len = p - lineStart;
-   s = malloc(len + 1);
-   memcpy(s, lineStart, len);
-   s[len] = 0;
-
-   return s;
-}
-
-
-/**
  * Initialize a new gl_program object.
  */
-static void
-init_program_struct(struct gl_program *prog, GLenum target, GLuint id)
+struct gl_program *
+_mesa_init_gl_program(struct gl_program *prog, GLenum target, GLuint id)
 {
    GLuint i;
 
-   assert(prog);
+   if (!prog)
+      return NULL;
 
    memset(prog, 0, sizeof(*prog));
    mtx_init(&prog->Mutex, mtx_plain);
@@ -235,102 +193,8 @@ init_program_struct(struct gl_program *prog, GLenum target, GLuint id)
    /* default mapping from samplers to texture units */
    for (i = 0; i < MAX_SAMPLERS; i++)
       prog->SamplerUnits[i] = i;
-}
-
-
-/**
- * Initialize a new fragment program object.
- */
-struct gl_program *
-_mesa_init_fragment_program(struct gl_context *ctx,
-                            struct gl_fragment_program *prog,
-                            GLenum target, GLuint id)
-{
-   if (prog) {
-      init_program_struct(&prog->Base, target, id);
-      return &prog->Base;
-   }
-   return NULL;
-}
-
-
-/**
- * Initialize a new vertex program object.
- */
-struct gl_program *
-_mesa_init_vertex_program(struct gl_context *ctx,
-                          struct gl_vertex_program *prog,
-                          GLenum target, GLuint id)
-{
-   if (prog) {
-      init_program_struct(&prog->Base, target, id);
-      return &prog->Base;
-   }
-   return NULL;
-}
-
-
-/**
- * Initialize a new compute program object.
- */
-struct gl_program *
-_mesa_init_compute_program(struct gl_context *ctx,
-                           struct gl_compute_program *prog,
-                           GLenum target, GLuint id)
-{
-   if (prog) {
-      init_program_struct(&prog->Base, target, id);
-      return &prog->Base;
-   }
-   return NULL;
-}
-
-
-/**
- * Initialize a new tessellation control program object.
- */
-struct gl_program *
-_mesa_init_tess_ctrl_program(struct gl_context *ctx,
-                             struct gl_tess_ctrl_program *prog,
-                             GLenum target, GLuint id)
-{
-   if (prog) {
-      init_program_struct(&prog->Base, target, id);
-      return &prog->Base;
-   }
-   return NULL;
-}
 
-
-/**
- * Initialize a new tessellation evaluation program object.
- */
-struct gl_program *
-_mesa_init_tess_eval_program(struct gl_context *ctx,
-                             struct gl_tess_eval_program *prog,
-                             GLenum target, GLuint id)
-{
-   if (prog) {
-      init_program_struct(&prog->Base, target, id);
-      return &prog->Base;
-   }
-   return NULL;
-}
-
-
-/**
- * Initialize a new geometry program object.
- */
-struct gl_program *
-_mesa_init_geometry_program(struct gl_context *ctx,
-                            struct gl_geometry_program *prog,
-                            GLenum target, GLuint id)
-{
-   if (prog) {
-      init_program_struct(&prog->Base, target, id);
-      return &prog->Base;
-   }
-   return NULL;
+   return prog;
 }
 
 
@@ -349,43 +213,36 @@ _mesa_init_geometry_program(struct gl_context *ctx,
 struct gl_program *
 _mesa_new_program(struct gl_context *ctx, GLenum target, GLuint id)
 {
-   struct gl_program *prog;
    switch (target) {
-   case GL_VERTEX_PROGRAM_ARB: /* == GL_VERTEX_PROGRAM_NV */
-      prog = _mesa_init_vertex_program(ctx, CALLOC_STRUCT(gl_vertex_program),
-                                       target, id );
-      break;
+   case GL_VERTEX_PROGRAM_ARB: { /* == GL_VERTEX_PROGRAM_NV */
+      struct gl_vertex_program *prog = CALLOC_STRUCT(gl_vertex_program);
+      return _mesa_init_gl_program(&prog->Base, target, id);
+   }
    case GL_FRAGMENT_PROGRAM_NV:
-   case GL_FRAGMENT_PROGRAM_ARB:
-      prog =_mesa_init_fragment_program(ctx,
-                                         CALLOC_STRUCT(gl_fragment_program),
-                                         target, id );
-      break;
-   case GL_GEOMETRY_PROGRAM_NV:
-      prog = _mesa_init_geometry_program(ctx,
-                                         CALLOC_STRUCT(gl_geometry_program),
-                                         target, id);
-      break;
-   case GL_TESS_CONTROL_PROGRAM_NV:
-      prog = _mesa_init_tess_ctrl_program(ctx,
-                                          CALLOC_STRUCT(gl_tess_ctrl_program),
-                                          target, id);
-      break;
-   case GL_TESS_EVALUATION_PROGRAM_NV:
-      prog = _mesa_init_tess_eval_program(ctx,
-                                         CALLOC_STRUCT(gl_tess_eval_program),
-                                         target, id);
-      break;
-   case GL_COMPUTE_PROGRAM_NV:
-      prog = _mesa_init_compute_program(ctx,
-                                        CALLOC_STRUCT(gl_compute_program),
-                                        target, id);
-      break;
+   case GL_FRAGMENT_PROGRAM_ARB: {
+      struct gl_fragment_program *prog = CALLOC_STRUCT(gl_fragment_program);
+      return _mesa_init_gl_program(&prog->Base, target, id);
+   }
+   case GL_GEOMETRY_PROGRAM_NV: {
+      struct gl_geometry_program *prog = CALLOC_STRUCT(gl_geometry_program);
+      return _mesa_init_gl_program(&prog->Base, target, id);
+   }
+   case GL_TESS_CONTROL_PROGRAM_NV: {
+      struct gl_tess_ctrl_program *prog = CALLOC_STRUCT(gl_tess_ctrl_program);
+      return _mesa_init_gl_program(&prog->Base, target, id);
+   }
+   case GL_TESS_EVALUATION_PROGRAM_NV: {
+      struct gl_tess_eval_program *prog = CALLOC_STRUCT(gl_tess_eval_program);
+      return _mesa_init_gl_program(&prog->Base, target, id);
+   }
+   case GL_COMPUTE_PROGRAM_NV: {
+      struct gl_compute_program *prog = CALLOC_STRUCT(gl_compute_program);
+      return _mesa_init_gl_program(&prog->Base, target, id);
+   }
    default:
       _mesa_problem(ctx, "bad target in _mesa_new_program");
-      prog = NULL;
+      return NULL;
    }
-   return prog;
 }
 
 
@@ -494,123 +351,6 @@ _mesa_reference_program_(struct gl_context *ctx,
 
 
 /**
- * Return a copy of a program.
- * XXX Problem here if the program object is actually OO-derivation
- * made by a device driver.
- */
-struct gl_program *
-_mesa_clone_program(struct gl_context *ctx, const struct gl_program *prog)
-{
-   struct gl_program *clone;
-
-   clone = ctx->Driver.NewProgram(ctx, prog->Target, prog->Id);
-   if (!clone)
-      return NULL;
-
-   assert(clone->Target == prog->Target);
-   assert(clone->RefCount == 1);
-
-   clone->String = (GLubyte *) strdup((char *) prog->String);
-   clone->Format = prog->Format;
-   clone->Instructions = _mesa_alloc_instructions(prog->NumInstructions);
-   if (!clone->Instructions) {
-      _mesa_reference_program(ctx, &clone, NULL);
-      return NULL;
-   }
-   _mesa_copy_instructions(clone->Instructions, prog->Instructions,
-                           prog->NumInstructions);
-   clone->InputsRead = prog->InputsRead;
-   clone->OutputsWritten = prog->OutputsWritten;
-   clone->SamplersUsed = prog->SamplersUsed;
-   clone->ShadowSamplers = prog->ShadowSamplers;
-   memcpy(clone->TexturesUsed, prog->TexturesUsed, sizeof(prog->TexturesUsed));
-
-   if (prog->Parameters)
-      clone->Parameters = _mesa_clone_parameter_list(prog->Parameters);
-   if (prog->LocalParams) {
-      clone->LocalParams = malloc(MAX_PROGRAM_LOCAL_PARAMS *
-                                  sizeof(float[4]));
-      if (!clone->LocalParams) {
-         _mesa_reference_program(ctx, &clone, NULL);
-         return NULL;
-      }
-      memcpy(clone->LocalParams, prog->LocalParams,
-             MAX_PROGRAM_LOCAL_PARAMS * sizeof(float[4]));
-   }
-   clone->IndirectRegisterFiles = prog->IndirectRegisterFiles;
-   clone->NumInstructions = prog->NumInstructions;
-   clone->NumTemporaries = prog->NumTemporaries;
-   clone->NumParameters = prog->NumParameters;
-   clone->NumAttributes = prog->NumAttributes;
-   clone->NumAddressRegs = prog->NumAddressRegs;
-   clone->NumNativeInstructions = prog->NumNativeInstructions;
-   clone->NumNativeTemporaries = prog->NumNativeTemporaries;
-   clone->NumNativeParameters = prog->NumNativeParameters;
-   clone->NumNativeAttributes = prog->NumNativeAttributes;
-   clone->NumNativeAddressRegs = prog->NumNativeAddressRegs;
-   clone->NumAluInstructions = prog->NumAluInstructions;
-   clone->NumTexInstructions = prog->NumTexInstructions;
-   clone->NumTexIndirections = prog->NumTexIndirections;
-   clone->NumNativeAluInstructions = prog->NumNativeAluInstructions;
-   clone->NumNativeTexInstructions = prog->NumNativeTexInstructions;
-   clone->NumNativeTexIndirections = prog->NumNativeTexIndirections;
-
-   switch (prog->Target) {
-   case GL_VERTEX_PROGRAM_ARB:
-      {
-         const struct gl_vertex_program *vp = gl_vertex_program_const(prog);
-         struct gl_vertex_program *vpc = gl_vertex_program(clone);
-         vpc->IsPositionInvariant = vp->IsPositionInvariant;
-      }
-      break;
-   case GL_FRAGMENT_PROGRAM_ARB:
-      {
-         const struct gl_fragment_program *fp = gl_fragment_program_const(prog);
-         struct gl_fragment_program *fpc = gl_fragment_program(clone);
-         fpc->UsesKill = fp->UsesKill;
-         fpc->UsesDFdy = fp->UsesDFdy;
-         fpc->OriginUpperLeft = fp->OriginUpperLeft;
-         fpc->PixelCenterInteger = fp->PixelCenterInteger;
-      }
-      break;
-   case GL_GEOMETRY_PROGRAM_NV:
-      {
-         const struct gl_geometry_program *gp = gl_geometry_program_const(prog);
-         struct gl_geometry_program *gpc = gl_geometry_program(clone);
-         gpc->VerticesOut = gp->VerticesOut;
-         gpc->InputType = gp->InputType;
-         gpc->Invocations = gp->Invocations;
-         gpc->OutputType = gp->OutputType;
-         gpc->UsesEndPrimitive = gp->UsesEndPrimitive;
-         gpc->UsesStreams = gp->UsesStreams;
-      }
-      break;
-   case GL_TESS_CONTROL_PROGRAM_NV:
-      {
-         const struct gl_tess_ctrl_program *tcp = gl_tess_ctrl_program_const(prog);
-         struct gl_tess_ctrl_program *tcpc = gl_tess_ctrl_program(clone);
-         tcpc->VerticesOut = tcp->VerticesOut;
-      }
-      break;
-   case GL_TESS_EVALUATION_PROGRAM_NV:
-      {
-         const struct gl_tess_eval_program *tep = gl_tess_eval_program_const(prog);
-         struct gl_tess_eval_program *tepc = gl_tess_eval_program(clone);
-         tepc->PrimitiveMode = tep->PrimitiveMode;
-         tepc->Spacing = tep->Spacing;
-         tepc->VertexOrder = tep->VertexOrder;
-         tepc->PointMode = tep->PointMode;
-      }
-      break;
-   default:
-      _mesa_problem(NULL, "Unexpected target in _mesa_clone_program");
-   }
-
-   return clone;
-}
-
-
-/**
  * Insert 'count' NOP instructions at 'start' in the given program.
  * Adjust branch targets accordingly.
  */
@@ -707,190 +447,6 @@ _mesa_delete_instructions(struct gl_program *prog, GLuint start, GLuint count)
 
 
 /**
- * Search instructions for registers that match (oldFile, oldIndex),
- * replacing them with (newFile, newIndex).
- */
-static void
-replace_registers(struct prog_instruction *inst, GLuint numInst,
-                  GLuint oldFile, GLuint oldIndex,
-                  GLuint newFile, GLuint newIndex)
-{
-   GLuint i, j;
-   for (i = 0; i < numInst; i++) {
-      /* src regs */
-      for (j = 0; j < _mesa_num_inst_src_regs(inst[i].Opcode); j++) {
-         if (inst[i].SrcReg[j].File == oldFile &&
-             inst[i].SrcReg[j].Index == oldIndex) {
-            inst[i].SrcReg[j].File = newFile;
-            inst[i].SrcReg[j].Index = newIndex;
-         }
-      }
-      /* dst reg */
-      if (inst[i].DstReg.File == oldFile && inst[i].DstReg.Index == oldIndex) {
-         inst[i].DstReg.File = newFile;
-         inst[i].DstReg.Index = newIndex;
-      }
-   }
-}
-
-
-/**
- * Search instructions for references to program parameters.  When found,
- * increment the parameter index by 'offset'.
- * Used when combining programs.
- */
-static void
-adjust_param_indexes(struct prog_instruction *inst, GLuint numInst,
-                     GLuint offset)
-{
-   GLuint i, j;
-   for (i = 0; i < numInst; i++) {
-      for (j = 0; j < _mesa_num_inst_src_regs(inst[i].Opcode); j++) {
-         GLuint f = inst[i].SrcReg[j].File;
-         if (f == PROGRAM_CONSTANT ||
-             f == PROGRAM_UNIFORM ||
-             f == PROGRAM_STATE_VAR) {
-            inst[i].SrcReg[j].Index += offset;
-         }
-      }
-   }
-}
-
-
-/**
- * Combine two programs into one.  Fix instructions so the outputs of
- * the first program go to the inputs of the second program.
- */
-struct gl_program *
-_mesa_combine_programs(struct gl_context *ctx,
-                       const struct gl_program *progA,
-                       const struct gl_program *progB)
-{
-   struct prog_instruction *newInst;
-   struct gl_program *newProg;
-   const GLuint lenA = progA->NumInstructions - 1; /* omit END instr */
-   const GLuint lenB = progB->NumInstructions;
-   const GLuint numParamsA = _mesa_num_parameters(progA->Parameters);
-   const GLuint newLength = lenA + lenB;
-   GLboolean usedTemps[MAX_PROGRAM_TEMPS];
-   GLuint firstTemp = 0;
-   GLbitfield64 inputsB;
-   GLuint i;
-
-   assert(progA->Target == progB->Target);
-
-   newInst = _mesa_alloc_instructions(newLength);
-   if (!newInst)
-      return GL_FALSE;
-
-   _mesa_copy_instructions(newInst, progA->Instructions, lenA);
-   _mesa_copy_instructions(newInst + lenA, progB->Instructions, lenB);
-
-   /* adjust branch / instruction addresses for B's instructions */
-   for (i = 0; i < lenB; i++) {
-      newInst[lenA + i].BranchTarget += lenA;
-   }
-
-   newProg = ctx->Driver.NewProgram(ctx, progA->Target, 0);
-   newProg->Instructions = newInst;
-   newProg->NumInstructions = newLength;
-
-   /* find used temp regs (we may need new temps below) */
-   _mesa_find_used_registers(newProg, PROGRAM_TEMPORARY,
-                             usedTemps, MAX_PROGRAM_TEMPS);
-
-   if (newProg->Target == GL_FRAGMENT_PROGRAM_ARB) {
-      const struct gl_fragment_program *fprogA, *fprogB;
-      struct gl_fragment_program *newFprog;
-      GLbitfield64 progB_inputsRead = progB->InputsRead;
-      GLint progB_colorFile, progB_colorIndex;
-
-      fprogA = gl_fragment_program_const(progA);
-      fprogB = gl_fragment_program_const(progB);
-      newFprog = gl_fragment_program(newProg);
-
-      newFprog->UsesKill = fprogA->UsesKill || fprogB->UsesKill;
-      newFprog->UsesDFdy = fprogA->UsesDFdy || fprogB->UsesDFdy;
-
-      /* We'll do a search and replace for instances
-       * of progB_colorFile/progB_colorIndex below...
-       */
-      progB_colorFile = PROGRAM_INPUT;
-      progB_colorIndex = VARYING_SLOT_COL0;
-
-      /*
-       * The fragment program may get color from a state var rather than
-       * a fragment input (vertex output) if it's constant.
-       * See the texenvprogram.c code.
-       * So, search the program's parameter list now to see if the program
-       * gets color from a state var instead of a conventional fragment
-       * input register.
-       */
-      for (i = 0; i < progB->Parameters->NumParameters; i++) {
-         struct gl_program_parameter *p = &progB->Parameters->Parameters[i];
-         if (p->Type == PROGRAM_STATE_VAR &&
-             p->StateIndexes[0] == STATE_INTERNAL &&
-             p->StateIndexes[1] == STATE_CURRENT_ATTRIB &&
-             (int) p->StateIndexes[2] == (int) VERT_ATTRIB_COLOR0) {
-            progB_inputsRead |= VARYING_BIT_COL0;
-            progB_colorFile = PROGRAM_STATE_VAR;
-            progB_colorIndex = i;
-            break;
-         }
-      }
-
-      /* Connect color outputs of fprogA to color inputs of fprogB, via a
-       * new temporary register.
-       */
-      if ((progA->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_COLOR)) &&
-          (progB_inputsRead & VARYING_BIT_COL0)) {
-         GLint tempReg = _mesa_find_free_register(usedTemps, MAX_PROGRAM_TEMPS,
-                                                  firstTemp);
-         if (tempReg < 0) {
-            _mesa_problem(ctx, "No free temp regs found in "
-                          "_mesa_combine_programs(), using 31");
-            tempReg = 31;
-         }
-         firstTemp = tempReg + 1;
-
-         /* replace writes to result.color[0] with tempReg */
-         replace_registers(newInst, lenA,
-                           PROGRAM_OUTPUT, FRAG_RESULT_COLOR,
-                           PROGRAM_TEMPORARY, tempReg);
-         /* replace reads from the input color with tempReg */
-         replace_registers(newInst + lenA, lenB,
-                           progB_colorFile, progB_colorIndex, /* search for */
-                           PROGRAM_TEMPORARY, tempReg  /* replace with */ );
-      }
-
-      /* compute combined program's InputsRead */
-      inputsB = progB_inputsRead;
-      if (progA->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_COLOR)) {
-         inputsB &= ~(1 << VARYING_SLOT_COL0);
-      }
-      newProg->InputsRead = progA->InputsRead | inputsB;
-      newProg->OutputsWritten = progB->OutputsWritten;
-      newProg->SamplersUsed = progA->SamplersUsed | progB->SamplersUsed;
-   }
-   else {
-      /* vertex program */
-      assert(0);      /* XXX todo */
-   }
-
-   /*
-    * Merge parameters (uniforms, constants, etc)
-    */
-   newProg->Parameters = _mesa_combine_parameter_lists(progA->Parameters,
-                                                       progB->Parameters);
-
-   adjust_param_indexes(newInst + lenA, lenB, numParamsA);
-
-
-   return newProg;
-}
-
-
-/**
  * Populate the 'used' array with flags indicating which registers (TEMPs,
  * INPUTs, OUTPUTs, etc, are used by the given program.
  * \param file  type of register to scan for
@@ -952,140 +508,6 @@ _mesa_find_free_register(const GLboolean used[],
 }
 
 
-
-/**
- * Check if the given register index is valid (doesn't exceed implementation-
- * dependent limits).
- * \return GL_TRUE if OK, GL_FALSE if bad index
- */
-GLboolean
-_mesa_valid_register_index(const struct gl_context *ctx,
-                           gl_shader_stage shaderType,
-                           gl_register_file file, GLint index)
-{
-   const struct gl_program_constants *c;
-
-   assert(0 <= shaderType && shaderType < MESA_SHADER_STAGES);
-   c = &ctx->Const.Program[shaderType];
-
-   switch (file) {
-   case PROGRAM_UNDEFINED:
-      return GL_TRUE;  /* XXX or maybe false? */
-
-   case PROGRAM_TEMPORARY:
-      return index >= 0 && index < (GLint) c->MaxTemps;
-
-   case PROGRAM_UNIFORM:
-   case PROGRAM_STATE_VAR:
-      /* aka constant buffer */
-      return index >= 0 && index < (GLint) c->MaxUniformComponents / 4;
-
-   case PROGRAM_CONSTANT:
-      /* constant buffer w/ possible relative negative addressing */
-      return (index > (int) c->MaxUniformComponents / -4 &&
-              index < (int) c->MaxUniformComponents / 4);
-
-   case PROGRAM_INPUT:
-      if (index < 0)
-         return GL_FALSE;
-
-      switch (shaderType) {
-      case MESA_SHADER_VERTEX:
-         return index < VERT_ATTRIB_GENERIC0 + (GLint) c->MaxAttribs;
-      case MESA_SHADER_FRAGMENT:
-         return index < VARYING_SLOT_VAR0 + (GLint) ctx->Const.MaxVarying;
-      case MESA_SHADER_GEOMETRY:
-         return index < VARYING_SLOT_VAR0 + (GLint) ctx->Const.MaxVarying;
-      default:
-         return GL_FALSE;
-      }
-
-   case PROGRAM_OUTPUT:
-      if (index < 0)
-         return GL_FALSE;
-
-      switch (shaderType) {
-      case MESA_SHADER_VERTEX:
-         return index < VARYING_SLOT_VAR0 + (GLint) ctx->Const.MaxVarying;
-      case MESA_SHADER_FRAGMENT:
-         return index < FRAG_RESULT_DATA0 + (GLint) ctx->Const.MaxDrawBuffers;
-      case MESA_SHADER_GEOMETRY:
-         return index < VARYING_SLOT_VAR0 + (GLint) ctx->Const.MaxVarying;
-      default:
-         return GL_FALSE;
-      }
-
-   case PROGRAM_ADDRESS:
-      return index >= 0 && index < (GLint) c->MaxAddressRegs;
-
-   default:
-      _mesa_problem(ctx,
-                    "unexpected register file in _mesa_valid_register_index()");
-      return GL_FALSE;
-   }
-}
-
-
-
-/**
- * "Post-process" a GPU program.  This is intended to be used for debugging.
- * Example actions include no-op'ing instructions or changing instruction
- * behaviour.
- */
-void
-_mesa_postprocess_program(struct gl_context *ctx, struct gl_program *prog)
-{
-   static const GLfloat white[4] = { 0.5, 0.5, 0.5, 0.5 };
-   GLuint i;
-   GLuint whiteSwizzle;
-   GLint whiteIndex = _mesa_add_unnamed_constant(prog->Parameters,
-                                                 (gl_constant_value *) white,
-                                                 4, &whiteSwizzle);
-
-   (void) whiteIndex;
-
-   for (i = 0; i < prog->NumInstructions; i++) {
-      struct prog_instruction *inst = prog->Instructions + i;
-      const GLuint n = _mesa_num_inst_src_regs(inst->Opcode);
-
-      (void) n;
-
-      if (_mesa_is_tex_instruction(inst->Opcode)) {
-#if 0
-         /* replace TEX/TXP/TXB with MOV */
-         inst->Opcode = OPCODE_MOV;
-         inst->DstReg.WriteMask = WRITEMASK_XYZW;
-         inst->SrcReg[0].Swizzle = SWIZZLE_XYZW;
-         inst->SrcReg[0].Negate = NEGATE_NONE;
-#endif
-
-#if 0
-         /* disable shadow texture mode */
-         inst->TexShadow = 0;
-#endif
-      }
-
-      if (inst->Opcode == OPCODE_TXP) {
-#if 0
-         inst->Opcode = OPCODE_MOV;
-         inst->DstReg.WriteMask = WRITEMASK_XYZW;
-         inst->SrcReg[0].File = PROGRAM_CONSTANT;
-         inst->SrcReg[0].Index = whiteIndex;
-         inst->SrcReg[0].Swizzle = SWIZZLE_XYZW;
-         inst->SrcReg[0].Negate = NEGATE_NONE;
-#endif
-#if 0
-         inst->TexShadow = 0;
-#endif
-#if 0
-         inst->Opcode = OPCODE_TEX;
-         inst->TexShadow = 0;
-#endif
-      }
-
-   }
-}
-
 /* Gets the minimum number of shader invocations per fragment.
  * This function is useful to determine if we need to do per
  * sample shading or per fragment shading.
diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h
index a894147cafd..24e05974dc3 100644
--- a/src/mesa/program/program.h
+++ b/src/mesa/program/program.h
@@ -63,40 +63,8 @@ _mesa_update_default_objects_program(struct gl_context *ctx);
 extern void
 _mesa_set_program_error(struct gl_context *ctx, GLint pos, const char *string);
 
-extern const GLubyte *
-_mesa_find_line_column(const GLubyte *string, const GLubyte *pos,
-                       GLint *line, GLint *col);
-
-
-extern struct gl_program *
-_mesa_init_vertex_program(struct gl_context *ctx,
-                          struct gl_vertex_program *prog,
-                          GLenum target, GLuint id);
-
-extern struct gl_program *
-_mesa_init_fragment_program(struct gl_context *ctx,
-                            struct gl_fragment_program *prog,
-                            GLenum target, GLuint id);
-
 extern struct gl_program *
-_mesa_init_tess_ctrl_program(struct gl_context *ctx,
-                            struct gl_tess_ctrl_program *prog,
-                            GLenum target, GLuint id);
-
-extern struct gl_program *
-_mesa_init_tess_eval_program(struct gl_context *ctx,
-                            struct gl_tess_eval_program *prog,
-                            GLenum target, GLuint id);
-
-extern struct gl_program *
-_mesa_init_geometry_program(struct gl_context *ctx,
-                            struct gl_geometry_program *prog,
-                            GLenum target, GLuint id);
-
-extern struct gl_program *
-_mesa_init_compute_program(struct gl_context *ctx,
-                           struct gl_compute_program *prog,
-                           GLenum target, GLuint id);
+_mesa_init_gl_program(struct gl_program *prog, GLenum target, GLuint id);
 
 extern struct gl_program *
 _mesa_new_program(struct gl_context *ctx, GLenum target, GLuint id);
@@ -176,56 +144,12 @@ _mesa_reference_tesseprog(struct gl_context *ctx,
                            (struct gl_program *) prog);
 }
 
-extern struct gl_program *
-_mesa_clone_program(struct gl_context *ctx, const struct gl_program *prog);
-
-static inline struct gl_vertex_program *
-_mesa_clone_vertex_program(struct gl_context *ctx,
-                           const struct gl_vertex_program *prog)
-{
-   return (struct gl_vertex_program *) _mesa_clone_program(ctx, &prog->Base);
-}
-
-static inline struct gl_tess_ctrl_program *
-_mesa_clone_tess_ctrl_program(struct gl_context *ctx,
-                             const struct gl_tess_ctrl_program *prog)
-{
-   return (struct gl_tess_ctrl_program *) _mesa_clone_program(ctx, &prog->Base);
-}
-
-static inline struct gl_tess_eval_program *
-_mesa_clone_tess_eval_program(struct gl_context *ctx,
-                             const struct gl_tess_eval_program *prog)
-{
-   return (struct gl_tess_eval_program *) _mesa_clone_program(ctx, &prog->Base);
-}
-
-static inline struct gl_geometry_program *
-_mesa_clone_geometry_program(struct gl_context *ctx,
-                             const struct gl_geometry_program *prog)
-{
-   return (struct gl_geometry_program *) _mesa_clone_program(ctx, &prog->Base);
-}
-
-static inline struct gl_fragment_program *
-_mesa_clone_fragment_program(struct gl_context *ctx,
-                             const struct gl_fragment_program *prog)
-{
-   return (struct gl_fragment_program *) _mesa_clone_program(ctx, &prog->Base);
-}
-
-
 extern  GLboolean
 _mesa_insert_instructions(struct gl_program *prog, GLuint start, GLuint count);
 
 extern  GLboolean
 _mesa_delete_instructions(struct gl_program *prog, GLuint start, GLuint count);
 
-extern struct gl_program *
-_mesa_combine_programs(struct gl_context *ctx,
-                       const struct gl_program *progA,
-                       const struct gl_program *progB);
-
 extern void
 _mesa_find_used_registers(const struct gl_program *prog,
                           gl_register_file file,
@@ -235,15 +159,6 @@ extern GLint
 _mesa_find_free_register(const GLboolean used[],
                          GLuint maxRegs, GLuint firstReg);
 
-
-extern GLboolean
-_mesa_valid_register_index(const struct gl_context *ctx,
-                           gl_shader_stage shaderType,
-                           gl_register_file file, GLint index);
-
-extern void
-_mesa_postprocess_program(struct gl_context *ctx, struct gl_program *prog);
-
 extern GLint
 _mesa_get_min_invocations_per_fragment(struct gl_context *ctx,
                                        const struct gl_fragment_program *prog,
diff --git a/src/mesa/program/sampler.cpp b/src/mesa/program/sampler.cpp
index 1198a3c45f1..84e2504baba 100644
--- a/src/mesa/program/sampler.cpp
+++ b/src/mesa/program/sampler.cpp
@@ -24,7 +24,7 @@
  */
 
 #include "main/mtypes.h"
-#include "glsl/glsl_types.h"
+#include "glsl/nir/glsl_types.h"
 #include "glsl/ir.h"
 #include "glsl/ir_uniform.h"
 #include "glsl/ir_visitor.h"
diff --git a/src/mesa/state_tracker/st_atom_clip.c b/src/mesa/state_tracker/st_atom_clip.c
index 506a770499f..b820d843385 100644
--- a/src/mesa/state_tracker/st_atom_clip.c
+++ b/src/mesa/state_tracker/st_atom_clip.c
@@ -56,6 +56,9 @@ static void update_clip( struct st_context *st )
          use_eye = TRUE;
    }
 
+   /* _ClipUserPlane = _NEW_TRANSFORM | _NEW_PROJECTION
+    * EyeUserPlane = _NEW_TRANSFORM
+    */
    memcpy(clip.ucp,
           use_eye ? ctx->Transform.EyeUserPlane
                   : ctx->Transform._ClipUserPlane, sizeof(clip.ucp));
@@ -70,7 +73,7 @@ static void update_clip( struct st_context *st )
 const struct st_tracked_state st_update_clip = {
    "st_update_clip",					/* name */
    {							/* dirty */
-      _NEW_TRANSFORM,                                   /* mesa */
+      _NEW_TRANSFORM | _NEW_PROJECTION,                 /* mesa */
       ST_NEW_VERTEX_PROGRAM,				/* st */
    },
    update_clip						/* update */
diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c
index 6affb4d84d5..acaa85d9356 100644
--- a/src/mesa/state_tracker/st_atom_constbuf.c
+++ b/src/mesa/state_tracker/st_atom_constbuf.c
@@ -238,7 +238,7 @@ static void st_bind_ubos(struct st_context *st,
       struct gl_uniform_buffer_binding *binding;
       struct st_buffer_object *st_obj;
 
-      binding = &st->ctx->UniformBufferBindings[shader->UniformBlocks[i].Binding];
+      binding = &st->ctx->UniformBufferBindings[shader->UniformBlocks[i]->Binding];
       st_obj = st_buffer_object(binding->BufferObject);
 
       cb.buffer = st_obj->buffer;
diff --git a/src/mesa/state_tracker/st_atom_pixeltransfer.c b/src/mesa/state_tracker/st_atom_pixeltransfer.c
index a04163cc137..f94c358afba 100644
--- a/src/mesa/state_tracker/st_atom_pixeltransfer.c
+++ b/src/mesa/state_tracker/st_atom_pixeltransfer.c
@@ -25,65 +25,17 @@
  * 
  **************************************************************************/
 
-/*
- * Generate fragment programs to implement pixel transfer ops, such as
- * scale/bias, colortable, convolution...
- *
- * Authors:
+/* Authors:
  *   Brian Paul
  */
 
-#include "main/imports.h"
-#include "main/image.h"
-#include "main/macros.h"
-#include "program/program.h"
-#include "program/prog_cache.h"
-#include "program/prog_instruction.h"
-#include "program/prog_parameter.h"
-#include "program/prog_print.h"
-
 #include "st_context.h"
-#include "st_format.h"
 #include "st_texture.h"
 
-#include "pipe/p_screen.h"
-#include "pipe/p_context.h"
 #include "util/u_inlines.h"
 #include "util/u_pack_color.h"
 
 
-struct state_key
-{
-   GLuint scaleAndBias:1;
-   GLuint pixelMaps:1;
-
-#if 0
-   GLfloat Maps[3][256][4];
-   int NumMaps;
-   GLint NumStages;
-   pipeline_stage Stages[STAGE_MAX];
-   GLboolean StagesUsed[STAGE_MAX];
-   GLfloat Scale1[4], Bias1[4];
-   GLfloat Scale2[4], Bias2[4];
-#endif
-};
-
-static void
-make_state_key(struct gl_context *ctx,  struct state_key *key)
-{
-   memset(key, 0, sizeof(*key));
-
-   if (ctx->Pixel.RedBias != 0.0 || ctx->Pixel.RedScale != 1.0 ||
-       ctx->Pixel.GreenBias != 0.0 || ctx->Pixel.GreenScale != 1.0 ||
-       ctx->Pixel.BlueBias != 0.0 || ctx->Pixel.BlueScale != 1.0 ||
-       ctx->Pixel.AlphaBias != 0.0 || ctx->Pixel.AlphaScale != 1.0) {
-      key->scaleAndBias = 1;
-   }
-
-   key->pixelMaps = ctx->Pixel.MapColorFlag;
-}
-
-
 /**
  * Update the pixelmap texture with the contents of the R/G/B/A pixel maps.
  */
@@ -128,74 +80,15 @@ load_color_map_texture(struct gl_context *ctx, struct pipe_resource *pt)
    pipe_transfer_unmap(pipe, transfer);
 }
 
-
-
-#define MAX_INST 100
-
 /**
- * Returns a fragment program which implements the current pixel transfer ops.
+ * Upload the pixel transfer color map texture.
  */
-static struct gl_fragment_program *
-get_pixel_transfer_program(struct gl_context *ctx, const struct state_key *key)
+static void
+update_pixel_transfer(struct st_context *st)
 {
-   struct st_context *st = st_context(ctx);
-   struct prog_instruction inst[MAX_INST];
-   struct gl_program_parameter_list *params;
-   struct gl_fragment_program *fp;
-   GLuint ic = 0;
-   const GLuint colorTemp = 0;
-
-   fp = (struct gl_fragment_program *)
-      ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0);
-   if (!fp)
-      return NULL;
-
-   params = _mesa_new_parameter_list();
-
-   /*
-    * Get initial pixel color from the texture.
-    * TEX colorTemp, fragment.texcoord[0], texture[0], 2D;
-    */
-   _mesa_init_instructions(inst + ic, 1);
-   inst[ic].Opcode = OPCODE_TEX;
-   inst[ic].DstReg.File = PROGRAM_TEMPORARY;
-   inst[ic].DstReg.Index = colorTemp;
-   inst[ic].SrcReg[0].File = PROGRAM_INPUT;
-   inst[ic].SrcReg[0].Index = VARYING_SLOT_TEX0;
-   inst[ic].TexSrcUnit = 0;
-   inst[ic].TexSrcTarget = TEXTURE_2D_INDEX;
-   ic++;
-   fp->Base.InputsRead = BITFIELD64_BIT(VARYING_SLOT_TEX0);
-   fp->Base.OutputsWritten = BITFIELD64_BIT(FRAG_RESULT_COLOR);
-   fp->Base.SamplersUsed = 0x1;  /* sampler 0 (bit 0) is used */
-
-   if (key->scaleAndBias) {
-      static const gl_state_index scale_state[STATE_LENGTH] =
-         { STATE_INTERNAL, STATE_PT_SCALE, 0, 0, 0 };
-      static const gl_state_index bias_state[STATE_LENGTH] =
-         { STATE_INTERNAL, STATE_PT_BIAS, 0, 0, 0 };
-      GLint scale_p, bias_p;
-
-      scale_p = _mesa_add_state_reference(params, scale_state);
-      bias_p = _mesa_add_state_reference(params, bias_state);
-
-      /* MAD colorTemp, colorTemp, scale, bias; */
-      _mesa_init_instructions(inst + ic, 1);
-      inst[ic].Opcode = OPCODE_MAD;
-      inst[ic].DstReg.File = PROGRAM_TEMPORARY;
-      inst[ic].DstReg.Index = colorTemp;
-      inst[ic].SrcReg[0].File = PROGRAM_TEMPORARY;
-      inst[ic].SrcReg[0].Index = colorTemp;
-      inst[ic].SrcReg[1].File = PROGRAM_STATE_VAR;
-      inst[ic].SrcReg[1].Index = scale_p;
-      inst[ic].SrcReg[2].File = PROGRAM_STATE_VAR;
-      inst[ic].SrcReg[2].Index = bias_p;
-      ic++;
-   }
-
-   if (key->pixelMaps) {
-      const GLuint temp = 1;
+   struct gl_context *ctx = st->ctx;
 
+   if (ctx->Pixel.MapColorFlag) {
       /* create the colormap/texture now if not already done */
       if (!st->pixel_xfer.pixelmap_texture) {
          st->pixel_xfer.pixelmap_texture = st_create_color_map_texture(ctx);
@@ -203,117 +96,11 @@ get_pixel_transfer_program(struct gl_context *ctx, const struct state_key *key)
             st_create_texture_sampler_view(st->pipe,
                                            st->pixel_xfer.pixelmap_texture);
       }
-
-      /* with a little effort, we can do four pixel map look-ups with
-       * two TEX instructions:
-       */
-
-      /* TEX temp.rg, colorTemp.rgba, texture[1], 2D; */
-      _mesa_init_instructions(inst + ic, 1);
-      inst[ic].Opcode = OPCODE_TEX;
-      inst[ic].DstReg.File = PROGRAM_TEMPORARY;
-      inst[ic].DstReg.Index = temp;
-      inst[ic].DstReg.WriteMask = WRITEMASK_XY; /* write R,G */
-      inst[ic].SrcReg[0].File = PROGRAM_TEMPORARY;
-      inst[ic].SrcReg[0].Index = colorTemp;
-      inst[ic].TexSrcUnit = 1;
-      inst[ic].TexSrcTarget = TEXTURE_2D_INDEX;
-      ic++;
-
-      /* TEX temp.ba, colorTemp.baba, texture[1], 2D; */
-      _mesa_init_instructions(inst + ic, 1);
-      inst[ic].Opcode = OPCODE_TEX;
-      inst[ic].DstReg.File = PROGRAM_TEMPORARY;
-      inst[ic].DstReg.Index = temp;
-      inst[ic].DstReg.WriteMask = WRITEMASK_ZW; /* write B,A */
-      inst[ic].SrcReg[0].File = PROGRAM_TEMPORARY;
-      inst[ic].SrcReg[0].Index = colorTemp;
-      inst[ic].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_W,
-                                                 SWIZZLE_Z, SWIZZLE_W);
-      inst[ic].TexSrcUnit = 1;
-      inst[ic].TexSrcTarget = TEXTURE_2D_INDEX;
-      ic++;
-
-      /* MOV colorTemp, temp; */
-      _mesa_init_instructions(inst + ic, 1);
-      inst[ic].Opcode = OPCODE_MOV;
-      inst[ic].DstReg.File = PROGRAM_TEMPORARY;
-      inst[ic].DstReg.Index = colorTemp;
-      inst[ic].SrcReg[0].File = PROGRAM_TEMPORARY;
-      inst[ic].SrcReg[0].Index = temp;
-      ic++;
-
-      fp->Base.SamplersUsed |= (1 << 1);  /* sampler 1 is used */
-   }
-
-   /* Modify last instruction's dst reg to write to result.color */
-   {
-      struct prog_instruction *last = &inst[ic - 1];
-      last->DstReg.File = PROGRAM_OUTPUT;
-      last->DstReg.Index = FRAG_RESULT_COLOR;
-   }
-
-   /* END; */
-   _mesa_init_instructions(inst + ic, 1);
-   inst[ic].Opcode = OPCODE_END;
-   ic++;
-
-   assert(ic <= MAX_INST);
-
-
-   fp->Base.Instructions = _mesa_alloc_instructions(ic);
-   if (!fp->Base.Instructions) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY,
-                  "generating pixel transfer program");
-      _mesa_free_parameter_list(params);
-      return NULL;
-   }
-
-   _mesa_copy_instructions(fp->Base.Instructions, inst, ic);
-   fp->Base.NumInstructions = ic;
-   fp->Base.Parameters = params;
-
-#if 0
-   printf("========= pixel transfer prog\n");
-   _mesa_print_program(&fp->Base);
-   _mesa_print_parameter_list(fp->Base.Parameters);
-#endif
-
-   return fp;
-}
-
-
-
-/**
- * Update st->pixel_xfer.program in response to new pixel-transfer state.
- */
-static void
-update_pixel_transfer(struct st_context *st)
-{
-   struct gl_context *ctx = st->ctx;
-   struct state_key key;
-   struct gl_fragment_program *fp;
-
-   make_state_key(st->ctx, &key);
-
-   fp = (struct gl_fragment_program *)
-      _mesa_search_program_cache(st->pixel_xfer.cache, &key, sizeof(key));
-   if (!fp) {
-      fp = get_pixel_transfer_program(st->ctx, &key);
-      _mesa_program_cache_insert(st->ctx, st->pixel_xfer.cache,
-                                 &key, sizeof(key), &fp->Base);
-   }
-
-   if (ctx->Pixel.MapColorFlag) {
       load_color_map_texture(ctx, st->pixel_xfer.pixelmap_texture);
    }
-   st->pixel_xfer.pixelmap_enabled = ctx->Pixel.MapColorFlag;
-
-   st->pixel_xfer.program = (struct st_fragment_program *) fp;
 }
 
 
-
 const struct st_tracked_state st_update_pixel_transfer = {
    "st_update_pixel_transfer",				/* name */
    {							/* dirty */
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index 230eba8c4a5..bb6dfe85644 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -108,151 +108,6 @@ struct bitmap_cache
 
 
 /**
- * Make fragment program for glBitmap:
- *   Sample the texture and kill the fragment if the bit is 0.
- * This program will be combined with the user's fragment program.
- */
-static struct st_fragment_program *
-make_bitmap_fragment_program(struct gl_context *ctx, GLuint samplerIndex)
-{
-   struct st_context *st = st_context(ctx);
-   struct st_fragment_program *stfp;
-   struct gl_program *p;
-   GLuint ic = 0;
-
-   p = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0);
-   if (!p)
-      return NULL;
-
-   p->NumInstructions = 3;
-
-   p->Instructions = _mesa_alloc_instructions(p->NumInstructions);
-   if (!p->Instructions) {
-      ctx->Driver.DeleteProgram(ctx, p);
-      return NULL;
-   }
-   _mesa_init_instructions(p->Instructions, p->NumInstructions);
-
-   /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */
-   p->Instructions[ic].Opcode = OPCODE_TEX;
-   p->Instructions[ic].DstReg.File = PROGRAM_TEMPORARY;
-   p->Instructions[ic].DstReg.Index = 0;
-   p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT;
-   p->Instructions[ic].SrcReg[0].Index = VARYING_SLOT_TEX0;
-   p->Instructions[ic].TexSrcUnit = samplerIndex;
-   p->Instructions[ic].TexSrcTarget = TEXTURE_2D_INDEX;
-   ic++;
-
-   /* KIL if -tmp0 < 0 # texel=0 -> keep / texel=0 -> discard */
-   p->Instructions[ic].Opcode = OPCODE_KIL;
-   p->Instructions[ic].SrcReg[0].File = PROGRAM_TEMPORARY;
-
-   if (st->bitmap.tex_format == PIPE_FORMAT_L8_UNORM)
-      p->Instructions[ic].SrcReg[0].Swizzle = SWIZZLE_XXXX;
-
-   p->Instructions[ic].SrcReg[0].Index = 0;
-   p->Instructions[ic].SrcReg[0].Negate = NEGATE_XYZW;
-   ic++;
-
-   /* END; */
-   p->Instructions[ic++].Opcode = OPCODE_END;
-
-   assert(ic == p->NumInstructions);
-
-   p->InputsRead = VARYING_BIT_TEX0;
-   p->OutputsWritten = 0x0;
-   p->SamplersUsed = (1 << samplerIndex);
-
-   stfp = (struct st_fragment_program *) p;
-   stfp->Base.UsesKill = GL_TRUE;
-
-   return stfp;
-}
-
-
-static struct gl_program *
-make_bitmap_fragment_program_glsl(struct st_context *st,
-                                  struct st_fragment_program *orig,
-                                  GLuint samplerIndex)
-{
-   struct gl_context *ctx = st->ctx;
-   struct st_fragment_program *fp = (struct st_fragment_program *)
-      ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0);
-
-   if (!fp)
-      return NULL;
-   
-   get_bitmap_visitor(fp, orig->glsl_to_tgsi, samplerIndex);
-   return &fp->Base.Base;
-}
-
-
-static int
-find_free_bit(uint bitfield)
-{
-   int i;
-   for (i = 0; i < 32; i++) {
-      if ((bitfield & (1 << i)) == 0) {
-         return i;
-      }
-   }
-   return -1;
-}
-
-
-/**
- * Combine basic bitmap fragment program with the user-defined program.
- * \param st  current context
- * \param fpIn  the incoming fragment program
- * \param fpOut  the new fragment program which does fragment culling
- * \param bitmap_sampler  sampler number for the bitmap texture
- */
-void
-st_make_bitmap_fragment_program(struct st_context *st,
-                                struct gl_fragment_program *fpIn,
-                                struct gl_fragment_program **fpOut,
-                                GLuint *bitmap_sampler)
-{
-   struct st_fragment_program *bitmap_prog;
-   struct st_fragment_program *stfpIn = (struct st_fragment_program *) fpIn;
-   struct gl_program *newProg;
-   uint sampler;
-
-   /*
-    * Generate new program which is the user-defined program prefixed
-    * with the bitmap sampler/kill instructions.
-    */
-   sampler = find_free_bit(fpIn->Base.SamplersUsed);
-   
-   if (stfpIn->glsl_to_tgsi)
-      newProg = make_bitmap_fragment_program_glsl(st, stfpIn, sampler);
-   else {
-      bitmap_prog = make_bitmap_fragment_program(st->ctx, sampler);
-
-      newProg = _mesa_combine_programs(st->ctx,
-                                       &bitmap_prog->Base.Base,
-                                       &fpIn->Base);
-      /* done with this after combining */
-      st_reference_fragprog(st, &bitmap_prog, NULL);
-   }
-
-#if 0
-   {
-      printf("Combined bitmap program:\n");
-      _mesa_print_program(newProg);
-      printf("InputsRead: 0x%x\n", newProg->InputsRead);
-      printf("OutputsWritten: 0x%x\n", newProg->OutputsWritten);
-      _mesa_print_parameter_list(newProg->Parameters);
-   }
-#endif
-
-   /* return results */
-   *fpOut = (struct gl_fragment_program *) newProg;
-   *bitmap_sampler = sampler;
-}
-
-
-/**
  * Copy user-provide bitmap bits into texture buffer, expanding
  * bits into texels.
  * "On" bits will set texels to 0x0.
diff --git a/src/mesa/state_tracker/st_cb_bitmap.h b/src/mesa/state_tracker/st_cb_bitmap.h
index b4254ca1eeb..dc7e5cb5c9e 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.h
+++ b/src/mesa/state_tracker/st_cb_bitmap.h
@@ -31,6 +31,7 @@
 
 
 #include "main/compiler.h"
+#include <stdbool.h>
 
 struct dd_function_table;
 struct st_context;
@@ -47,13 +48,11 @@ extern void
 st_destroy_bitmap(struct st_context *st);
 
 extern void
-st_make_bitmap_fragment_program(struct st_context *st,
-                                struct gl_fragment_program *fpIn,
-                                struct gl_fragment_program **fpOut,
-                                GLuint *bitmap_sampler);
-
-extern void
 st_flush_bitmap_cache(struct st_context *st);
 
+extern const struct tgsi_token *
+st_get_bitmap_shader(const struct tgsi_token *tokens,
+                     unsigned sampler_index,
+                     bool use_texcoord, bool swizzle_xxxx);
 
 #endif /* ST_CB_BITMAP_H */
diff --git a/src/mesa/state_tracker/st_cb_bitmap_shader.c b/src/mesa/state_tracker/st_cb_bitmap_shader.c
new file mode 100644
index 00000000000..cddea36d4f6
--- /dev/null
+++ b/src/mesa/state_tracker/st_cb_bitmap_shader.c
@@ -0,0 +1,174 @@
+/**************************************************************************
+ *
+ * Copyright (C) 2015 Advanced Micro Devices, Inc.
+ * Copyright 2007 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "st_cb_bitmap.h"
+#include "tgsi/tgsi_transform.h"
+#include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_dump.h"
+#include "util/u_debug.h"
+
+struct tgsi_bitmap_transform {
+   struct tgsi_transform_context base;
+   struct tgsi_shader_info info;
+   unsigned sampler_index;
+   bool use_texcoord;
+   bool swizzle_xxxx;
+   bool first_instruction_emitted;
+};
+
+static inline struct tgsi_bitmap_transform *
+tgsi_bitmap_transform(struct tgsi_transform_context *tctx)
+{
+   return (struct tgsi_bitmap_transform *)tctx;
+}
+
+static void
+transform_instr(struct tgsi_transform_context *tctx,
+		struct tgsi_full_instruction *current_inst)
+{
+   struct tgsi_bitmap_transform *ctx = tgsi_bitmap_transform(tctx);
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+   unsigned i, semantic;
+   int texcoord_index = -1;
+
+   if (ctx->first_instruction_emitted) {
+      tctx->emit_instruction(tctx, current_inst);
+      return;
+   }
+
+   ctx->first_instruction_emitted = true;
+
+   /* Add TEMP[0] if it's missing. */
+   if (ctx->info.file_max[TGSI_FILE_TEMPORARY] == -1) {
+      decl = tgsi_default_full_declaration();
+      decl.Declaration.File = TGSI_FILE_TEMPORARY;
+      tctx->emit_declaration(tctx, &decl);
+   }
+
+   /* Add the TEXCOORD[0] (or GENERIC[0]) input if it's missing. */
+   semantic = ctx->use_texcoord ? TGSI_SEMANTIC_TEXCOORD :
+                                  TGSI_SEMANTIC_GENERIC;
+   for (i = 0; i < ctx->info.num_inputs; i++) {
+      if (ctx->info.input_semantic_name[i] == semantic &&
+          ctx->info.input_semantic_index[i] == 0) {
+         texcoord_index = i;
+         break;
+      }
+   }
+
+   if (texcoord_index == -1) {
+      decl = tgsi_default_full_declaration();
+      decl.Declaration.File = TGSI_FILE_INPUT;
+      decl.Declaration.Semantic = 1;
+      decl.Semantic.Name = semantic;
+      decl.Declaration.Interpolate = 1;
+      decl.Interp.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
+      decl.Range.First = decl.Range.Last = ctx->info.num_inputs;
+      texcoord_index = ctx->info.num_inputs;
+      tctx->emit_declaration(tctx, &decl);
+   }
+
+   /* Declare the sampler. */
+   decl = tgsi_default_full_declaration();
+   decl.Declaration.File = TGSI_FILE_SAMPLER;
+   decl.Range.First = decl.Range.Last = ctx->sampler_index;
+   tctx->emit_declaration(tctx, &decl);
+
+   /* TEX tmp0, fragment.texcoord[0], texture[sampler_index], 2D; */
+   inst = tgsi_default_full_instruction();
+   inst.Instruction.Opcode = TGSI_OPCODE_TEX;
+   inst.Instruction.Texture = 1;
+   inst.Texture.Texture = TGSI_TEXTURE_2D;
+
+   inst.Instruction.NumDstRegs = 1;
+   inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+   inst.Dst[0].Register.Index = 0;
+   inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+
+   inst.Instruction.NumSrcRegs = 2;
+   inst.Src[0].Register.File  = TGSI_FILE_INPUT;
+   inst.Src[0].Register.Index = texcoord_index;
+   inst.Src[0].Register.SwizzleX = TGSI_SWIZZLE_X;
+   inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_Y;
+   inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_Z;
+   inst.Src[0].Register.SwizzleW = TGSI_SWIZZLE_W;
+   inst.Src[1].Register.File  = TGSI_FILE_SAMPLER;
+   inst.Src[1].Register.Index = ctx->sampler_index;
+
+   tctx->emit_instruction(tctx, &inst);
+
+   /* KIL if -tmp0 < 0 # texel=0 -> keep / texel>0 -> discard */
+   inst = tgsi_default_full_instruction();
+   inst.Instruction.Opcode = TGSI_OPCODE_KILL_IF;
+   inst.Instruction.NumDstRegs = 0;
+   inst.Instruction.NumSrcRegs = 1;
+
+   inst.Src[0].Register.File  = TGSI_FILE_TEMPORARY;
+   inst.Src[0].Register.Index = 0;
+   inst.Src[0].Register.Negate = 1;
+   inst.Src[0].Register.SwizzleX = TGSI_SWIZZLE_X;
+   if (ctx->swizzle_xxxx) {
+      inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_X;
+      inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_X;
+      inst.Src[0].Register.SwizzleW = TGSI_SWIZZLE_X;
+   } else {
+      inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_Y;
+      inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_Z;
+      inst.Src[0].Register.SwizzleW = TGSI_SWIZZLE_W;
+   }
+   tctx->emit_instruction(tctx, &inst);
+
+   /* And emit the instruction we got. */
+   tctx->emit_instruction(tctx, current_inst);
+}
+
+const struct tgsi_token *
+st_get_bitmap_shader(const struct tgsi_token *tokens,
+                     unsigned sampler_index,
+                     bool use_texcoord, bool swizzle_xxxx)
+{
+   struct tgsi_bitmap_transform ctx;
+   struct tgsi_token *newtoks;
+   int newlen;
+
+   memset(&ctx, 0, sizeof(ctx));
+   ctx.base.transform_instruction = transform_instr;
+   ctx.sampler_index = sampler_index;
+   ctx.use_texcoord = use_texcoord;
+   ctx.swizzle_xxxx = swizzle_xxxx;
+   tgsi_scan_shader(tokens, &ctx.info);
+
+   newlen = tgsi_num_tokens(tokens) + 20;
+   newtoks = tgsi_alloc_tokens(newlen);
+   if (!newtoks)
+      return NULL;
+
+   tgsi_transform_shader(tokens, newtoks, newlen, &ctx.base);
+   return newtoks;
+}
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 152160e1dd2..7e8633edc1a 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -72,217 +72,74 @@
 
 
 /**
- * Check if the given program is:
- * 0: MOVE result.color, fragment.color;
- * 1: END;
- */
-static GLboolean
-is_passthrough_program(const struct gl_fragment_program *prog)
-{
-   if (prog->Base.NumInstructions == 2) {
-      const struct prog_instruction *inst = prog->Base.Instructions;
-      if (inst[0].Opcode == OPCODE_MOV &&
-          inst[1].Opcode == OPCODE_END &&
-          inst[0].DstReg.File == PROGRAM_OUTPUT &&
-          inst[0].DstReg.Index == FRAG_RESULT_COLOR &&
-          inst[0].DstReg.WriteMask == WRITEMASK_XYZW &&
-          inst[0].SrcReg[0].File == PROGRAM_INPUT &&
-          inst[0].SrcReg[0].Index == VARYING_SLOT_COL0 &&
-          inst[0].SrcReg[0].Swizzle == SWIZZLE_XYZW) {
-         return GL_TRUE;
-      }
-   }
-   return GL_FALSE;
-}
-
-
-/**
- * Returns a fragment program which implements the current pixel transfer ops.
- */
-static struct gl_fragment_program *
-get_glsl_pixel_transfer_program(struct st_context *st,
-                                struct st_fragment_program *orig)
-{
-   int pixelMaps = 0, scaleAndBias = 0;
-   struct gl_context *ctx = st->ctx;
-   struct st_fragment_program *fp = (struct st_fragment_program *)
-      ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0);
-
-   if (!fp)
-      return NULL;
-
-   if (ctx->Pixel.RedBias != 0.0 || ctx->Pixel.RedScale != 1.0 ||
-       ctx->Pixel.GreenBias != 0.0 || ctx->Pixel.GreenScale != 1.0 ||
-       ctx->Pixel.BlueBias != 0.0 || ctx->Pixel.BlueScale != 1.0 ||
-       ctx->Pixel.AlphaBias != 0.0 || ctx->Pixel.AlphaScale != 1.0) {
-      scaleAndBias = 1;
-   }
-
-   pixelMaps = ctx->Pixel.MapColorFlag;
-
-   if (pixelMaps) {
-      /* create the colormap/texture now if not already done */
-      if (!st->pixel_xfer.pixelmap_texture) {
-         st->pixel_xfer.pixelmap_texture = st_create_color_map_texture(ctx);
-         st->pixel_xfer.pixelmap_sampler_view =
-            st_create_texture_sampler_view(st->pipe,
-                                           st->pixel_xfer.pixelmap_texture);
-      }
-   }
-
-   get_pixel_transfer_visitor(fp, orig->glsl_to_tgsi,
-                              scaleAndBias, pixelMaps);
-
-   return &fp->Base;
-}
-
-
-/**
- * Make fragment shader for glDraw/CopyPixels.  This shader is made
- * by combining the pixel transfer shader with the user-defined shader.
- * \param fpIn  the current/incoming fragment program
- * \param fpOut  returns the combined fragment program
- */
-void
-st_make_drawpix_fragment_program(struct st_context *st,
-                                 struct gl_fragment_program *fpIn,
-                                 struct gl_fragment_program **fpOut)
-{
-   struct gl_program *newProg;
-   struct st_fragment_program *stfp = (struct st_fragment_program *) fpIn;
-
-   if (is_passthrough_program(fpIn)) {
-      newProg = (struct gl_program *) _mesa_clone_fragment_program(st->ctx,
-                                             &st->pixel_xfer.program->Base);
-   }
-   else if (stfp->glsl_to_tgsi != NULL) {
-      newProg = (struct gl_program *) get_glsl_pixel_transfer_program(st, stfp);
-   }
-   else {
-#if 0
-      /* debug */
-      printf("Base program:\n");
-      _mesa_print_program(&fpIn->Base);
-      printf("DrawPix program:\n");
-      _mesa_print_program(&st->pixel_xfer.program->Base.Base);
-#endif
-      newProg = _mesa_combine_programs(st->ctx,
-                                       &st->pixel_xfer.program->Base.Base,
-                                       &fpIn->Base);
-   }
-
-#if 0
-   /* debug */
-   printf("Combined DrawPixels program:\n");
-   _mesa_print_program(newProg);
-   printf("InputsRead: 0x%x\n", newProg->InputsRead);
-   printf("OutputsWritten: 0x%x\n", newProg->OutputsWritten);
-   _mesa_print_parameter_list(newProg->Parameters);
-#endif
-
-   *fpOut = (struct gl_fragment_program *) newProg;
-}
-
-
-/**
  * Create fragment program that does a TEX() instruction to get a Z and/or
  * stencil value value, then writes to FRAG_RESULT_DEPTH/FRAG_RESULT_STENCIL.
  * Used for glDrawPixels(GL_DEPTH_COMPONENT / GL_STENCIL_INDEX).
  * Pass fragment color through as-is.
- * \return pointer to the gl_fragment program
+ *
+ * \return CSO of the fragment shader.
  */
-struct gl_fragment_program *
-st_make_drawpix_z_stencil_program(struct st_context *st,
-                                  GLboolean write_depth,
-                                  GLboolean write_stencil)
+static void *
+get_drawpix_z_stencil_program(struct st_context *st,
+                              GLboolean write_depth,
+                              GLboolean write_stencil)
 {
-   struct gl_context *ctx = st->ctx;
-   struct gl_program *p;
-   struct gl_fragment_program *fp;
-   GLuint ic = 0;
+   struct ureg_program *ureg;
+   struct ureg_src depth_sampler, stencil_sampler;
+   struct ureg_src texcoord, color;
+   struct ureg_dst out_color, out_depth, out_stencil;
    const GLuint shaderIndex = write_depth * 2 + write_stencil;
+   void *cso;
 
-   assert(shaderIndex < ARRAY_SIZE(st->drawpix.shaders));
+   assert(shaderIndex < ARRAY_SIZE(st->drawpix.zs_shaders));
 
-   if (st->drawpix.shaders[shaderIndex]) {
+   if (st->drawpix.zs_shaders[shaderIndex]) {
       /* already have the proper shader */
-      return st->drawpix.shaders[shaderIndex];
+      return st->drawpix.zs_shaders[shaderIndex];
    }
 
-   /*
-    * Create shader now
-    */
-   p = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0);
-   if (!p)
+   ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT);
+   if (ureg == NULL)
       return NULL;
 
-   p->NumInstructions = write_depth ? 3 : 1;
-   p->NumInstructions += write_stencil ? 1 : 0;
-
-   p->Instructions = _mesa_alloc_instructions(p->NumInstructions);
-   if (!p->Instructions) {
-      ctx->Driver.DeleteProgram(ctx, p);
-      return NULL;
-   }
-   _mesa_init_instructions(p->Instructions, p->NumInstructions);
+   ureg_property(ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, TRUE);
 
    if (write_depth) {
-      /* TEX result.depth, fragment.texcoord[0], texture[0], 2D; */
-      p->Instructions[ic].Opcode = OPCODE_TEX;
-      p->Instructions[ic].DstReg.File = PROGRAM_OUTPUT;
-      p->Instructions[ic].DstReg.Index = FRAG_RESULT_DEPTH;
-      p->Instructions[ic].DstReg.WriteMask = WRITEMASK_Z;
-      p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT;
-      p->Instructions[ic].SrcReg[0].Index = VARYING_SLOT_TEX0;
-      p->Instructions[ic].TexSrcUnit = 0;
-      p->Instructions[ic].TexSrcTarget = TEXTURE_2D_INDEX;
-      ic++;
-      /* MOV result.color, fragment.color; */
-      p->Instructions[ic].Opcode = OPCODE_MOV;
-      p->Instructions[ic].DstReg.File = PROGRAM_OUTPUT;
-      p->Instructions[ic].DstReg.Index = FRAG_RESULT_COLOR;
-      p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT;
-      p->Instructions[ic].SrcReg[0].Index = VARYING_SLOT_COL0;
-      ic++;
+      color = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 0,
+                                 TGSI_INTERPOLATE_COLOR);
+      out_color = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
+
+      depth_sampler = ureg_DECL_sampler(ureg, 0);
+      out_depth = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0);
    }
 
    if (write_stencil) {
-      /* TEX result.stencil, fragment.texcoord[0], texture[0], 2D; */
-      p->Instructions[ic].Opcode = OPCODE_TEX;
-      p->Instructions[ic].DstReg.File = PROGRAM_OUTPUT;
-      p->Instructions[ic].DstReg.Index = FRAG_RESULT_STENCIL;
-      p->Instructions[ic].DstReg.WriteMask = WRITEMASK_Y;
-      p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT;
-      p->Instructions[ic].SrcReg[0].Index = VARYING_SLOT_TEX0;
-      p->Instructions[ic].TexSrcUnit = 1;
-      p->Instructions[ic].TexSrcTarget = TEXTURE_2D_INDEX;
-      ic++;
+      stencil_sampler = ureg_DECL_sampler(ureg, 1);
+      out_stencil = ureg_DECL_output(ureg, TGSI_SEMANTIC_STENCIL, 0);
    }
 
-   /* END; */
-   p->Instructions[ic++].Opcode = OPCODE_END;
-
-   assert(ic == p->NumInstructions);
+   texcoord = ureg_DECL_fs_input(ureg,
+                                 st->needs_texcoord_semantic ?
+                                    TGSI_SEMANTIC_TEXCOORD :
+                                    TGSI_SEMANTIC_GENERIC,
+                                 0, TGSI_INTERPOLATE_LINEAR);
 
-   p->InputsRead = VARYING_BIT_TEX0 | VARYING_BIT_COL0;
-   p->OutputsWritten = 0;
    if (write_depth) {
-      p->OutputsWritten |= BITFIELD64_BIT(FRAG_RESULT_DEPTH);
-      p->OutputsWritten |= BITFIELD64_BIT(FRAG_RESULT_COLOR);
+      ureg_TEX(ureg, ureg_writemask(out_depth, TGSI_WRITEMASK_Z),
+               TGSI_TEXTURE_2D, texcoord, depth_sampler);
+      ureg_MOV(ureg, out_color, color);
    }
-   if (write_stencil)
-      p->OutputsWritten |= BITFIELD64_BIT(FRAG_RESULT_STENCIL);
 
-   p->SamplersUsed =  0x1;  /* sampler 0 (bit 0) is used */
    if (write_stencil)
-      p->SamplersUsed |= 1 << 1;
+      ureg_TEX(ureg, ureg_writemask(out_stencil, TGSI_WRITEMASK_Y),
+               TGSI_TEXTURE_2D, texcoord, stencil_sampler);
 
-   fp = (struct gl_fragment_program *) p;
+   ureg_END(ureg);
+   cso = ureg_create_shader_and_destroy(ureg, st->pipe);
 
    /* save the new shader */
-   st->drawpix.shaders[shaderIndex] = fp;
-
-   return fp;
+   st->drawpix.zs_shaders[shaderIndex] = cso;
+   return cso;
 }
 
 
@@ -668,6 +525,7 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
                    int num_sampler_view,
                    void *driver_vp,
                    void *driver_fp,
+                   struct st_fp_variant *fpv,
                    const GLfloat *color,
                    GLboolean invertTex,
                    GLboolean write_depth, GLboolean write_stencil)
@@ -755,10 +613,9 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    cso_set_tesseval_shader_handle(cso, NULL);
    cso_set_geometry_shader_handle(cso, NULL);
 
-   /* texture sampling state: */
+   /* user samplers, plus the drawpix samplers */
    {
       struct pipe_sampler_state sampler;
-      const struct pipe_sampler_state *states[2] = {&sampler, &sampler};
 
       memset(&sampler, 0, sizeof(sampler));
       sampler.wrap_s = PIPE_TEX_WRAP_CLAMP;
@@ -769,8 +626,25 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
       sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
       sampler.normalized_coords = normalized;
 
-      cso_set_samplers(cso, PIPE_SHADER_FRAGMENT,
-                       num_sampler_view > 1 ? 2 : 1, states);
+      if (fpv) {
+         const struct pipe_sampler_state *samplers[PIPE_MAX_SAMPLERS];
+         uint num = MAX2(MAX2(fpv->drawpix_sampler, fpv->pixelmap_sampler) + 1,
+                         st->state.num_samplers[PIPE_SHADER_FRAGMENT]);
+         uint i;
+
+         for (i = 0; i < st->state.num_samplers[PIPE_SHADER_FRAGMENT]; i++)
+            samplers[i] = &st->state.samplers[PIPE_SHADER_FRAGMENT][i];
+
+         samplers[fpv->drawpix_sampler] = &sampler;
+         if (sv[1])
+            samplers[fpv->pixelmap_sampler] = &sampler;
+
+         cso_set_samplers(cso, PIPE_SHADER_FRAGMENT, num, samplers);
+      } else {
+         const struct pipe_sampler_state *samplers[2] = {&sampler, &sampler};
+
+         cso_set_samplers(cso, PIPE_SHADER_FRAGMENT, num_sampler_view, samplers);
+      }
    }
 
    /* viewport state: viewport matching window dims */
@@ -790,8 +664,21 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    cso_set_vertex_elements(cso, 3, st->velems_util_draw);
    cso_set_stream_outputs(st->cso_context, 0, NULL, NULL);
 
-   /* texture state: */
-   cso_set_sampler_views(cso, PIPE_SHADER_FRAGMENT, num_sampler_view, sv);
+   /* user textures, plus the drawpix textures */
+   if (fpv) {
+      struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS];
+      uint num = MAX2(MAX2(fpv->drawpix_sampler, fpv->pixelmap_sampler) + 1,
+                      st->state.num_sampler_views[PIPE_SHADER_FRAGMENT]);
+
+      memcpy(sampler_views, st->state.sampler_views[PIPE_SHADER_FRAGMENT],
+             sizeof(sampler_views));
+
+      sampler_views[fpv->drawpix_sampler] = sv[0];
+      if (sv[1])
+         sampler_views[fpv->pixelmap_sampler] = sv[1];
+      cso_set_sampler_views(cso, PIPE_SHADER_FRAGMENT, num, sampler_views);
+   } else
+      cso_set_sampler_views(cso, PIPE_SHADER_FRAGMENT, num_sampler_view, sv);
 
    /* Compute Gallium window coords (y=0=top) with pixel zoom.
     * Recall that these coords are transformed by the current
@@ -1048,30 +935,6 @@ get_color_fp_variant(struct st_context *st)
 
 
 /**
- * Get fragment program variant for a glDrawPixels or glCopyPixels
- * command for depth/stencil data.
- */
-static struct st_fp_variant *
-get_depth_stencil_fp_variant(struct st_context *st, GLboolean write_depth,
-                             GLboolean write_stencil)
-{
-   struct st_fp_variant_key key;
-   struct st_fp_variant *fpv;
-
-   memset(&key, 0, sizeof(key));
-
-   key.st = st;
-   key.drawpixels = 1;
-   key.drawpixels_z = write_depth;
-   key.drawpixels_stencil = write_stencil;
-
-   fpv = st_get_fp_variant(st, st->fp, &key);
-
-   return fpv;
-}
-
-
-/**
  * Clamp glDrawPixels width and height to the maximum texture size.
  */
 static void
@@ -1109,8 +972,8 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y,
    GLboolean write_stencil = GL_FALSE, write_depth = GL_FALSE;
    struct pipe_sampler_view *sv[2] = { NULL };
    int num_sampler_view = 1;
-   struct st_fp_variant *fpv;
    struct gl_pixelstore_attrib clippedUnpack;
+   struct st_fp_variant *fpv = NULL;
 
    /* Mesa state should be up to date by now */
    assert(ctx->NewState == 0x0);
@@ -1144,31 +1007,27 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y,
     * Get vertex/fragment shaders
     */
    if (write_depth || write_stencil) {
-      fpv = get_depth_stencil_fp_variant(st, write_depth, write_stencil);
-
-      driver_fp = fpv->driver_shader;
-
+      driver_fp = get_drawpix_z_stencil_program(st, write_depth,
+                                                write_stencil);
       driver_vp = make_passthrough_vertex_shader(st, GL_TRUE);
-
       color = ctx->Current.RasterColor;
    }
    else {
       fpv = get_color_fp_variant(st);
 
       driver_fp = fpv->driver_shader;
-
       driver_vp = make_passthrough_vertex_shader(st, GL_FALSE);
 
       color = NULL;
-      if (st->pixel_xfer.pixelmap_enabled) {
+      if (ctx->Pixel.MapColorFlag) {
          pipe_sampler_view_reference(&sv[1],
                                      st->pixel_xfer.pixelmap_sampler_view);
          num_sampler_view++;
       }
-   }
 
-   /* update fragment program constants */
-   st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT);
+      /* update fragment program constants */
+      st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT);
+   }
 
    /* draw with textured quad */
    {
@@ -1197,7 +1056,7 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y,
                                sv,
                                num_sampler_view,
                                driver_vp,
-                               driver_fp,
+                               driver_fp, fpv,
                                color, GL_FALSE, write_depth, write_stencil);
             pipe_sampler_view_reference(&sv[0], NULL);
             if (num_sampler_view > 1)
@@ -1452,6 +1311,7 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy,
    void *driver_vp, *driver_fp;
    struct pipe_resource *pt;
    struct pipe_sampler_view *sv[2] = { NULL };
+   struct st_fp_variant *fpv = NULL;
    int num_sampler_view = 1;
    GLfloat *color;
    enum pipe_format srcFormat;
@@ -1459,7 +1319,6 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy,
    GLboolean invertTex = GL_FALSE;
    GLint readX, readY, readW, readH;
    struct gl_pixelstore_attrib pack = ctx->DefaultPacking;
-   struct st_fp_variant *fpv;
 
    st_validate_state(st);
 
@@ -1491,19 +1350,22 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy,
     * Get vertex/fragment shaders
     */
    if (type == GL_COLOR) {
+      fpv = get_color_fp_variant(st);
+
       rbRead = st_get_color_read_renderbuffer(ctx);
       color = NULL;
 
-      fpv = get_color_fp_variant(st);
       driver_fp = fpv->driver_shader;
-
       driver_vp = make_passthrough_vertex_shader(st, GL_FALSE);
 
-      if (st->pixel_xfer.pixelmap_enabled) {
+      if (ctx->Pixel.MapColorFlag) {
          pipe_sampler_view_reference(&sv[1],
                                      st->pixel_xfer.pixelmap_sampler_view);
          num_sampler_view++;
       }
+
+      /* update fragment program constants */
+      st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT);
    }
    else {
       assert(type == GL_DEPTH);
@@ -1511,15 +1373,10 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy,
                                Attachment[BUFFER_DEPTH].Renderbuffer);
       color = ctx->Current.Attrib[VERT_ATTRIB_COLOR0];
 
-      fpv = get_depth_stencil_fp_variant(st, GL_TRUE, GL_FALSE);
-      driver_fp = fpv->driver_shader;
-
+      driver_fp = get_drawpix_z_stencil_program(st, GL_TRUE, GL_FALSE);
       driver_vp = make_passthrough_vertex_shader(st, GL_TRUE);
    }
 
-   /* update fragment program constants */
-   st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT);
-
    /* Choose the format for the temporary texture. */
    srcFormat = rbRead->texture->format;
    srcBind = PIPE_BIND_SAMPLER_VIEW |
@@ -1645,7 +1502,7 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy,
                       sv,
                       num_sampler_view,
                       driver_vp, 
-                      driver_fp,
+                      driver_fp, fpv,
                       color, invertTex, GL_FALSE, GL_FALSE);
 
    pipe_resource_reference(&pt, NULL);
@@ -1666,12 +1523,12 @@ st_destroy_drawpix(struct st_context *st)
 {
    GLuint i;
 
-   for (i = 0; i < ARRAY_SIZE(st->drawpix.shaders); i++) {
-      if (st->drawpix.shaders[i])
-         _mesa_reference_fragprog(st->ctx, &st->drawpix.shaders[i], NULL);
+   for (i = 0; i < ARRAY_SIZE(st->drawpix.zs_shaders); i++) {
+      if (st->drawpix.zs_shaders[i])
+         cso_delete_fragment_shader(st->cso_context,
+                                    st->drawpix.zs_shaders[i]);
    }
 
-   st_reference_fragprog(st, &st->pixel_xfer.combined_prog, NULL);
    if (st->drawpix.vert_shaders[0])
       cso_delete_vertex_shader(st->cso_context, st->drawpix.vert_shaders[0]);
    if (st->drawpix.vert_shaders[1])
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.h b/src/mesa/state_tracker/st_cb_drawpixels.h
index c707ace2f9f..f1fb32dd6cf 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.h
+++ b/src/mesa/state_tracker/st_cb_drawpixels.h
@@ -31,6 +31,7 @@
 
 
 #include "main/compiler.h"
+#include <stdbool.h>
 
 struct dd_function_table;
 struct st_context;
@@ -40,15 +41,11 @@ extern void st_init_drawpixels_functions(struct dd_function_table *functions);
 extern void
 st_destroy_drawpix(struct st_context *st);
 
-extern void
-st_make_drawpix_fragment_program(struct st_context *st,
-                                 struct gl_fragment_program *fpIn,
-                                 struct gl_fragment_program **fpOut);
-
-extern struct gl_fragment_program *
-st_make_drawpix_z_stencil_program(struct st_context *st,
-                                  GLboolean write_depth,
-                                  GLboolean write_stencil);
-
+extern const struct tgsi_token *
+st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord,
+                      bool scale_and_bias, unsigned scale_const,
+                      unsigned bias_const, bool pixel_maps,
+                      unsigned drawpix_sampler, unsigned pixelmap_sampler,
+                      unsigned texcoord_const);
 
 #endif /* ST_CB_DRAWPIXELS_H */
diff --git a/src/mesa/state_tracker/st_cb_drawpixels_shader.c b/src/mesa/state_tracker/st_cb_drawpixels_shader.c
new file mode 100644
index 00000000000..749b46cfbf7
--- /dev/null
+++ b/src/mesa/state_tracker/st_cb_drawpixels_shader.c
@@ -0,0 +1,278 @@
+/**************************************************************************
+ *
+ * Copyright (C) 2015 Advanced Micro Devices, Inc.
+ * Copyright 2007 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "st_cb_drawpixels.h"
+#include "tgsi/tgsi_transform.h"
+#include "tgsi/tgsi_scan.h"
+
+struct tgsi_drawpix_transform {
+   struct tgsi_transform_context base;
+   struct tgsi_shader_info info;
+   bool use_texcoord;
+   bool scale_and_bias;
+   bool pixel_maps;
+   bool first_instruction_emitted;
+   unsigned scale_const;
+   unsigned bias_const;
+   unsigned color_temp;
+   unsigned drawpix_sampler;
+   unsigned pixelmap_sampler;
+   unsigned texcoord_const;
+};
+
+static inline struct tgsi_drawpix_transform *
+tgsi_drawpix_transform(struct tgsi_transform_context *tctx)
+{
+   return (struct tgsi_drawpix_transform *)tctx;
+}
+
+static void
+set_src(struct tgsi_full_instruction *inst, unsigned i, unsigned file, unsigned index,
+        unsigned x, unsigned y, unsigned z, unsigned w)
+{
+   inst->Src[i].Register.File  = file;
+   inst->Src[i].Register.Index = index;
+   inst->Src[i].Register.SwizzleX = x;
+   inst->Src[i].Register.SwizzleY = y;
+   inst->Src[i].Register.SwizzleZ = z;
+   inst->Src[i].Register.SwizzleW = w;
+}
+
+#define SET_SRC(inst, i, file, index, x, y, z, w) \
+   set_src(inst, i, file, index, TGSI_SWIZZLE_##x, TGSI_SWIZZLE_##y, \
+           TGSI_SWIZZLE_##z, TGSI_SWIZZLE_##w)
+
+static void
+transform_instr(struct tgsi_transform_context *tctx,
+		struct tgsi_full_instruction *current_inst)
+{
+   struct tgsi_drawpix_transform *ctx = tgsi_drawpix_transform(tctx);
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+   unsigned i, sem_texcoord = ctx->use_texcoord ? TGSI_SEMANTIC_TEXCOORD :
+                                                  TGSI_SEMANTIC_GENERIC;
+   int texcoord_index = -1;
+
+   if (ctx->first_instruction_emitted)
+      goto transform_inst;
+
+   ctx->first_instruction_emitted = true;
+
+   /* Add scale and bias constants. */
+   if (ctx->scale_and_bias) {
+      if (ctx->info.const_file_max[0] < (int)ctx->scale_const) {
+         decl = tgsi_default_full_declaration();
+         decl.Declaration.File = TGSI_FILE_CONSTANT;
+         decl.Range.First = decl.Range.Last = ctx->scale_const;
+         tctx->emit_declaration(tctx, &decl);
+      }
+
+      if (ctx->info.const_file_max[0] < (int)ctx->bias_const) {
+         decl = tgsi_default_full_declaration();
+         decl.Declaration.File = TGSI_FILE_CONSTANT;
+         decl.Range.First = decl.Range.Last = ctx->bias_const;
+         tctx->emit_declaration(tctx, &decl);
+      }
+   }
+
+   if (ctx->info.const_file_max[0] < (int)ctx->texcoord_const) {
+      decl = tgsi_default_full_declaration();
+      decl.Declaration.File = TGSI_FILE_CONSTANT;
+      decl.Range.First = decl.Range.Last = ctx->texcoord_const;
+      tctx->emit_declaration(tctx, &decl);
+   }
+
+   /* Add a new temp. */
+   ctx->color_temp = ctx->info.file_max[TGSI_FILE_TEMPORARY] + 1;
+   decl = tgsi_default_full_declaration();
+   decl.Declaration.File = TGSI_FILE_TEMPORARY;
+   decl.Range.First = decl.Range.Last = ctx->color_temp;
+   tctx->emit_declaration(tctx, &decl);
+
+   /* Add the TEXCOORD[0] (or GENERIC[0]) input if it's missing. */
+   for (i = 0; i < ctx->info.num_inputs; i++) {
+      if (ctx->info.input_semantic_name[i] == sem_texcoord &&
+          ctx->info.input_semantic_index[i] == 0) {
+         texcoord_index = i;
+         break;
+      }
+   }
+
+   if (texcoord_index == -1) {
+      decl = tgsi_default_full_declaration();
+      decl.Declaration.File = TGSI_FILE_INPUT;
+      decl.Declaration.Semantic = 1;
+      decl.Semantic.Name = sem_texcoord;
+      decl.Declaration.Interpolate = 1;
+      decl.Interp.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
+      decl.Range.First = decl.Range.Last = ctx->info.num_inputs;
+      texcoord_index = ctx->info.num_inputs;
+      tctx->emit_declaration(tctx, &decl);
+   }
+
+   /* Declare the drawpix sampler if it's missing. */
+   if (!(ctx->info.samplers_declared & (1 << ctx->drawpix_sampler))) {
+      decl = tgsi_default_full_declaration();
+      decl.Declaration.File = TGSI_FILE_SAMPLER;
+      decl.Range.First = decl.Range.Last = ctx->drawpix_sampler;
+      tctx->emit_declaration(tctx, &decl);
+   }
+
+   /* Declare the pixel map sampler if it's missing. */
+   if (ctx->pixel_maps &&
+       !(ctx->info.samplers_declared & (1 << ctx->pixelmap_sampler))) {
+      decl = tgsi_default_full_declaration();
+      decl.Declaration.File = TGSI_FILE_SAMPLER;
+      decl.Range.First = decl.Range.Last = ctx->pixelmap_sampler;
+      tctx->emit_declaration(tctx, &decl);
+   }
+
+   /* Get initial pixel color from the drawpix texture.
+    * TEX color_temp, IN[texcoord_index], SAMP[drawpix_sampler], 2D;
+    */
+   inst = tgsi_default_full_instruction();
+   inst.Instruction.Opcode = TGSI_OPCODE_TEX;
+   inst.Instruction.Texture = 1;
+   inst.Texture.Texture = TGSI_TEXTURE_2D;
+
+   inst.Instruction.NumDstRegs = 1;
+   inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+   inst.Dst[0].Register.Index = ctx->color_temp;
+   inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+
+   inst.Instruction.NumSrcRegs = 2;
+   SET_SRC(&inst, 0, TGSI_FILE_INPUT, texcoord_index, X, Y, Z, W);
+   inst.Src[1].Register.File  = TGSI_FILE_SAMPLER;
+   inst.Src[1].Register.Index = ctx->drawpix_sampler;
+
+   tctx->emit_instruction(tctx, &inst);
+
+   /* Apply the scale and bias. */
+   if (ctx->scale_and_bias) {
+      /* MAD temp, temp, scale, bias; */
+      inst = tgsi_default_full_instruction();
+      inst.Instruction.Opcode = TGSI_OPCODE_MAD;
+
+      inst.Instruction.NumDstRegs = 1;
+      inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+      inst.Dst[0].Register.Index = ctx->color_temp;
+      inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+
+      inst.Instruction.NumSrcRegs = 3;
+      SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->color_temp, X, Y, Z, W);
+      SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, ctx->scale_const, X, Y, Z, W);
+      SET_SRC(&inst, 2, TGSI_FILE_CONSTANT, ctx->bias_const, X, Y, Z, W);
+
+      tctx->emit_instruction(tctx, &inst);
+   }
+
+   if (ctx->pixel_maps) {
+      /* do four pixel map look-ups with two TEX instructions: */
+
+      /* TEX color_temp.xy, color_temp.xyyy, SAMP[pixelmap_sampler], 2D; */
+      inst = tgsi_default_full_instruction();
+      inst.Instruction.Opcode = TGSI_OPCODE_TEX;
+      inst.Instruction.Texture = 1;
+      inst.Texture.Texture = TGSI_TEXTURE_2D;
+
+      inst.Instruction.NumDstRegs = 1;
+      inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+      inst.Dst[0].Register.Index = ctx->color_temp;
+      inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XY;
+
+      inst.Instruction.NumSrcRegs = 2;
+      SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->color_temp, X, Y, Y, Y);
+      inst.Src[1].Register.File  = TGSI_FILE_SAMPLER;
+      inst.Src[1].Register.Index = ctx->pixelmap_sampler;
+
+      tctx->emit_instruction(tctx, &inst);
+
+      /* TEX color_temp.zw, color_temp.zwww, SAMP[pixelmap_sampler], 2D; */
+      inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_ZW;
+      SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->color_temp, Z, W, W, W);
+      tctx->emit_instruction(tctx, &inst);
+   }
+
+   /* From here on, TEMP[color_temp] is used in place of IN:COLOR0,
+    * and CONST[texcoord_const] in place of IN:TEXCOORD0 / IN:GENERIC0.
+    */
+
+transform_inst:
+
+   for (i = 0; i < current_inst->Instruction.NumSrcRegs; i++) {
+      struct tgsi_full_src_register *src = &current_inst->Src[i];
+      unsigned reg = src->Register.Index;
+
+      if (src->Register.File != TGSI_FILE_INPUT || src->Register.Indirect)
+         continue;
+
+      if (ctx->info.input_semantic_name[reg] == TGSI_SEMANTIC_COLOR &&
+          ctx->info.input_semantic_index[reg] == 0) {
+         src->Register.File = TGSI_FILE_TEMPORARY;
+         src->Register.Index = ctx->color_temp;
+      } else if (ctx->info.input_semantic_name[reg] == sem_texcoord &&
+                 ctx->info.input_semantic_index[reg] == 0) {
+         src->Register.File = TGSI_FILE_CONSTANT;
+         src->Register.Index = ctx->texcoord_const;
+      }
+   }
+
+   tctx->emit_instruction(tctx, current_inst);
+}
+
+const struct tgsi_token *
+st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord,
+                      bool scale_and_bias, unsigned scale_const,
+                      unsigned bias_const, bool pixel_maps,
+                      unsigned drawpix_sampler, unsigned pixelmap_sampler,
+                      unsigned texcoord_const)
+{
+   struct tgsi_drawpix_transform ctx;
+   struct tgsi_token *newtoks;
+   int newlen;
+
+   memset(&ctx, 0, sizeof(ctx));
+   ctx.base.transform_instruction = transform_instr;
+   ctx.use_texcoord = use_texcoord;
+   ctx.scale_and_bias = scale_and_bias;
+   ctx.scale_const = scale_const;
+   ctx.bias_const = bias_const;
+   ctx.pixel_maps = pixel_maps;
+   ctx.drawpix_sampler = drawpix_sampler;
+   ctx.pixelmap_sampler = pixelmap_sampler;
+   ctx.texcoord_const = texcoord_const;
+   tgsi_scan_shader(tokens, &ctx.info);
+
+   newlen = tgsi_num_tokens(tokens) + 30;
+   newtoks = tgsi_alloc_tokens(newlen);
+   if (!newtoks)
+      return NULL;
+
+   tgsi_transform_shader(tokens, newtoks, newlen, &ctx.base);
+   return newtoks;
+}
diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c
index ff703fa41cb..2a2eb0992c8 100644
--- a/src/mesa/state_tracker/st_cb_fbo.c
+++ b/src/mesa/state_tracker/st_cb_fbo.c
@@ -456,7 +456,7 @@ st_update_renderbuffer_surface(struct st_context *st,
       surf_tmpl.u.tex.first_layer = first_layer;
       surf_tmpl.u.tex.last_layer = last_layer;
 
-      pipe_surface_reference(&strb->surface, NULL);
+      pipe_surface_release(pipe, &strb->surface);
 
       strb->surface = pipe->create_surface(pipe, resource, &surf_tmpl);
    }
diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c
index 3029909d12d..708bdf5011e 100644
--- a/src/mesa/state_tracker/st_cb_program.c
+++ b/src/mesa/state_tracker/st_cb_program.c
@@ -105,29 +105,24 @@ st_new_program(struct gl_context *ctx, GLenum target, GLuint id)
    switch (target) {
    case GL_VERTEX_PROGRAM_ARB: {
       struct st_vertex_program *prog = ST_CALLOC_STRUCT(st_vertex_program);
-      return _mesa_init_vertex_program(ctx, &prog->Base, target, id);
+      return _mesa_init_gl_program(&prog->Base.Base, target, id);
    }
-
    case GL_FRAGMENT_PROGRAM_ARB: {
       struct st_fragment_program *prog = ST_CALLOC_STRUCT(st_fragment_program);
-      return _mesa_init_fragment_program(ctx, &prog->Base, target, id);
+      return _mesa_init_gl_program(&prog->Base.Base, target, id);
    }
-
    case GL_GEOMETRY_PROGRAM_NV: {
       struct st_geometry_program *prog = ST_CALLOC_STRUCT(st_geometry_program);
-      return _mesa_init_geometry_program(ctx, &prog->Base, target, id);
+      return _mesa_init_gl_program(&prog->Base.Base, target, id);
    }
-
    case GL_TESS_CONTROL_PROGRAM_NV: {
       struct st_tessctrl_program *prog = ST_CALLOC_STRUCT(st_tessctrl_program);
-      return _mesa_init_tess_ctrl_program(ctx, &prog->Base, target, id);
+      return _mesa_init_gl_program(&prog->Base.Base, target, id);
    }
-
    case GL_TESS_EVALUATION_PROGRAM_NV: {
       struct st_tesseval_program *prog = ST_CALLOC_STRUCT(st_tesseval_program);
-      return _mesa_init_tess_eval_program(ctx, &prog->Base, target, id);
+      return _mesa_init_gl_program(&prog->Base.Base, target, id);
    }
-
    default:
       assert(0);
       return NULL;
@@ -234,6 +229,8 @@ st_program_string_notify( struct gl_context *ctx,
       struct st_fragment_program *stfp = (struct st_fragment_program *) prog;
 
       st_release_fp_variants(st, stfp);
+      if (!st_translate_fragment_program(st, stfp))
+         return false;
 
       if (st->fp == stfp)
 	 st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM;
@@ -242,6 +239,8 @@ st_program_string_notify( struct gl_context *ctx,
       struct st_geometry_program *stgp = (struct st_geometry_program *) prog;
 
       st_release_gp_variants(st, stgp);
+      if (!st_translate_geometry_program(st, stgp))
+         return false;
 
       if (st->gp == stgp)
 	 st->dirty.st |= ST_NEW_GEOMETRY_PROGRAM;
@@ -249,7 +248,9 @@ st_program_string_notify( struct gl_context *ctx,
    else if (target == GL_VERTEX_PROGRAM_ARB) {
       struct st_vertex_program *stvp = (struct st_vertex_program *) prog;
 
-      st_release_vp_variants( st, stvp );
+      st_release_vp_variants(st, stvp);
+      if (!st_translate_vertex_program(st, stvp))
+         return false;
 
       if (st->vp == stvp)
 	 st->dirty.st |= ST_NEW_VERTEX_PROGRAM;
@@ -259,6 +260,8 @@ st_program_string_notify( struct gl_context *ctx,
          (struct st_tessctrl_program *) prog;
 
       st_release_tcp_variants(st, sttcp);
+      if (!st_translate_tessctrl_program(st, sttcp))
+         return false;
 
       if (st->tcp == sttcp)
          st->dirty.st |= ST_NEW_TESSCTRL_PROGRAM;
@@ -268,6 +271,8 @@ st_program_string_notify( struct gl_context *ctx,
          (struct st_tesseval_program *) prog;
 
       st_release_tep_variants(st, sttep);
+      if (!st_translate_tesseval_program(st, sttep))
+         return false;
 
       if (st->tep == sttep)
          st->dirty.st |= ST_NEW_TESSEVAL_PROGRAM;
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index a9ab5edcf49..bef7307bb27 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -224,8 +224,6 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
 
    st->ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
 
-   st->pixel_xfer.cache = _mesa_new_program_cache();
-
    st->has_stencil_export =
       screen->get_param(screen, PIPE_CAP_SHADER_STENCIL_EXPORT);
    st->has_shader_model3 = screen->get_param(screen, PIPE_CAP_SM3);
@@ -386,8 +384,8 @@ void st_destroy_context( struct st_context *st )
       pipe_surface_reference(&st->state.framebuffer.cbufs[i], NULL);
    }
    pipe_surface_reference(&st->state.framebuffer.zsbuf, NULL);
-
-   _mesa_delete_program_cache(st->ctx, st->pixel_xfer.cache);
+   pipe_sampler_view_reference(&st->pixel_xfer.pixelmap_sampler_view, NULL);
+   pipe_resource_reference(&st->pixel_xfer.pixelmap_texture, NULL);
 
    _vbo_DestroyContext(st->ctx);
 
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index a4cda29059d..f187d82449b 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -162,15 +162,8 @@ struct st_context
    struct gl_texture_object *default_texture;
 
    struct {
-      struct gl_program_cache *cache;
-      struct st_fragment_program *program;  /**< cur pixel transfer prog */
-      GLuint xfer_prog_sn;  /**< pixel xfer program serial no. */
-      GLuint user_prog_sn;  /**< user fragment program serial no. */
-      struct st_fragment_program *combined_prog;
-      GLuint combined_prog_sn;
       struct pipe_resource *pixelmap_texture;
       struct pipe_sampler_view *pixelmap_sampler_view;
-      boolean pixelmap_enabled;  /**< use the pixelmap texture? */
    } pixel_xfer;
 
    /** for glBitmap */
@@ -184,7 +177,7 @@ struct st_context
 
    /** for glDraw/CopyPixels */
    struct {
-      struct gl_fragment_program *shaders[4];
+      void *zs_shaders[4];
       void *vert_shaders[2];   /**< ureg shaders */
    } drawpix;
 
diff --git a/src/mesa/state_tracker/st_debug.c b/src/mesa/state_tracker/st_debug.c
index 50891c112cb..6d859c6ab5b 100644
--- a/src/mesa/state_tracker/st_debug.c
+++ b/src/mesa/state_tracker/st_debug.c
@@ -98,7 +98,7 @@ st_print_current(void)
    if (st->vp->Base.Base.Parameters)
       _mesa_print_parameter_list(st->vp->Base.Base.Parameters);
 
-   tgsi_dump( st->fp->variants[0].tgsi.tokens, 0 );
+   tgsi_dump(st->fp->tgsi.tokens, 0);
    if (st->fp->Base.Base.Parameters)
       _mesa_print_parameter_list(st->fp->Base.Base.Parameters);
 }
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 633e90ffa38..f481e8902d8 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -4334,216 +4334,6 @@ glsl_to_tgsi_visitor::renumber_registers(void)
    ralloc_free(first_reads);
 }
 
-/**
- * Returns a fragment program which implements the current pixel transfer ops.
- * Based on get_pixel_transfer_program in st_atom_pixeltransfer.c.
- */
-extern "C" void
-get_pixel_transfer_visitor(struct st_fragment_program *fp,
-                           glsl_to_tgsi_visitor *original,
-                           int scale_and_bias, int pixel_maps)
-{
-   glsl_to_tgsi_visitor *v = new glsl_to_tgsi_visitor();
-   struct st_context *st = st_context(original->ctx);
-   struct gl_program *prog = &fp->Base.Base;
-   struct gl_program_parameter_list *params = _mesa_new_parameter_list();
-   st_src_reg coord, src0;
-   st_dst_reg dst0;
-   glsl_to_tgsi_instruction *inst;
-
-   /* Copy attributes of the glsl_to_tgsi_visitor in the original shader. */
-   v->ctx = original->ctx;
-   v->prog = prog;
-   v->shader_program = NULL;
-   v->shader = NULL;
-   v->glsl_version = original->glsl_version;
-   v->native_integers = original->native_integers;
-   v->options = original->options;
-   v->next_temp = original->next_temp;
-   v->num_address_regs = original->num_address_regs;
-   v->samplers_used = prog->SamplersUsed = original->samplers_used;
-   v->indirect_addr_consts = original->indirect_addr_consts;
-   memcpy(&v->immediates, &original->immediates, sizeof(v->immediates));
-   v->num_immediates = original->num_immediates;
-
-   /*
-    * Get initial pixel color from the texture.
-    * TEX colorTemp, fragment.texcoord[0], texture[0], 2D;
-    */
-   coord = st_src_reg(PROGRAM_INPUT, VARYING_SLOT_TEX0, glsl_type::vec2_type);
-   src0 = v->get_temp(glsl_type::vec4_type);
-   dst0 = st_dst_reg(src0);
-   inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, dst0, coord);
-   inst->sampler_array_size = 1;
-   inst->tex_target = TEXTURE_2D_INDEX;
-
-   prog->InputsRead |= VARYING_BIT_TEX0;
-   prog->SamplersUsed |= (1 << 0); /* mark sampler 0 as used */
-   v->samplers_used |= (1 << 0);
-
-   if (scale_and_bias) {
-      static const gl_state_index scale_state[STATE_LENGTH] =
-         { STATE_INTERNAL, STATE_PT_SCALE,
-           (gl_state_index) 0, (gl_state_index) 0, (gl_state_index) 0 };
-      static const gl_state_index bias_state[STATE_LENGTH] =
-         { STATE_INTERNAL, STATE_PT_BIAS,
-           (gl_state_index) 0, (gl_state_index) 0, (gl_state_index) 0 };
-      GLint scale_p, bias_p;
-      st_src_reg scale, bias;
-
-      scale_p = _mesa_add_state_reference(params, scale_state);
-      bias_p = _mesa_add_state_reference(params, bias_state);
-
-      /* MAD colorTemp, colorTemp, scale, bias; */
-      scale = st_src_reg(PROGRAM_STATE_VAR, scale_p, GLSL_TYPE_FLOAT);
-      bias = st_src_reg(PROGRAM_STATE_VAR, bias_p, GLSL_TYPE_FLOAT);
-      inst = v->emit_asm(NULL, TGSI_OPCODE_MAD, dst0, src0, scale, bias);
-   }
-
-   if (pixel_maps) {
-      st_src_reg temp = v->get_temp(glsl_type::vec4_type);
-      st_dst_reg temp_dst = st_dst_reg(temp);
-
-      assert(st->pixel_xfer.pixelmap_texture);
-      (void) st;
-
-      /* With a little effort, we can do four pixel map look-ups with
-       * two TEX instructions:
-       */
-
-      /* TEX temp.rg, colorTemp.rgba, texture[1], 2D; */
-      temp_dst.writemask = WRITEMASK_XY; /* write R,G */
-      inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, temp_dst, src0);
-      inst->sampler.index = 1;
-      inst->sampler_array_size = 1;
-      inst->tex_target = TEXTURE_2D_INDEX;
-
-      /* TEX temp.ba, colorTemp.baba, texture[1], 2D; */
-      src0.swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W);
-      temp_dst.writemask = WRITEMASK_ZW; /* write B,A */
-      inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, temp_dst, src0);
-      inst->sampler.index = 1;
-      inst->sampler_array_size = 1;
-      inst->tex_target = TEXTURE_2D_INDEX;
-
-      prog->SamplersUsed |= (1 << 1); /* mark sampler 1 as used */
-      v->samplers_used |= (1 << 1);
-
-      /* MOV colorTemp, temp; */
-      inst = v->emit_asm(NULL, TGSI_OPCODE_MOV, dst0, temp);
-   }
-
-   /* Now copy the instructions from the original glsl_to_tgsi_visitor into the
-    * new visitor. */
-   foreach_in_list(glsl_to_tgsi_instruction, inst, &original->instructions) {
-      glsl_to_tgsi_instruction *newinst;
-      st_src_reg src_regs[4];
-
-      if (inst->dst[0].file == PROGRAM_OUTPUT)
-         prog->OutputsWritten |= BITFIELD64_BIT(inst->dst[0].index);
-
-      for (int i = 0; i < 4; i++) {
-         src_regs[i] = inst->src[i];
-         if (src_regs[i].file == PROGRAM_INPUT &&
-             src_regs[i].index == VARYING_SLOT_COL0) {
-            src_regs[i].file = PROGRAM_TEMPORARY;
-            src_regs[i].index = src0.index;
-         }
-         else if (src_regs[i].file == PROGRAM_INPUT)
-            prog->InputsRead |= BITFIELD64_BIT(src_regs[i].index);
-      }
-
-      newinst = v->emit_asm(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2], src_regs[3]);
-      newinst->tex_target = inst->tex_target;
-      newinst->sampler_array_size = inst->sampler_array_size;
-   }
-
-   /* Make modifications to fragment program info. */
-   prog->Parameters = _mesa_combine_parameter_lists(params,
-                                                    original->prog->Parameters);
-   _mesa_free_parameter_list(params);
-   count_resources(v, prog);
-   fp->glsl_to_tgsi = v;
-}
-
-/**
- * Make fragment program for glBitmap:
- *   Sample the texture and kill the fragment if the bit is 0.
- * This program will be combined with the user's fragment program.
- *
- * Based on make_bitmap_fragment_program in st_cb_bitmap.c.
- */
-extern "C" void
-get_bitmap_visitor(struct st_fragment_program *fp,
-                   glsl_to_tgsi_visitor *original, int samplerIndex)
-{
-   glsl_to_tgsi_visitor *v = new glsl_to_tgsi_visitor();
-   struct st_context *st = st_context(original->ctx);
-   struct gl_program *prog = &fp->Base.Base;
-   st_src_reg coord, src0;
-   st_dst_reg dst0;
-   glsl_to_tgsi_instruction *inst;
-
-   /* Copy attributes of the glsl_to_tgsi_visitor in the original shader. */
-   v->ctx = original->ctx;
-   v->prog = prog;
-   v->shader_program = NULL;
-   v->shader = NULL;
-   v->glsl_version = original->glsl_version;
-   v->native_integers = original->native_integers;
-   v->options = original->options;
-   v->next_temp = original->next_temp;
-   v->num_address_regs = original->num_address_regs;
-   v->samplers_used = prog->SamplersUsed = original->samplers_used;
-   v->indirect_addr_consts = original->indirect_addr_consts;
-   memcpy(&v->immediates, &original->immediates, sizeof(v->immediates));
-   v->num_immediates = original->num_immediates;
-
-   /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */
-   coord = st_src_reg(PROGRAM_INPUT, VARYING_SLOT_TEX0, glsl_type::vec2_type);
-   src0 = v->get_temp(glsl_type::vec4_type);
-   dst0 = st_dst_reg(src0);
-   inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, dst0, coord);
-   inst->sampler.index = samplerIndex;
-   inst->sampler_array_size = 1;
-   inst->tex_target = TEXTURE_2D_INDEX;
-
-   prog->InputsRead |= VARYING_BIT_TEX0;
-   prog->SamplersUsed |= (1 << samplerIndex); /* mark sampler as used */
-   v->samplers_used |= (1 << samplerIndex);
-
-   /* KIL if -tmp0 < 0 # texel=0 -> keep / texel=0 -> discard */
-   src0.negate = NEGATE_XYZW;
-   if (st->bitmap.tex_format == PIPE_FORMAT_L8_UNORM)
-      src0.swizzle = SWIZZLE_XXXX;
-   inst = v->emit_asm(NULL, TGSI_OPCODE_KILL_IF, undef_dst, src0);
-
-   /* Now copy the instructions from the original glsl_to_tgsi_visitor into the
-    * new visitor. */
-   foreach_in_list(glsl_to_tgsi_instruction, inst, &original->instructions) {
-      glsl_to_tgsi_instruction *newinst;
-      st_src_reg src_regs[4];
-
-      if (inst->dst[0].file == PROGRAM_OUTPUT)
-         prog->OutputsWritten |= BITFIELD64_BIT(inst->dst[0].index);
-
-      for (int i = 0; i < 4; i++) {
-         src_regs[i] = inst->src[i];
-         if (src_regs[i].file == PROGRAM_INPUT)
-            prog->InputsRead |= BITFIELD64_BIT(src_regs[i].index);
-      }
-
-      newinst = v->emit_asm(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2], src_regs[3]);
-      newinst->tex_target = inst->tex_target;
-      newinst->sampler_array_size = inst->sampler_array_size;
-   }
-
-   /* Make modifications to fragment program info. */
-   prog->Parameters = _mesa_clone_parameter_list(original->prog->Parameters);
-   count_resources(v, prog);
-   fp->glsl_to_tgsi = v;
-}
-
 /* ------------------------- TGSI conversion stuff -------------------------- */
 struct label {
    unsigned branch_target;
@@ -4852,7 +4642,7 @@ src_register(struct st_translate *t, const st_src_reg *reg)
 static struct ureg_dst
 translate_dst(struct st_translate *t,
               const st_dst_reg *dst_reg,
-              bool saturate, bool clamp_color)
+              bool saturate)
 {
    struct ureg_dst dst = dst_register(t, dst_reg->file, dst_reg->index,
                                       dst_reg->array_id);
@@ -4864,28 +4654,6 @@ translate_dst(struct st_translate *t,
 
    if (saturate)
       dst = ureg_saturate(dst);
-   else if (clamp_color && dst_reg->file == PROGRAM_OUTPUT) {
-      /* Clamp colors for ARB_color_buffer_float. */
-      switch (t->procType) {
-      case TGSI_PROCESSOR_VERTEX:
-         /* This can only occur with a compatibility profile, which doesn't
-          * support geometry shaders. */
-         if (dst_reg->index == VARYING_SLOT_COL0 ||
-             dst_reg->index == VARYING_SLOT_COL1 ||
-             dst_reg->index == VARYING_SLOT_BFC0 ||
-             dst_reg->index == VARYING_SLOT_BFC1) {
-            dst = ureg_saturate(dst);
-         }
-         break;
-
-      case TGSI_PROCESSOR_FRAGMENT:
-         if (dst_reg->index == FRAG_RESULT_COLOR ||
-             dst_reg->index >= FRAG_RESULT_DATA0) {
-            dst = ureg_saturate(dst);
-         }
-         break;
-      }
-   }
 
    if (dst_reg->reladdr != NULL) {
       assert(dst_reg->file != PROGRAM_TEMPORARY);
@@ -4991,8 +4759,7 @@ translate_tex_offset(struct st_translate *t,
 
 static void
 compile_tgsi_instruction(struct st_translate *t,
-                         const glsl_to_tgsi_instruction *inst,
-                         bool clamp_dst_color_output)
+                         const glsl_to_tgsi_instruction *inst)
 {
    struct ureg_program *ureg = t->ureg;
    GLuint i;
@@ -5010,8 +4777,7 @@ compile_tgsi_instruction(struct st_translate *t,
    for (i = 0; i < num_dst; i++)
       dst[i] = translate_dst(t,
                              &inst->dst[i],
-                             inst->saturate,
-                             clamp_dst_color_output);
+                             inst->saturate);
 
    for (i = 0; i < num_src; i++)
       src[i] = translate_src(t, &inst->src[i]);
@@ -5286,16 +5052,6 @@ emit_face_var(struct gl_context *ctx, struct st_translate *t)
    t->inputs[t->inputMapping[VARYING_SLOT_FACE]] = ureg_src(face_temp);
 }
 
-static void
-emit_edgeflags(struct st_translate *t)
-{
-   struct ureg_program *ureg = t->ureg;
-   struct ureg_dst edge_dst = t->outputs[t->outputMapping[VARYING_SLOT_EDGE]];
-   struct ureg_src edge_src = t->inputs[t->inputMapping[VERT_ATTRIB_EDGEFLAG]];
-
-   ureg_MOV(ureg, edge_dst, edge_src);
-}
-
 static bool
 find_array(unsigned attr, struct array_decl *arrays, unsigned count,
            unsigned *array_id, unsigned *array_size)
@@ -5353,9 +5109,7 @@ st_translate_program(
    const GLuint outputMapping[],
    const GLuint outputSlotToAttr[],
    const ubyte outputSemanticName[],
-   const ubyte outputSemanticIndex[],
-   boolean passthrough_edgeflags,
-   boolean clamp_color)
+   const ubyte outputSemanticIndex[])
 {
    struct st_translate *t;
    unsigned i;
@@ -5544,8 +5298,6 @@ st_translate_program(
             t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_X);
          }
       }
-      if (passthrough_edgeflags)
-         emit_edgeflags(t);
    }
 
    /* Declare address register.
@@ -5639,7 +5391,7 @@ st_translate_program(
       unsigned num_ubos = program->shader->NumUniformBlocks;
 
       for (i = 0; i < num_ubos; i++) {
-         unsigned size = program->shader->UniformBlocks[i].UniformBufferSize;
+         unsigned size = program->shader->UniformBlocks[i]->UniformBufferSize;
          unsigned num_const_vecs = (size + 15) / 16;
          unsigned first, last;
          assert(num_const_vecs > 0);
@@ -5696,7 +5448,7 @@ st_translate_program(
     */
    foreach_in_list(glsl_to_tgsi_instruction, inst, &program->instructions) {
       set_insn_start(t, ureg_get_instruction_number(ureg));
-      compile_tgsi_instruction(t, inst, clamp_color);
+      compile_tgsi_instruction(t, inst);
    }
 
    /* Fix up all emitted labels:
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.h b/src/mesa/state_tracker/st_glsl_to_tgsi.h
index 4af747fa9de..729295bcb52 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.h
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.h
@@ -52,17 +52,9 @@ enum pipe_error st_translate_program(
    const GLuint outputMapping[],
    const GLuint outputSlotToAttr[],
    const ubyte outputSemanticName[],
-   const ubyte outputSemanticIndex[],
-   boolean passthrough_edgeflags,
-   boolean clamp_color);
+   const ubyte outputSemanticIndex[]);
 
 void free_glsl_to_tgsi_visitor(struct glsl_to_tgsi_visitor *v);
-void get_pixel_transfer_visitor(struct st_fragment_program *fp,
-                                struct glsl_to_tgsi_visitor *original,
-                                int scale_and_bias, int pixel_maps);
-void get_bitmap_visitor(struct st_fragment_program *fp,
-                        struct glsl_to_tgsi_visitor *original,
-                        int samplerIndex);
 
 GLboolean st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog);
 
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index 896e239ee68..4b9dc994ea5 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -283,8 +283,7 @@ st_translate_texture_target( GLuint textarget,
 static struct ureg_dst
 translate_dst( struct st_translate *t,
                const struct prog_dst_register *DstReg,
-               boolean saturate,
-               boolean clamp_color)
+               boolean saturate)
 {
    struct ureg_dst dst = dst_register( t, 
                                        DstReg->File,
@@ -295,27 +294,6 @@ translate_dst( struct st_translate *t,
    
    if (saturate)
       dst = ureg_saturate( dst );
-   else if (clamp_color && DstReg->File == PROGRAM_OUTPUT) {
-      /* Clamp colors for ARB_color_buffer_float. */
-      switch (t->procType) {
-      case TGSI_PROCESSOR_VERTEX:
-         /* This can only occur with a compatibility profile, which doesn't
-          * support geometry shaders. */
-         if (DstReg->Index == VARYING_SLOT_COL0 ||
-             DstReg->Index == VARYING_SLOT_COL1 ||
-             DstReg->Index == VARYING_SLOT_BFC0 ||
-             DstReg->Index == VARYING_SLOT_BFC1) {
-            dst = ureg_saturate(dst);
-         }
-         break;
-
-      case TGSI_PROCESSOR_FRAGMENT:
-         if (DstReg->Index >= FRAG_RESULT_COLOR) {
-            dst = ureg_saturate(dst);
-         }
-         break;
-      }
-   }
 
    if (DstReg->RelAddr)
       dst = ureg_dst_indirect( dst, ureg_src(t->address[0]) );
@@ -649,8 +627,7 @@ static void
 compile_instruction(
    struct gl_context *ctx,
    struct st_translate *t,
-   const struct prog_instruction *inst,
-   boolean clamp_dst_color_output)
+   const struct prog_instruction *inst)
 {
    struct ureg_program *ureg = t->ureg;
    GLuint i;
@@ -665,8 +642,7 @@ compile_instruction(
    if (num_dst) 
       dst[0] = translate_dst( t, 
                               &inst->DstReg,
-                              inst->Saturate,
-                              clamp_dst_color_output);
+                              inst->Saturate);
 
    for (i = 0; i < num_src; i++) 
       src[i] = translate_src( t, &inst->SrcReg[i] );
@@ -974,18 +950,6 @@ emit_face_var( struct st_translate *t,
 }
 
 
-static void
-emit_edgeflags( struct st_translate *t,
-                 const struct gl_program *program )
-{
-   struct ureg_program *ureg = t->ureg;
-   struct ureg_dst edge_dst = t->outputs[t->outputMapping[VARYING_SLOT_EDGE]];
-   struct ureg_src edge_src = t->inputs[t->inputMapping[VERT_ATTRIB_EDGEFLAG]];
-
-   ureg_MOV( ureg, edge_dst, edge_src );
-}
-
-
 /**
  * Translate Mesa program to TGSI format.
  * \param program  the program to translate
@@ -1019,9 +983,7 @@ st_translate_mesa_program(
    GLuint numOutputs,
    const GLuint outputMapping[],
    const ubyte outputSemanticName[],
-   const ubyte outputSemanticIndex[],
-   boolean passthrough_edgeflags,
-   boolean clamp_color)
+   const ubyte outputSemanticIndex[])
 {
    struct st_translate translate, *t;
    unsigned i;
@@ -1125,8 +1087,6 @@ st_translate_mesa_program(
             t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_X);
 	 }
       }
-      if (passthrough_edgeflags)
-         emit_edgeflags( t, program );
    }
 
    /* Declare address register.
@@ -1231,7 +1191,7 @@ st_translate_mesa_program(
     */
    for (i = 0; i < program->NumInstructions; i++) {
       set_insn_start( t, ureg_get_instruction_number( ureg ));
-      compile_instruction( ctx, t, &program->Instructions[i], clamp_color );
+      compile_instruction(ctx, t, &program->Instructions[i]);
    }
 
    /* Fix up all emitted labels:
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.h b/src/mesa/state_tracker/st_mesa_to_tgsi.h
index 62bb654e95a..ed7a3adfe1a 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.h
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.h
@@ -58,9 +58,7 @@ st_translate_mesa_program(
    GLuint numOutputs,
    const GLuint outputMapping[],
    const ubyte outputSemanticName[],
-   const ubyte outputSemanticIndex[],
-   boolean passthrough_edgeflags,
-   boolean clamp_color);
+   const ubyte outputSemanticIndex[]);
 
 unsigned
 st_translate_texture_target(GLuint textarget, GLboolean shadow);
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index a07f8fec309..6a69ba7aa26 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -43,6 +43,8 @@
 #include "pipe/p_shader_tokens.h"
 #include "draw/draw_context.h"
 #include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_emulate.h"
+#include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_ureg.h"
 
 #include "st_debug.h"
@@ -92,6 +94,11 @@ st_release_vp_variants( struct st_context *st,
    }
 
    stvp->variants = NULL;
+
+   if (stvp->tgsi.tokens) {
+      tgsi_free_tokens(stvp->tgsi.tokens);
+      stvp->tgsi.tokens = NULL;
+   }
 }
 
 
@@ -107,8 +114,6 @@ delete_fp_variant(struct st_context *st, struct st_fp_variant *fpv)
       cso_delete_fragment_shader(st->cso_context, fpv->driver_shader);
    if (fpv->parameters)
       _mesa_free_parameter_list(fpv->parameters);
-   if (fpv->tgsi.tokens)
-      ureg_free_tokens(fpv->tgsi.tokens);
    free(fpv);
 }
 
@@ -128,6 +133,11 @@ st_release_fp_variants(struct st_context *st, struct st_fragment_program *stfp)
    }
 
    stfp->variants = NULL;
+
+   if (stfp->tgsi.tokens) {
+      ureg_free_tokens(stfp->tgsi.tokens);
+      stfp->tgsi.tokens = NULL;
+   }
 }
 
 
@@ -160,6 +170,11 @@ st_release_gp_variants(struct st_context *st, struct st_geometry_program *stgp)
    }
 
    stgp->variants = NULL;
+
+   if (stgp->tgsi.tokens) {
+      ureg_free_tokens(stgp->tgsi.tokens);
+      stgp->tgsi.tokens = NULL;
+   }
 }
 
 
@@ -192,6 +207,11 @@ st_release_tcp_variants(struct st_context *st, struct st_tessctrl_program *sttcp
    }
 
    sttcp->variants = NULL;
+
+   if (sttcp->tgsi.tokens) {
+      ureg_free_tokens(sttcp->tgsi.tokens);
+      sttcp->tgsi.tokens = NULL;
+   }
 }
 
 
@@ -224,28 +244,34 @@ st_release_tep_variants(struct st_context *st, struct st_tesseval_program *sttep
    }
 
    sttep->variants = NULL;
+
+   if (sttep->tgsi.tokens) {
+      ureg_free_tokens(sttep->tgsi.tokens);
+      sttep->tgsi.tokens = NULL;
+   }
 }
 
 
 /**
- * Translate a Mesa vertex shader into a TGSI shader.
- * \param outputMapping  to map vertex program output registers (VARYING_SLOT_x)
- *       to TGSI output slots
- * \param tokensOut  destination for TGSI tokens
- * \return  pointer to cached pipe_shader object.
+ * Translate a vertex program.
  */
-void
-st_prepare_vertex_program(struct gl_context *ctx,
+bool
+st_translate_vertex_program(struct st_context *st,
                             struct st_vertex_program *stvp)
 {
-   struct st_context *st = st_context(ctx);
-   GLuint attr;
+   struct ureg_program *ureg;
+   enum pipe_error error;
+   unsigned num_outputs = 0;
+   unsigned attr;
+   unsigned input_to_index[VERT_ATTRIB_MAX] = {0};
+   unsigned output_slot_to_attr[VARYING_SLOT_MAX] = {0};
+   ubyte output_semantic_name[VARYING_SLOT_MAX] = {0};
+   ubyte output_semantic_index[VARYING_SLOT_MAX] = {0};
 
    stvp->num_inputs = 0;
-   stvp->num_outputs = 0;
 
    if (stvp->Base.IsPositionInvariant)
-      _mesa_insert_mvp_code(ctx, &stvp->Base);
+      _mesa_insert_mvp_code(st->ctx, &stvp->Base);
 
    /*
     * Determine number of inputs, the mappings between VERT_ATTRIB_x
@@ -253,7 +279,7 @@ st_prepare_vertex_program(struct gl_context *ctx,
     */
    for (attr = 0; attr < VERT_ATTRIB_MAX; attr++) {
       if ((stvp->Base.Base.InputsRead & BITFIELD64_BIT(attr)) != 0) {
-         stvp->input_to_index[attr] = stvp->num_inputs;
+         input_to_index[attr] = stvp->num_inputs;
          stvp->index_to_input[stvp->num_inputs] = attr;
          stvp->num_inputs++;
          if ((stvp->Base.Base.DoubleInputsRead & BITFIELD64_BIT(attr)) != 0) {
@@ -264,7 +290,7 @@ st_prepare_vertex_program(struct gl_context *ctx,
       }
    }
    /* bit of a hack, presetup potentially unused edgeflag input */
-   stvp->input_to_index[VERT_ATTRIB_EDGEFLAG] = stvp->num_inputs;
+   input_to_index[VERT_ATTRIB_EDGEFLAG] = stvp->num_inputs;
    stvp->index_to_input[stvp->num_inputs] = VERT_ATTRIB_EDGEFLAG;
 
    /* Compute mapping of vertex program outputs to slots.
@@ -274,62 +300,62 @@ st_prepare_vertex_program(struct gl_context *ctx,
          stvp->result_to_output[attr] = ~0;
       }
       else {
-         unsigned slot = stvp->num_outputs++;
+         unsigned slot = num_outputs++;
 
          stvp->result_to_output[attr] = slot;
-         stvp->output_slot_to_attr[slot] = attr;
+         output_slot_to_attr[slot] = attr;
 
          switch (attr) {
          case VARYING_SLOT_POS:
-            stvp->output_semantic_name[slot] = TGSI_SEMANTIC_POSITION;
-            stvp->output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_POSITION;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_COL0:
-            stvp->output_semantic_name[slot] = TGSI_SEMANTIC_COLOR;
-            stvp->output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_COLOR;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_COL1:
-            stvp->output_semantic_name[slot] = TGSI_SEMANTIC_COLOR;
-            stvp->output_semantic_index[slot] = 1;
+            output_semantic_name[slot] = TGSI_SEMANTIC_COLOR;
+            output_semantic_index[slot] = 1;
             break;
          case VARYING_SLOT_BFC0:
-            stvp->output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR;
-            stvp->output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_BFC1:
-            stvp->output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR;
-            stvp->output_semantic_index[slot] = 1;
+            output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR;
+            output_semantic_index[slot] = 1;
             break;
          case VARYING_SLOT_FOGC:
-            stvp->output_semantic_name[slot] = TGSI_SEMANTIC_FOG;
-            stvp->output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_FOG;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_PSIZ:
-            stvp->output_semantic_name[slot] = TGSI_SEMANTIC_PSIZE;
-            stvp->output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_PSIZE;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_CLIP_DIST0:
-            stvp->output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST;
-            stvp->output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_CLIP_DIST1:
-            stvp->output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST;
-            stvp->output_semantic_index[slot] = 1;
+            output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST;
+            output_semantic_index[slot] = 1;
             break;
          case VARYING_SLOT_EDGE:
             assert(0);
             break;
          case VARYING_SLOT_CLIP_VERTEX:
-            stvp->output_semantic_name[slot] = TGSI_SEMANTIC_CLIPVERTEX;
-            stvp->output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_CLIPVERTEX;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_LAYER:
-            stvp->output_semantic_name[slot] = TGSI_SEMANTIC_LAYER;
-            stvp->output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_LAYER;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_VIEWPORT:
-            stvp->output_semantic_name[slot] = TGSI_SEMANTIC_VIEWPORT_INDEX;
-            stvp->output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_VIEWPORT_INDEX;
+            output_semantic_index[slot] = 0;
             break;
 
          case VARYING_SLOT_TEX0:
@@ -341,8 +367,8 @@ st_prepare_vertex_program(struct gl_context *ctx,
          case VARYING_SLOT_TEX6:
          case VARYING_SLOT_TEX7:
             if (st->needs_texcoord_semantic) {
-               stvp->output_semantic_name[slot] = TGSI_SEMANTIC_TEXCOORD;
-               stvp->output_semantic_index[slot] = attr - VARYING_SLOT_TEX0;
+               output_semantic_name[slot] = TGSI_SEMANTIC_TEXCOORD;
+               output_semantic_index[slot] = attr - VARYING_SLOT_TEX0;
                break;
             }
             /* fall through */
@@ -350,55 +376,24 @@ st_prepare_vertex_program(struct gl_context *ctx,
          default:
             assert(attr >= VARYING_SLOT_VAR0 ||
                    (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7));
-            stvp->output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            stvp->output_semantic_index[slot] =
+            output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
+            output_semantic_index[slot] =
                st_get_generic_varying_index(st, attr);
             break;
          }
       }
    }
    /* similar hack to above, presetup potentially unused edgeflag output */
-   stvp->result_to_output[VARYING_SLOT_EDGE] = stvp->num_outputs;
-   stvp->output_semantic_name[stvp->num_outputs] = TGSI_SEMANTIC_EDGEFLAG;
-   stvp->output_semantic_index[stvp->num_outputs] = 0;
-}
-
-
-/**
- * Translate a vertex program to create a new variant.
- */
-static struct st_vp_variant *
-st_translate_vertex_program(struct st_context *st,
-                            struct st_vertex_program *stvp,
-                            const struct st_vp_variant_key *key)
-{
-   struct st_vp_variant *vpv = CALLOC_STRUCT(st_vp_variant);
-   struct pipe_context *pipe = st->pipe;
-   struct ureg_program *ureg;
-   enum pipe_error error;
-   unsigned num_outputs;
-
-   st_prepare_vertex_program(st->ctx, stvp);
+   stvp->result_to_output[VARYING_SLOT_EDGE] = num_outputs;
+   output_semantic_name[num_outputs] = TGSI_SEMANTIC_EDGEFLAG;
+   output_semantic_index[num_outputs] = 0;
 
    if (!stvp->glsl_to_tgsi)
-   {
       _mesa_remove_output_reads(&stvp->Base.Base, PROGRAM_OUTPUT);
-   }
 
    ureg = ureg_create_with_screen(TGSI_PROCESSOR_VERTEX, st->pipe->screen);
-   if (ureg == NULL) {
-      free(vpv);
-      return NULL;
-   }
-
-   vpv->key = *key;
-
-   vpv->num_inputs = stvp->num_inputs;
-   num_outputs = stvp->num_outputs;
-   if (key->passthrough_edgeflags) {
-      vpv->num_inputs++;
-      num_outputs++;
-   }
+   if (ureg == NULL)
+      return false;
 
    if (ST_DEBUG & DEBUG_MESA) {
       _mesa_print_program(&stvp->Base.Base);
@@ -406,15 +401,15 @@ st_translate_vertex_program(struct st_context *st,
       debug_printf("\n");
    }
 
-   if (stvp->glsl_to_tgsi)
+   if (stvp->glsl_to_tgsi) {
       error = st_translate_program(st->ctx,
                                    TGSI_PROCESSOR_VERTEX,
                                    ureg,
                                    stvp->glsl_to_tgsi,
                                    &stvp->Base.Base,
                                    /* inputs */
-                                   vpv->num_inputs,
-                                   stvp->input_to_index,
+                                   stvp->num_inputs,
+                                   input_to_index,
                                    NULL, /* inputSlotToAttr */
                                    NULL, /* input semantic name */
                                    NULL, /* input semantic index */
@@ -423,43 +418,75 @@ st_translate_vertex_program(struct st_context *st,
                                    /* outputs */
                                    num_outputs,
                                    stvp->result_to_output,
-                                   stvp->output_slot_to_attr,
-                                   stvp->output_semantic_name,
-                                   stvp->output_semantic_index,
-                                   key->passthrough_edgeflags,
-                                   key->clamp_color);
-   else
+                                   output_slot_to_attr,
+                                   output_semantic_name,
+                                   output_semantic_index);
+
+      st_translate_stream_output_info(stvp->glsl_to_tgsi,
+                                      stvp->result_to_output,
+                                      &stvp->tgsi.stream_output);
+
+      free_glsl_to_tgsi_visitor(stvp->glsl_to_tgsi);
+      stvp->glsl_to_tgsi = NULL;
+   } else
       error = st_translate_mesa_program(st->ctx,
                                         TGSI_PROCESSOR_VERTEX,
                                         ureg,
                                         &stvp->Base.Base,
                                         /* inputs */
-                                        vpv->num_inputs,
-                                        stvp->input_to_index,
+                                        stvp->num_inputs,
+                                        input_to_index,
                                         NULL, /* input semantic name */
                                         NULL, /* input semantic index */
                                         NULL,
                                         /* outputs */
                                         num_outputs,
                                         stvp->result_to_output,
-                                        stvp->output_semantic_name,
-                                        stvp->output_semantic_index,
-                                        key->passthrough_edgeflags,
-                                        key->clamp_color);
+                                        output_semantic_name,
+                                        output_semantic_index);
+
+   if (error) {
+      debug_printf("%s: failed to translate Mesa program:\n", __func__);
+      _mesa_print_program(&stvp->Base.Base);
+      debug_assert(0);
+      return false;
+   }
+
+   stvp->tgsi.tokens = ureg_get_tokens(ureg, NULL);
+   ureg_destroy(ureg);
+   return stvp->tgsi.tokens != NULL;
+}
 
-   if (error)
-      goto fail;
+static struct st_vp_variant *
+st_create_vp_variant(struct st_context *st,
+                     struct st_vertex_program *stvp,
+                     const struct st_vp_variant_key *key)
+{
+   struct st_vp_variant *vpv = CALLOC_STRUCT(st_vp_variant);
+   struct pipe_context *pipe = st->pipe;
 
-   vpv->tgsi.tokens = ureg_get_tokens( ureg, NULL );
-   if (!vpv->tgsi.tokens)
-      goto fail;
+   vpv->key = *key;
+   vpv->tgsi.tokens = tgsi_dup_tokens(stvp->tgsi.tokens);
+   vpv->tgsi.stream_output = stvp->tgsi.stream_output;
+   vpv->num_inputs = stvp->num_inputs;
 
-   ureg_destroy( ureg );
+   /* Emulate features. */
+   if (key->clamp_color || key->passthrough_edgeflags) {
+      const struct tgsi_token *tokens;
+      unsigned flags =
+         (key->clamp_color ? TGSI_EMU_CLAMP_COLOR_OUTPUTS : 0) |
+         (key->passthrough_edgeflags ? TGSI_EMU_PASSTHROUGH_EDGEFLAG : 0);
 
-   if (stvp->glsl_to_tgsi) {
-      st_translate_stream_output_info(stvp->glsl_to_tgsi,
-                                      stvp->result_to_output,
-                                      &vpv->tgsi.stream_output);
+      tokens = tgsi_emulate(vpv->tgsi.tokens, flags);
+
+      if (tokens) {
+         tgsi_free_tokens(vpv->tgsi.tokens);
+         vpv->tgsi.tokens = tokens;
+
+         if (key->passthrough_edgeflags)
+            vpv->num_inputs++;
+      } else
+         fprintf(stderr, "mesa: cannot emulate deprecated features\n");
    }
 
    if (ST_DEBUG & DEBUG_TGSI) {
@@ -469,14 +496,6 @@ st_translate_vertex_program(struct st_context *st,
 
    vpv->driver_shader = pipe->create_vs_state(pipe, &vpv->tgsi);
    return vpv;
-
-fail:
-   debug_printf("%s: failed to translate Mesa program:\n", __func__);
-   _mesa_print_program(&stvp->Base.Base);
-   debug_assert(0);
-
-   ureg_destroy( ureg );
-   return NULL;
 }
 
 
@@ -499,7 +518,7 @@ st_get_vp_variant(struct st_context *st,
 
    if (!vpv) {
       /* create now */
-      vpv = st_translate_vertex_program(st, stvp, key);
+      vpv = st_create_vp_variant(st, stvp, key);
       if (vpv) {
          /* insert into list */
          vpv->next = stvp->variants;
@@ -533,19 +552,12 @@ st_translate_interp(enum glsl_interp_qualifier glsl_qual, bool is_color)
 
 
 /**
- * Translate a Mesa fragment shader into a TGSI shader using extra info in
- * the key.
- * \return  new fragment program variant
+ * Translate a Mesa fragment shader into a TGSI shader.
  */
-static struct st_fp_variant *
+bool
 st_translate_fragment_program(struct st_context *st,
-                              struct st_fragment_program *stfp,
-                              const struct st_fp_variant_key *key)
+                              struct st_fragment_program *stfp)
 {
-   struct pipe_context *pipe = st->pipe;
-   struct st_fp_variant *variant = CALLOC_STRUCT(st_fp_variant);
-   GLboolean deleteFP = GL_FALSE;
-
    GLuint outputMapping[FRAG_RESULT_MAX];
    GLuint inputMapping[VARYING_SLOT_MAX];
    GLuint inputSlotToAttr[VARYING_SLOT_MAX];
@@ -565,40 +577,8 @@ st_translate_fragment_program(struct st_context *st,
    ubyte fs_output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
    uint fs_num_outputs = 0;
 
-   if (!variant)
-      return NULL;
-
-   assert(!(key->bitmap && key->drawpixels));
    memset(inputSlotToAttr, ~0, sizeof(inputSlotToAttr));
 
-   if (key->bitmap) {
-      /* glBitmap drawing */
-      struct gl_fragment_program *fp; /* we free this temp program below */
-
-      st_make_bitmap_fragment_program(st, &stfp->Base,
-                                      &fp, &variant->bitmap_sampler);
-
-      variant->parameters = _mesa_clone_parameter_list(fp->Base.Parameters);
-      stfp = st_fragment_program(fp);
-      deleteFP = GL_TRUE;
-   }
-   else if (key->drawpixels) {
-      /* glDrawPixels drawing */
-      struct gl_fragment_program *fp; /* we free this temp program below */
-
-      if (key->drawpixels_z || key->drawpixels_stencil) {
-         fp = st_make_drawpix_z_stencil_program(st, key->drawpixels_z,
-                                                key->drawpixels_stencil);
-      }
-      else {
-         /* RGBA */
-         st_make_drawpix_fragment_program(st, &stfp->Base, &fp);
-         variant->parameters = _mesa_clone_parameter_list(fp->Base.Parameters);
-         deleteFP = GL_TRUE;
-      }
-      stfp = st_fragment_program(fp);
-   }
-
    if (!stfp->glsl_to_tgsi)
       _mesa_remove_output_reads(&stfp->Base.Base, PROGRAM_OUTPUT);
 
@@ -620,8 +600,7 @@ st_translate_fragment_program(struct st_context *st,
             interpLocation[slot] = TGSI_INTERPOLATE_LOC_CENTER;
 
          if (stfp->Base.Base.SystemValuesRead & (SYSTEM_BIT_SAMPLE_ID |
-                                                 SYSTEM_BIT_SAMPLE_POS) ||
-             key->persample_shading)
+                                                 SYSTEM_BIT_SAMPLE_POS))
             interpLocation[slot] = TGSI_INTERPOLATE_LOC_SAMPLE;
 
          switch (attr) {
@@ -805,10 +784,8 @@ st_translate_fragment_program(struct st_context *st,
    }
 
    ureg = ureg_create_with_screen(TGSI_PROCESSOR_FRAGMENT, st->pipe->screen);
-   if (ureg == NULL) {
-      free(variant);
-      return NULL;
-   }
+   if (ureg == NULL)
+      return false;
 
    if (ST_DEBUG & DEBUG_MESA) {
       _mesa_print_program(&stfp->Base.Base);
@@ -841,7 +818,7 @@ st_translate_fragment_program(struct st_context *st,
       }
    }
 
-   if (stfp->glsl_to_tgsi)
+   if (stfp->glsl_to_tgsi) {
       st_translate_program(st->ctx,
                            TGSI_PROCESSOR_FRAGMENT,
                            ureg,
@@ -860,9 +837,11 @@ st_translate_fragment_program(struct st_context *st,
                            outputMapping,
                            NULL,
                            fs_output_semantic_name,
-                           fs_output_semantic_index, FALSE,
-                           key->clamp_color );
-   else
+                           fs_output_semantic_index);
+
+      free_glsl_to_tgsi_visitor(stfp->glsl_to_tgsi);
+      stfp->glsl_to_tgsi = NULL;
+   } else
       st_translate_mesa_program(st->ctx,
                                 TGSI_PROCESSOR_FRAGMENT,
                                 ureg,
@@ -877,31 +856,134 @@ st_translate_fragment_program(struct st_context *st,
                                 fs_num_outputs,
                                 outputMapping,
                                 fs_output_semantic_name,
-                                fs_output_semantic_index, FALSE,
-                                key->clamp_color);
+                                fs_output_semantic_index);
+
+   stfp->tgsi.tokens = ureg_get_tokens(ureg, NULL);
+   ureg_destroy(ureg);
+   return stfp->tgsi.tokens != NULL;
+}
+/** Create the driver fragment shader for one variant key from stfp->tgsi. */
+static struct st_fp_variant *
+st_create_fp_variant(struct st_context *st,
+                     struct st_fragment_program *stfp,
+                     const struct st_fp_variant_key *key)
+{
+   struct pipe_context *pipe = st->pipe;
+   struct st_fp_variant *variant = CALLOC_STRUCT(st_fp_variant);
+   struct pipe_shader_state tgsi = {0};
+
+   if (!variant)
+      return NULL;
+
+   tgsi.tokens = stfp->tgsi.tokens; /* shared base tokens; never freed here */
 
-   variant->tgsi.tokens = ureg_get_tokens( ureg, NULL );
-   ureg_destroy( ureg );
+   assert(!(key->bitmap && key->drawpixels));
+
+   /* Emulate features via tgsi_emulate(); on failure keep the input tokens. */
+   if (key->clamp_color || key->persample_shading) {
+      const struct tgsi_token *tokens;
+      unsigned flags =
+         (key->clamp_color ? TGSI_EMU_CLAMP_COLOR_OUTPUTS : 0) |
+         (key->persample_shading ? TGSI_EMU_FORCE_PERSAMPLE_INTERP : 0);
+
+      tokens = tgsi_emulate(tgsi.tokens, flags);
+
+      if (tokens)
+         tgsi.tokens = tokens;
+      else
+         fprintf(stderr, "mesa: cannot emulate deprecated features\n");
+   }
+
+   /* glBitmap: use the lowest sampler bit that is clear in SamplersUsed. */
+   if (key->bitmap) {
+      const struct tgsi_token *tokens;
+
+      variant->bitmap_sampler = ffs(~stfp->Base.Base.SamplersUsed) - 1;
+
+      tokens = st_get_bitmap_shader(tgsi.tokens,
+                                    variant->bitmap_sampler,
+                                    st->needs_texcoord_semantic,
+                                    st->bitmap.tex_format ==
+                                    PIPE_FORMAT_L8_UNORM);
+
+      if (tokens) {
+         if (tgsi.tokens != stfp->tgsi.tokens) /* drop the emulated tokens */
+            tgsi_free_tokens(tgsi.tokens);
+         tgsi.tokens = tokens;
+         variant->parameters =
+            _mesa_clone_parameter_list(stfp->Base.Base.Parameters);
+      } else
+         fprintf(stderr, "mesa: cannot create a shader for glBitmap\n");
+   }
+
+   /* glDrawPixels (color only) */
+   if (key->drawpixels) {
+      const struct tgsi_token *tokens;
+      unsigned scale_const = 0, bias_const = 0, texcoord_const = 0;
+
+      /* Find the first unused slot; NOTE(review): yields -1 if none is free. */
+      variant->drawpix_sampler = ffs(~stfp->Base.Base.SamplersUsed) - 1;
+
+      if (key->pixelMaps) {
+         unsigned samplers_used = stfp->Base.Base.SamplersUsed |
+                                  (1 << variant->drawpix_sampler);
+
+         variant->pixelmap_sampler = ffs(~samplers_used) - 1; /* next unused */
+      }
+
+      variant->parameters =
+         _mesa_clone_parameter_list(stfp->Base.Base.Parameters); /* extended below */
+
+      if (key->scaleAndBias) {
+         static const gl_state_index scale_state[STATE_LENGTH] =
+            { STATE_INTERNAL, STATE_PT_SCALE };
+         static const gl_state_index bias_state[STATE_LENGTH] =
+            { STATE_INTERNAL, STATE_PT_BIAS };
+
+         scale_const = _mesa_add_state_reference(variant->parameters,
+                                                 scale_state);
+         bias_const = _mesa_add_state_reference(variant->parameters,
+                                                bias_state);
+      }
+
+      {
+         static const gl_state_index state[STATE_LENGTH] =
+            { STATE_INTERNAL, STATE_CURRENT_ATTRIB, VERT_ATTRIB_TEX0 };
+
+         texcoord_const = _mesa_add_state_reference(variant->parameters,
+                                                    state);
+      }
+
+      tokens = st_get_drawpix_shader(tgsi.tokens,
+                                     st->needs_texcoord_semantic,
+                                     key->scaleAndBias, scale_const,
+                                     bias_const, key->pixelMaps,
+                                     variant->drawpix_sampler,
+                                     variant->pixelmap_sampler,
+                                     texcoord_const);
+
+      if (tokens) {
+         if (tgsi.tokens != stfp->tgsi.tokens) /* drop the emulated tokens */
+            tgsi_free_tokens(tgsi.tokens);
+         tgsi.tokens = tokens;
+      } else
+         fprintf(stderr, "mesa: cannot create a shader for glDrawPixels\n");
+   }
 
    if (ST_DEBUG & DEBUG_TGSI) {
-      tgsi_dump(variant->tgsi.tokens, 0/*TGSI_DUMP_VERBOSE*/);
+      tgsi_dump(tgsi.tokens, 0);
       debug_printf("\n");
    }
 
    /* fill in variant */
-   variant->driver_shader = pipe->create_fs_state(pipe, &variant->tgsi);
+   variant->driver_shader = pipe->create_fs_state(pipe, &tgsi);
    variant->key = *key;
 
-   if (deleteFP) {
-      /* Free the temporary program made above */
-      struct gl_fragment_program *fp = &stfp->Base;
-      _mesa_reference_fragprog(st->ctx, &fp, NULL);
-   }
-
+   if (tgsi.tokens != stfp->tgsi.tokens) /* variant-local tokens only */
+      tgsi_free_tokens(tgsi.tokens);
    return variant;
 }
 
-
 /**
  * Translate fragment program if needed.
  */
@@ -921,7 +1003,7 @@ st_get_fp_variant(struct st_context *st,
 
    if (!fpv) {
       /* create new */
-      fpv = st_translate_fragment_program(st, stfp, key);
+      fpv = st_create_fp_variant(st, stfp, key);
       if (fpv) {
          /* insert into list */
          fpv->next = stfp->variants;
@@ -1191,9 +1273,7 @@ st_translate_program_common(struct st_context *st,
                         outputMapping,
                         outputSlotToAttr,
                         output_semantic_name,
-                        output_semantic_index,
-                        FALSE,
-                        FALSE);
+                        output_semantic_index);
 
    out_state->tokens = ureg_get_tokens(ureg, NULL);
    ureg_destroy(ureg);
@@ -1217,19 +1297,15 @@ st_translate_program_common(struct st_context *st,
 /**
- * Translate a geometry program to create a new variant.
+ * Translate a geometry program to TGSI, kept in stgp->tgsi.
  */
-static struct st_gp_variant *
+bool
 st_translate_geometry_program(struct st_context *st,
-                              struct st_geometry_program *stgp,
-                              const struct st_gp_variant_key *key)
+                              struct st_geometry_program *stgp)
 {
-   struct pipe_context *pipe = st->pipe;
    struct ureg_program *ureg;
-   struct st_gp_variant *gpv;
-   struct pipe_shader_state state;
 
    ureg = ureg_create_with_screen(TGSI_PROCESSOR_GEOMETRY, st->pipe->screen);
    if (ureg == NULL)
-      return NULL;
+      return false;
 
    ureg_property(ureg, TGSI_PROPERTY_GS_INPUT_PRIM, stgp->Base.InputType);
    ureg_property(ureg, TGSI_PROPERTY_GS_OUTPUT_PRIM, stgp->Base.OutputType);
@@ -1238,19 +1314,29 @@ st_translate_geometry_program(struct st_context *st,
    ureg_property(ureg, TGSI_PROPERTY_GS_INVOCATIONS, stgp->Base.Invocations);
 
    st_translate_program_common(st, &stgp->Base.Base, stgp->glsl_to_tgsi, ureg,
-                               TGSI_PROCESSOR_GEOMETRY, &state);
+                               TGSI_PROCESSOR_GEOMETRY, &stgp->tgsi);
+
+   free_glsl_to_tgsi_visitor(stgp->glsl_to_tgsi);
+   stgp->glsl_to_tgsi = NULL;
+   return true;
+}
+
+/** Create a geometry program variant from the tokens held in stgp->tgsi. */
+static struct st_gp_variant *
+st_create_gp_variant(struct st_context *st,
+                     struct st_geometry_program *stgp,
+                     const struct st_gp_variant_key *key)
+{
+   struct pipe_context *pipe = st->pipe;
+   struct st_gp_variant *gpv;
 
    gpv = CALLOC_STRUCT(st_gp_variant);
-   if (!gpv) {
-      ureg_free_tokens(state.tokens);
+   if (!gpv) /* allocation failed; stgp->tgsi is left untouched */
       return NULL;
-   }
 
    /* fill in new variant */
-   gpv->driver_shader = pipe->create_gs_state(pipe, &state);
+   gpv->driver_shader = pipe->create_gs_state(pipe, &stgp->tgsi); /* tokens not freed here */
    gpv->key = *key;
-
-   ureg_free_tokens(state.tokens);
    return gpv;
 }
 
@@ -1274,7 +1360,7 @@ st_get_gp_variant(struct st_context *st,
 
    if (!gpv) {
       /* create new */
-      gpv = st_translate_geometry_program(st, stgp, key);
+      gpv = st_create_gp_variant(st, stgp, key);
       if (gpv) {
          /* insert into list */
          gpv->next = stgp->variants;
@@ -1289,38 +1375,43 @@ st_get_gp_variant(struct st_context *st,
 /**
- * Translate a tessellation control program to create a new variant.
+ * Translate a tessellation control program to TGSI, kept in sttcp->tgsi.
  */
-static struct st_tcp_variant *
+bool
 st_translate_tessctrl_program(struct st_context *st,
-                              struct st_tessctrl_program *sttcp,
-                              const struct st_tcp_variant_key *key)
+                              struct st_tessctrl_program *sttcp)
 {
-   struct pipe_context *pipe = st->pipe;
    struct ureg_program *ureg;
-   struct st_tcp_variant *tcpv;
-   struct pipe_shader_state state;
 
-   ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_CTRL, pipe->screen);
-   if (ureg == NULL) {
-      return NULL;
-   }
+   ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_CTRL, st->pipe->screen);
+   if (ureg == NULL) /* nothing has been modified yet */
+      return false;
 
    ureg_property(ureg, TGSI_PROPERTY_TCS_VERTICES_OUT,
                  sttcp->Base.VerticesOut);
 
    st_translate_program_common(st, &sttcp->Base.Base, sttcp->glsl_to_tgsi,
-                               ureg, TGSI_PROCESSOR_TESS_CTRL, &state);
+                               ureg, TGSI_PROCESSOR_TESS_CTRL, &sttcp->tgsi);
+
+   free_glsl_to_tgsi_visitor(sttcp->glsl_to_tgsi); /* released after translation */
+   sttcp->glsl_to_tgsi = NULL;
+   return true; /* NOTE(review): sttcp->tgsi.tokens is not checked for NULL */
+}
+
+/** Create a tess control program variant from the tokens in sttcp->tgsi. */
+static struct st_tcp_variant *
+st_create_tcp_variant(struct st_context *st,
+                      struct st_tessctrl_program *sttcp,
+                      const struct st_tcp_variant_key *key)
+{
+   struct pipe_context *pipe = st->pipe;
+   struct st_tcp_variant *tcpv;
 
    tcpv = CALLOC_STRUCT(st_tcp_variant);
-   if (!tcpv) {
-      ureg_free_tokens(state.tokens);
+   if (!tcpv) /* allocation failed; sttcp->tgsi is left untouched */
       return NULL;
-   }
 
    /* fill in new variant */
-   tcpv->driver_shader = pipe->create_tcs_state(pipe, &state);
+   tcpv->driver_shader = pipe->create_tcs_state(pipe, &sttcp->tgsi); /* tokens not freed here */
    tcpv->key = *key;
-
-   ureg_free_tokens(state.tokens);
    return tcpv;
 }
 
@@ -1344,7 +1435,7 @@ st_get_tcp_variant(struct st_context *st,
 
    if (!tcpv) {
       /* create new */
-      tcpv = st_translate_tessctrl_program(st, sttcp, key);
+      tcpv = st_create_tcp_variant(st, sttcp, key);
       if (tcpv) {
          /* insert into list */
          tcpv->next = sttcp->variants;
@@ -1359,20 +1450,15 @@ st_get_tcp_variant(struct st_context *st,
 /**
- * Translate a tessellation evaluation program to create a new variant.
+ * Translate a tessellation evaluation program to TGSI, kept in sttep->tgsi.
  */
-static struct st_tep_variant *
+bool
 st_translate_tesseval_program(struct st_context *st,
-                              struct st_tesseval_program *sttep,
-                              const struct st_tep_variant_key *key)
+                              struct st_tesseval_program *sttep)
 {
-   struct pipe_context *pipe = st->pipe;
    struct ureg_program *ureg;
-   struct st_tep_variant *tepv;
-   struct pipe_shader_state state;
 
-   ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_EVAL, pipe->screen);
-   if (ureg == NULL) {
-      return NULL;
-   }
+   ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_EVAL, st->pipe->screen);
+   if (ureg == NULL)
+      return false;
 
    if (sttep->Base.PrimitiveMode == GL_ISOLINES)
       ureg_property(ureg, TGSI_PROPERTY_TES_PRIM_MODE, GL_LINES);
@@ -1400,19 +1486,29 @@ st_translate_tesseval_program(struct st_context *st,
    ureg_property(ureg, TGSI_PROPERTY_TES_POINT_MODE, sttep->Base.PointMode);
 
    st_translate_program_common(st, &sttep->Base.Base, sttep->glsl_to_tgsi,
-                               ureg, TGSI_PROCESSOR_TESS_EVAL, &state);
+                               ureg, TGSI_PROCESSOR_TESS_EVAL, &sttep->tgsi);
+
+   free_glsl_to_tgsi_visitor(sttep->glsl_to_tgsi);
+   sttep->glsl_to_tgsi = NULL;
+   return true;
+}
+
+/** Create a tess evaluation program variant from the tokens in sttep->tgsi. */
+static struct st_tep_variant *
+st_create_tep_variant(struct st_context *st,
+                      struct st_tesseval_program *sttep,
+                      const struct st_tep_variant_key *key)
+{
+   struct pipe_context *pipe = st->pipe;
+   struct st_tep_variant *tepv;
 
    tepv = CALLOC_STRUCT(st_tep_variant);
-   if (!tepv) {
-      ureg_free_tokens(state.tokens);
+   if (!tepv) /* allocation failed; sttep->tgsi is left untouched */
       return NULL;
-   }
 
    /* fill in new variant */
-   tepv->driver_shader = pipe->create_tes_state(pipe, &state);
+   tepv->driver_shader = pipe->create_tes_state(pipe, &sttep->tgsi); /* tokens not freed here */
    tepv->key = *key;
-
-   ureg_free_tokens(state.tokens);
    return tepv;
 }
 
@@ -1436,7 +1532,7 @@ st_get_tep_variant(struct st_context *st,
 
    if (!tepv) {
       /* create new */
-      tepv = st_translate_tesseval_program(st, sttep, key);
+      tepv = st_create_tep_variant(st, sttep, key);
       if (tepv) {
          /* insert into list */
          tepv->next = sttep->variants;
diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h
index 7013993fe38..d9b53ac008c 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -59,8 +59,6 @@ struct st_fp_variant_key
    GLuint drawpixels:1;           /**< glDrawPixels variant */
    GLuint scaleAndBias:1;         /**< glDrawPixels w/ scale and/or bias? */
    GLuint pixelMaps:1;            /**< glDrawPixels w/ pixel lookup map? */
-   GLuint drawpixels_z:1;         /**< glDrawPixels(GL_DEPTH) */
-   GLuint drawpixels_stencil:1;   /**< glDrawPixels(GL_STENCIL) */
 
    /** for ARB_color_buffer_float */
    GLuint clamp_color:1;
@@ -78,8 +76,6 @@ struct st_fp_variant
    /** Parameters which generated this version of fragment program */
    struct st_fp_variant_key key;
 
-   struct pipe_shader_state tgsi;
-
    /** Driver's compiled shader */
    void *driver_shader;
 
@@ -87,6 +83,10 @@ struct st_fp_variant
    struct gl_program_parameter_list *parameters;
    uint bitmap_sampler;
 
+   /** For glDrawPixels variants */
+   unsigned drawpix_sampler;
+   unsigned pixelmap_sampler;
+
    /** next in linked list */
    struct st_fp_variant *next;
 };
@@ -98,6 +98,7 @@ struct st_fp_variant
 struct st_fragment_program
 {
    struct gl_fragment_program Base;
+   struct pipe_shader_state tgsi;
    struct glsl_to_tgsi_visitor* glsl_to_tgsi;
 
    struct st_fp_variant *variants;
@@ -153,20 +154,16 @@ struct st_vp_variant
 struct st_vertex_program
 {
    struct gl_vertex_program Base;  /**< The Mesa vertex program */
+   struct pipe_shader_state tgsi;
    struct glsl_to_tgsi_visitor* glsl_to_tgsi;
 
    /** maps a Mesa VERT_ATTRIB_x to a packed TGSI input index */
-   GLuint input_to_index[VERT_ATTRIB_MAX];
    /** maps a TGSI input index back to a Mesa VERT_ATTRIB_x */
    GLuint index_to_input[PIPE_MAX_SHADER_INPUTS];
    GLuint num_inputs;
 
    /** Maps VARYING_SLOT_x to slot */
    GLuint result_to_output[VARYING_SLOT_MAX];
-   GLuint output_slot_to_attr[VARYING_SLOT_MAX];
-   ubyte output_semantic_name[VARYING_SLOT_MAX];
-   ubyte output_semantic_index[VARYING_SLOT_MAX];
-   GLuint num_outputs;
 
    /** List of translated variants of this vertex program.
     */
@@ -203,6 +200,7 @@ struct st_gp_variant
 struct st_geometry_program
 {
    struct gl_geometry_program Base;  /**< The Mesa geometry program */
+   struct pipe_shader_state tgsi;
    struct glsl_to_tgsi_visitor* glsl_to_tgsi;
 
    struct st_gp_variant *variants;
@@ -238,6 +236,7 @@ struct st_tcp_variant
 struct st_tessctrl_program
 {
    struct gl_tess_ctrl_program Base;  /**< The Mesa tess ctrl program */
+   struct pipe_shader_state tgsi;
    struct glsl_to_tgsi_visitor* glsl_to_tgsi;
 
    struct st_tcp_variant *variants;
@@ -273,6 +272,7 @@ struct st_tep_variant
 struct st_tesseval_program
 {
    struct gl_tess_eval_program Base;  /**< The Mesa tess eval program */
+   struct pipe_shader_state tgsi;
    struct glsl_to_tgsi_visitor* glsl_to_tgsi;
 
    struct st_tep_variant *variants;
@@ -414,16 +414,6 @@ st_get_tep_variant(struct st_context *st,
                    struct st_tesseval_program *stgp,
                    const struct st_tep_variant_key *key);
 
-
-extern void
-st_prepare_vertex_program(struct gl_context *ctx,
-                          struct st_vertex_program *stvp);
-
-extern GLboolean
-st_prepare_fragment_program(struct gl_context *ctx,
-                            struct st_fragment_program *stfp);
-
-
 extern void
 st_release_vp_variants( struct st_context *st,
                         struct st_vertex_program *stvp );
@@ -447,6 +437,25 @@ st_release_tep_variants(struct st_context *st,
 extern void
 st_destroy_program_variants(struct st_context *st);
 
+extern bool
+st_translate_vertex_program(struct st_context *st,
+                            struct st_vertex_program *stvp);
+
+extern bool
+st_translate_fragment_program(struct st_context *st,
+                              struct st_fragment_program *stfp);
+
+extern bool
+st_translate_geometry_program(struct st_context *st,
+                              struct st_geometry_program *stgp);
+
+extern bool
+st_translate_tessctrl_program(struct st_context *st,
+                              struct st_tessctrl_program *sttcp);
+
+extern bool
+st_translate_tesseval_program(struct st_context *st,
+                              struct st_tesseval_program *sttep);
 
 extern void
 st_print_current_vertex_program(void);
diff --git a/src/mesa/tnl/t_draw.c b/src/mesa/tnl/t_draw.c
index c130ab3f93d..6f29abbe1ba 100644
--- a/src/mesa/tnl/t_draw.c
+++ b/src/mesa/tnl/t_draw.c
@@ -35,6 +35,7 @@
 #include "main/mtypes.h"
 #include "main/macros.h"
 #include "main/enums.h"
+#include "util/half_float.h"
 
 #include "t_context.h"
 #include "tnl.h"
diff --git a/src/mesa/vbo/vbo_context.c b/src/mesa/vbo/vbo_context.c
index e3eb286e482..5e1a760eb2c 100644
--- a/src/mesa/vbo/vbo_context.c
+++ b/src/mesa/vbo/vbo_context.c
@@ -33,7 +33,6 @@
 #include "vbo.h"
 #include "vbo_context.h"
 
-#define NR_MAT_ATTRIBS 12
 
 static GLuint check_size( const GLfloat *attr )
 {
@@ -44,32 +43,47 @@ static GLuint check_size( const GLfloat *attr )
 }
 
 
+/**
+ * Reset *cl to an enabled, zero-stride GL_FLOAT array of 'size' components.
+ */
+static void
+init_array(struct gl_context *ctx, struct gl_client_array *cl,
+           unsigned size, const void *pointer)
+{
+   memset(cl, 0, sizeof(*cl)); /* also clears every field not set below */
+
+   cl->Size = size;
+   cl->Type = GL_FLOAT;
+   cl->Format = GL_RGBA;
+   cl->Stride = 0;
+   cl->StrideB = 0;
+   cl->_ElementSize = cl->Size * sizeof(GLfloat); /* in bytes */
+   cl->Ptr = pointer; /* caller-provided storage; not copied */
+   cl->Enabled = 1;
+
+   _mesa_reference_buffer_object(ctx, &cl->BufferObj,
+                                 ctx->Shared->NullBufferObj); /* shared null object */
+}
+
+
+/**
+ * Point the first VERT_ATTRIB_FF_MAX vbo->currval arrays at the context's
+ * current attribute values, sized by check_size() (with strides = 0).
+ */
 static void init_legacy_currval(struct gl_context *ctx)
 {
    struct vbo_context *vbo = vbo_context(ctx);
-   struct gl_client_array *arrays = &vbo->currval[VBO_ATTRIB_POS];
    GLuint i;
 
-   memset(arrays, 0, sizeof(*arrays) * VERT_ATTRIB_FF_MAX);
-
    /* Set up a constant (StrideB == 0) array for each current
     * attribute:
     */
    for (i = 0; i < VERT_ATTRIB_FF_MAX; i++) {
-      struct gl_client_array *cl = &arrays[i];
+      struct gl_client_array *cl = &vbo->currval[VERT_ATTRIB_FF(i)];
 
-      /* Size will have to be determined at runtime:
-       */
-      cl->Size = check_size(ctx->Current.Attrib[i]);
-      cl->Stride = 0;
-      cl->StrideB = 0;
-      cl->Enabled = 1;
-      cl->Type = GL_FLOAT;
-      cl->Format = GL_RGBA;
-      cl->Ptr = (const void *)ctx->Current.Attrib[i];
-      cl->_ElementSize = cl->Size * sizeof(GLfloat);
-      _mesa_reference_buffer_object(ctx, &cl->BufferObj,
-                                    ctx->Shared->NullBufferObj);
+      init_array(ctx, cl,
+                 check_size(ctx->Current.Attrib[i]),
+                 ctx->Current.Attrib[i]); /* NOTE(review): plain i, not VERT_ATTRIB_FF(i) */
    }
 }
 
@@ -77,26 +91,12 @@ static void init_legacy_currval(struct gl_context *ctx)
 static void init_generic_currval(struct gl_context *ctx)
 {
    struct vbo_context *vbo = vbo_context(ctx);
-   struct gl_client_array *arrays = &vbo->currval[VBO_ATTRIB_GENERIC0];
    GLuint i;
 
-   memset(arrays, 0, sizeof(*arrays) * VERT_ATTRIB_GENERIC_MAX);
-
    for (i = 0; i < VERT_ATTRIB_GENERIC_MAX; i++) {
-      struct gl_client_array *cl = &arrays[i];
+      struct gl_client_array *cl = &vbo->currval[VBO_ATTRIB_GENERIC0 + i];
 
-      /* This will have to be determined at runtime:
-       */
-      cl->Size = 1;
-      cl->Type = GL_FLOAT;
-      cl->Format = GL_RGBA;
-      cl->Ptr = (const void *)ctx->Current.Attrib[VERT_ATTRIB_GENERIC0 + i];
-      cl->Stride = 0;
-      cl->StrideB = 0;
-      cl->Enabled = 1;
-      cl->_ElementSize = cl->Size * sizeof(GLfloat);
-      _mesa_reference_buffer_object(ctx, &cl->BufferObj,
-                                    ctx->Shared->NullBufferObj);
+      init_array(ctx, cl, 1, ctx->Current.Attrib[VERT_ATTRIB_GENERIC0 + i]);
    }
 }
 
@@ -104,46 +104,34 @@ static void init_generic_currval(struct gl_context *ctx)
 static void init_mat_currval(struct gl_context *ctx)
 {
    struct vbo_context *vbo = vbo_context(ctx);
-   struct gl_client_array *arrays =
-      &vbo->currval[VBO_ATTRIB_MAT_FRONT_AMBIENT];
    GLuint i;
 
-   assert(NR_MAT_ATTRIBS == MAT_ATTRIB_MAX);
-
-   memset(arrays, 0, sizeof(*arrays) * NR_MAT_ATTRIBS);
-
    /* Set up a constant (StrideB == 0) array for each current
     * attribute:
     */
-   for (i = 0; i < NR_MAT_ATTRIBS; i++) {
-      struct gl_client_array *cl = &arrays[i];
+   for (i = 0; i < MAT_ATTRIB_MAX; i++) {
+      struct gl_client_array *cl =
+         &vbo->currval[VBO_ATTRIB_MAT_FRONT_AMBIENT + i];
+      unsigned size;
 
       /* Size is fixed for the material attributes, for others will
        * be determined at runtime:
        */
-      switch (i - VERT_ATTRIB_GENERIC0) {
+      switch (i) {
       case MAT_ATTRIB_FRONT_SHININESS:
       case MAT_ATTRIB_BACK_SHININESS:
-	 cl->Size = 1;
-	 break;
+         size = 1;
+         break;
       case MAT_ATTRIB_FRONT_INDEXES:
       case MAT_ATTRIB_BACK_INDEXES:
-	 cl->Size = 3;
-	 break;
+         size = 3;
+         break;
       default:
-	 cl->Size = 4;
-	 break;
+         size = 4;
+         break;
       }
 
-      cl->Ptr = (const void *)ctx->Light.Material.Attrib[i];
-      cl->Type = GL_FLOAT;
-      cl->Format = GL_RGBA;
-      cl->Stride = 0;
-      cl->StrideB = 0;
-      cl->Enabled = 1;
-      cl->_ElementSize = cl->Size * sizeof(GLfloat);
-      _mesa_reference_buffer_object(ctx, &cl->BufferObj,
-                                    ctx->Shared->NullBufferObj);
+      init_array(ctx, cl, size, ctx->Light.Material.Attrib[i]);
    }
 }
 
@@ -175,7 +163,7 @@ GLboolean _vbo_CreateContext( struct gl_context *ctx )
       for (i = 0; i < ARRAY_SIZE(vbo->map_vp_none); i++) 
 	 vbo->map_vp_none[i] = i;
       /* map material attribs to generic slots */
-      for (i = 0; i < NR_MAT_ATTRIBS; i++) 
+      for (i = 0; i < MAT_ATTRIB_MAX; i++)
 	 vbo->map_vp_none[VERT_ATTRIB_GENERIC(i)]
             = VBO_ATTRIB_MAT_FRONT_AMBIENT + i;
 
diff --git a/src/mesa/vbo/vbo_exec.h b/src/mesa/vbo/vbo_exec.h
index 80f3015925d..00378eb7984 100644
--- a/src/mesa/vbo/vbo_exec.h
+++ b/src/mesa/vbo/vbo_exec.h
@@ -79,7 +79,7 @@ struct vbo_exec_copied_vtx {
 
 struct vbo_exec_context
 {
-   struct gl_context *ctx;   
+   struct gl_context *ctx;
    GLvertexformat vtxfmt;
    GLvertexformat vtxfmt_noop;
    GLboolean validating; /**< if we're in the middle of state validation */
@@ -97,15 +97,17 @@ struct vbo_exec_context
       GLuint   buffer_used;             /* in bytes */
       fi_type vertex[VBO_ATTRIB_MAX*4]; /* current vertex */
 
-      GLuint vert_count;
-      GLuint max_vert;
+      GLuint vert_count;   /**< Number of vertices currently in buffer */
+      GLuint max_vert;     /**< Max number of vertices allowed in buffer */
       struct vbo_exec_copied_vtx copied;
 
-      GLubyte attrsz[VBO_ATTRIB_MAX];
-      GLenum attrtype[VBO_ATTRIB_MAX];
-      GLubyte active_sz[VBO_ATTRIB_MAX];
+      GLubyte attrsz[VBO_ATTRIB_MAX];   /**< nr. of attrib components (1..4) */
+      GLenum attrtype[VBO_ATTRIB_MAX];  /**< GL_FLOAT, GL_DOUBLE, GL_INT, etc */
+      GLubyte active_sz[VBO_ATTRIB_MAX];  /**< attrib size (nr. 32-bit words) */
 
+      /** pointers into the current 'vertex' array, declared above */
       fi_type *attrptr[VBO_ATTRIB_MAX];
+
       struct gl_client_array arrays[VERT_ATTRIB_MAX];
 
       /* According to program mode, the values above plus current
@@ -115,7 +117,6 @@ struct vbo_exec_context
       const struct gl_client_array *inputs[VERT_ATTRIB_MAX];
    } vtx;
 
-   
    struct {
       GLboolean recalculate_maps;
       struct vbo_exec_eval1_map map1[VERT_ATTRIB_MAX];
@@ -131,7 +132,7 @@ struct vbo_exec_context
       GLboolean recalculate_inputs;
    } array;
 
-   /* Which flags to set in vbo_exec_BeginVertices() */
+   /* Which flags to set in vbo_exec_begin_vertices() */
    GLbitfield begin_vertices_flags;
 
 #ifdef DEBUG
@@ -147,8 +148,6 @@ void vbo_exec_init( struct gl_context *ctx );
 void vbo_exec_destroy( struct gl_context *ctx );
 void vbo_exec_invalidate_state( struct gl_context *ctx, GLuint new_state );
 
-void vbo_exec_BeginVertices( struct gl_context *ctx );
-
 
 /* Internal functions:
  */
diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c
index 583a2f9b79f..7ae08fe3062 100644
--- a/src/mesa/vbo/vbo_exec_api.c
+++ b/src/mesa/vbo/vbo_exec_api.c
@@ -375,13 +375,16 @@ vbo_exec_wrap_upgrade_vertex(struct vbo_exec_context *exec,
  * This is when a vertex attribute transitions to a different size.
  * For example, we saw a bunch of glTexCoord2f() calls and now we got a
  * glTexCoord4f() call.  We promote the array from size=2 to size=4.
+ * \param newSize  new size of the attribute (number of 32-bit words).
  */
 static void
-vbo_exec_fixup_vertex(struct gl_context *ctx, GLuint attr, GLuint newSize, GLenum newType)
+vbo_exec_fixup_vertex(struct gl_context *ctx, GLuint attr,
+                      GLuint newSize, GLenum newType)
 {
    struct vbo_exec_context *exec = &vbo_context(ctx)->exec;
 
-   if (newSize > exec->vtx.attrsz[attr] || newType != exec->vtx.attrtype[attr]) {
+   if (newSize > exec->vtx.attrsz[attr] ||
+       newType != exec->vtx.attrtype[attr]) {
       /* New size is larger.  Need to flush existing vertices and get
        * an enlarged vertex format.
        */
@@ -411,20 +414,49 @@ vbo_exec_fixup_vertex(struct gl_context *ctx, GLuint attr, GLuint newSize, GLenu
 
 
 /**
+ * Called upon first glVertex, glColor, glTexCoord, etc.
+ */
+static void
+vbo_exec_begin_vertices(struct gl_context *ctx)
+{
+   struct vbo_exec_context *exec = &vbo_context(ctx)->exec;
+
+   vbo_exec_vtx_map( exec );
+
+   assert((ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT) == 0);
+   assert(exec->begin_vertices_flags);
+
+   ctx->Driver.NeedFlush |= exec->begin_vertices_flags;
+}
+
+
+/**
  * This macro is used to implement all the glVertex, glColor, glTexCoord,
  * glVertexAttrib, etc functions.
+ * \param A  attribute index
+ * \param N  attribute size (1..4)
+ * \param T  type (GL_FLOAT, GL_DOUBLE, GL_INT, GL_UNSIGNED_INT)
+ * \param C  cast type (fi_type or double)
+ * \param V0, V1, V2, V3  attribute value
  */
 #define ATTR_UNION( A, N, T, C, V0, V1, V2, V3 )                        \
 do {									\
    struct vbo_exec_context *exec = &vbo_context(ctx)->exec;		\
    int sz = (sizeof(C) / sizeof(GLfloat));                              \
-   if (unlikely(!(ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT)))	\
-      vbo_exec_BeginVertices(ctx);					\
                                                                         \
+   assert(sz == 1 || sz == 2);                                          \
+                                                                        \
+   if (unlikely(!(ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT))) {     \
+      vbo_exec_begin_vertices(ctx);					\
+   }									\
+                                                                        \
+   /* check if attribute size or type is changing */                    \
    if (unlikely(exec->vtx.active_sz[A] != N * sz) ||                    \
-       unlikely(exec->vtx.attrtype[A] != T))                            \
+       unlikely(exec->vtx.attrtype[A] != T)) {                          \
       vbo_exec_fixup_vertex(ctx, A, N * sz, T);                         \
+   }									\
                                                                         \
+   /* store attribute value in the current vertex */                     \
    {									\
       C *dest = (C *)exec->vtx.attrptr[A];                              \
       if (N>0) dest[0] = V0;						\
@@ -438,6 +470,7 @@ do {									\
       /* This is a glVertex call */					\
       GLuint i;								\
 									\
+      /* copy 32-bit words */                                           \
       for (i = 0; i < exec->vtx.vertex_size; i++)			\
 	 exec->vtx.buffer_ptr[i] = exec->vtx.vertex[i];			\
 									\
@@ -1149,22 +1182,6 @@ void vbo_exec_vtx_destroy( struct vbo_exec_context *exec )
 
 
 /**
- * Called upon first glVertex, glColor, glTexCoord, etc.
- */
-void vbo_exec_BeginVertices( struct gl_context *ctx )
-{
-   struct vbo_exec_context *exec = &vbo_context(ctx)->exec;
-
-   vbo_exec_vtx_map( exec );
-
-   assert((ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT) == 0);
-   assert(exec->begin_vertices_flags);
-
-   ctx->Driver.NeedFlush |= exec->begin_vertices_flags;
-}
-
-
-/**
  * If inside glBegin()/glEnd(), it should assert(0).  Otherwise, if
  * FLUSH_STORED_VERTICES bit in \p flags is set flushes any buffered
  * vertices, if FLUSH_UPDATE_CURRENT bit is set updates
@@ -1197,7 +1214,7 @@ void vbo_exec_FlushVertices( struct gl_context *ctx, GLuint flags )
    /* Flush (draw), and make sure VBO is left unmapped when done */
    vbo_exec_FlushVertices_internal(exec, GL_TRUE);
 
-   /* Need to do this to ensure vbo_exec_BeginVertices gets called again:
+   /* Need to do this to ensure vbo_exec_begin_vertices gets called again:
     */
    ctx->Driver.NeedFlush &= ~(FLUSH_UPDATE_CURRENT | flags);
 
diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c
index 2bfb0c32b73..174cbc37c26 100644
--- a/src/mesa/vbo/vbo_exec_draw.c
+++ b/src/mesa/vbo/vbo_exec_draw.c
@@ -53,10 +53,10 @@ vbo_exec_debug_verts( struct vbo_exec_context *exec )
    for (i = 0 ; i < exec->vtx.prim_count ; i++) {
       struct _mesa_prim *prim = &exec->vtx.prim[i];
       printf("   prim %d: %s%s %d..%d %s %s\n",
-	     i, 
+	     i,
 	     _mesa_lookup_prim_by_nr(prim->mode),
 	     prim->weak ? " (weak)" : "",
-	     prim->start, 
+	     prim->start,
 	     prim->start + prim->count,
 	     prim->begin ? "BEGIN" : "(wrap)",
 	     prim->end ? "END" : "(wrap)");
@@ -79,7 +79,6 @@ vbo_copy_vertices( struct vbo_exec_context *exec )
                          exec->vtx.prim[exec->vtx.prim_count-1].start * 
                          exec->vtx.vertex_size);
 
-
    switch (exec->ctx->Driver.CurrentExecPrimitive) {
    case GL_POINTS:
       return 0;
@@ -219,7 +218,7 @@ vbo_exec_bind_arrays( struct gl_context *ctx )
          exec->vtx.inputs[attr] = &arrays[attr];
 
          if (_mesa_is_bufferobj(exec->vtx.bufferobj)) {
-            /* a real buffer obj: Ptr is an offset, not a pointer*/
+            /* a real buffer obj: Ptr is an offset, not a pointer */
             assert(exec->vtx.bufferobj->Mappings[MAP_INTERNAL].Pointer);
             assert(offset >= 0);
             arrays[attr].Ptr = (GLubyte *)
@@ -259,7 +258,7 @@ vbo_exec_vtx_unmap( struct vbo_exec_context *exec )
 {
    if (_mesa_is_bufferobj(exec->vtx.bufferobj)) {
       struct gl_context *ctx = exec->ctx;
-      
+
       if (ctx->Driver.FlushMappedBufferRange) {
          GLintptr offset = exec->vtx.buffer_used -
                            exec->vtx.bufferobj->Mappings[MAP_INTERNAL].Offset;
@@ -277,7 +276,7 @@ vbo_exec_vtx_unmap( struct vbo_exec_context *exec )
 
       assert(exec->vtx.buffer_used <= VBO_VERT_BUFFER_SIZE);
       assert(exec->vtx.buffer_ptr != NULL);
-      
+
       ctx->Driver.UnmapBuffer(ctx, exec->vtx.bufferobj, MAP_INTERNAL);
       exec->vtx.buffer_map = NULL;
       exec->vtx.buffer_ptr = NULL;
@@ -299,7 +298,7 @@ vbo_exec_vtx_map( struct vbo_exec_context *exec )
                               GL_MAP_FLUSH_EXPLICIT_BIT |
                               MESA_MAP_NOWAIT_BIT;
    const GLenum usage = GL_STREAM_DRAW_ARB;
-   
+
    if (!_mesa_is_bufferobj(exec->vtx.bufferobj))
       return;
 
@@ -323,7 +322,7 @@ vbo_exec_vtx_map( struct vbo_exec_context *exec )
          exec->vtx.buffer_ptr = exec->vtx.buffer_map = NULL;
       }
    }
-   
+
    if (!exec->vtx.buffer_map) {
       /* Need to allocate a new VBO */
       exec->vtx.buffer_used = 0;
@@ -381,14 +380,14 @@ vbo_exec_vtx_flush(struct vbo_exec_context *exec, GLboolean keepUnmapped)
    if (0)
       vbo_exec_debug_verts( exec );
 
-   if (exec->vtx.prim_count && 
+   if (exec->vtx.prim_count &&
        exec->vtx.vert_count) {
 
-      exec->vtx.copied.nr = vbo_copy_vertices( exec ); 
+      exec->vtx.copied.nr = vbo_copy_vertices( exec );
 
       if (exec->vtx.copied.nr != exec->vtx.vert_count) {
 	 struct gl_context *ctx = exec->ctx;
-	 
+
 	 /* Before the update_state() as this may raise _NEW_VARYING_VP_INPUTS
           * from _mesa_set_varying_vp_inputs().
 	  */
@@ -405,7 +404,7 @@ vbo_exec_vtx_flush(struct vbo_exec_context *exec, GLboolean keepUnmapped)
             printf("%s %d %d\n", __func__, exec->vtx.prim_count,
 		   exec->vtx.vert_count);
 
-	 vbo_context(ctx)->draw_prims( ctx, 
+	 vbo_context(ctx)->draw_prims( ctx,
 				       exec->vtx.prim,
 				       exec->vtx.prim_count,
 				       NULL,
@@ -433,7 +432,7 @@ vbo_exec_vtx_flush(struct vbo_exec_context *exec, GLboolean keepUnmapped)
    if (keepUnmapped || exec->vtx.vertex_size == 0)
       exec->vtx.max_vert = 0;
    else
-      exec->vtx.max_vert = ((VBO_VERT_BUFFER_SIZE - exec->vtx.buffer_used) / 
+      exec->vtx.max_vert = ((VBO_VERT_BUFFER_SIZE - exec->vtx.buffer_used) /
                             (exec->vtx.vertex_size * sizeof(GLfloat)));
 
    exec->vtx.buffer_ptr = exec->vtx.buffer_map;
diff --git a/src/mesa/vbo/vbo_save_api.c b/src/mesa/vbo/vbo_save_api.c
index 1a70d168c55..fdc677f9a07 100644
--- a/src/mesa/vbo/vbo_save_api.c
+++ b/src/mesa/vbo/vbo_save_api.c
@@ -648,7 +648,8 @@ _save_upgrade_vertex(struct gl_context *ctx, GLuint attr, GLuint newsz)
 
    /* Recalculate all the attrptr[] values:
     */
-   for (i = 0, tmp = save->vertex; i < VBO_ATTRIB_MAX; i++) {
+   tmp = save->vertex;
+   for (i = 0; i < VBO_ATTRIB_MAX; i++) {
       if (save->attrsz[i]) {
          save->attrptr[i] = tmp;
          tmp += save->attrsz[i];
@@ -1543,7 +1544,7 @@ vbo_print_vertex_list(struct gl_context *ctx, void *data, FILE *f)
       node->vertex_store->bufferobj : NULL;
    (void) ctx;
 
-   fprintf(f, "VBO-VERTEX-LIST, %u vertices %d primitives, %d vertsize "
+   fprintf(f, "VBO-VERTEX-LIST, %u vertices, %d primitives, %d vertsize, "
            "buffer %p\n",
            node->count, node->prim_count, node->vertex_size,
            buffer);
diff --git a/src/util/Makefile.sources b/src/util/Makefile.sources
index e45431d1de8..a87114601c8 100644
--- a/src/util/Makefile.sources
+++ b/src/util/Makefile.sources
@@ -3,6 +3,8 @@ MESA_UTIL_FILES :=	\
 	debug.c \
 	debug.h \
 	format_srgb.h \
+	half_float.c \
+	half_float.h \
 	hash_table.c	\
 	hash_table.h \
 	list.h \
diff --git a/src/util/half_float.c b/src/util/half_float.c
new file mode 100644
index 00000000000..4df64c2ccf9
--- /dev/null
+++ b/src/util/half_float.c
@@ -0,0 +1,177 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <math.h>
+#include <assert.h>
+#include "half_float.h"
+#include "rounding.h"
+
+typedef union { float f; int32_t i; uint32_t u; } fi_type;
+
+/**
+ * Convert a 4-byte float to a 2-byte half float.
+ *
+ * Not all float32 values can be represented exactly as a float16 value. We
+ * round such intermediate float32 values to the nearest float16. When the
+ * float32 lies exactly between two float16 values, we round to the one with
+ * an even mantissa.
+ *
+ * This rounding behavior has several benefits:
+ *   - It has no sign bias.
+ *
+ *   - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
+ *     GPU ISA.
+ *
+ *   - By reproducing the behavior of the GPU (at least on Intel hardware),
+ *     compile-time evaluation of constant packHalf2x16 GLSL expressions will
+ *     result in the same value as if the expression were executed on the GPU.
+ */
+uint16_t
+_mesa_float_to_half(float val)
+{
+   const fi_type fi = {val};
+   const int flt_m = fi.i & 0x7fffff;
+   const int flt_e = (fi.i >> 23) & 0xff;
+   const int flt_s = (fi.i >> 31) & 0x1;
+   int s, e, m = 0;
+   uint16_t result;
+
+   /* sign bit */
+   s = flt_s;
+
+   /* handle special cases */
+   if ((flt_e == 0) && (flt_m == 0)) {
+      /* zero */
+      /* m = 0; - already set */
+      e = 0;
+   }
+   else if ((flt_e == 0) && (flt_m != 0)) {
+      /* denorm -- denorm float maps to 0 half */
+      /* m = 0; - already set */
+      e = 0;
+   }
+   else if ((flt_e == 0xff) && (flt_m == 0)) {
+      /* infinity */
+      /* m = 0; - already set */
+      e = 31;
+   }
+   else if ((flt_e == 0xff) && (flt_m != 0)) {
+      /* NaN */
+      m = 1;
+      e = 31;
+   }
+   else {
+      /* regular number */
+      const int new_exp = flt_e - 127;
+      if (new_exp < -14) {
+         /* The float32 lies in the range (0.0, min_normal16) and is rounded
+          * to a nearby float16 value. The result will be either zero, subnormal,
+          * or normal.
+          */
+         e = 0;
+         m = _mesa_lroundevenf((1 << 24) * fabsf(fi.f));
+      }
+      else if (new_exp > 15) {
+         /* map this value to infinity */
+         /* m = 0; - already set */
+         e = 31;
+      }
+      else {
+         /* The float32 lies in the range
+          *   [min_normal16, max_normal16 + max_step16)
+          * and is rounded to a nearby float16 value. The result will be
+          * either normal or infinite.
+          */
+         e = new_exp + 15;
+         m = _mesa_lroundevenf(flt_m / (float) (1 << 13));
+      }
+   }
+
+   assert(0 <= m && m <= 1024);
+   if (m == 1024) {
+      /* The float32 was rounded upwards into the range of the next exponent,
+       * so bump the exponent. This correctly handles the case where f32
+       * should be rounded up to float16 infinity.
+       */
+      ++e;
+      m = 0;
+   }
+
+   result = (s << 15) | (e << 10) | m;
+   return result;
+}
+
+
+/**
+ * Convert a 2-byte half float to a 4-byte float.
+ * Based on code from:
+ * http://www.opengl.org/discussion_boards/ubb/Forum3/HTML/008786.html
+ */
+float
+_mesa_half_to_float(uint16_t val)
+{
+   /* XXX could also use a 64K-entry lookup table */
+   const int m = val & 0x3ff;
+   const int e = (val >> 10) & 0x1f;
+   const int s = (val >> 15) & 0x1;
+   int flt_m, flt_e, flt_s;
+   fi_type fi;
+   float result;
+
+   /* sign bit */
+   flt_s = s;
+
+   /* handle special cases */
+   if ((e == 0) && (m == 0)) {
+      /* zero */
+      flt_m = 0;
+      flt_e = 0;
+   }
+   else if ((e == 0) && (m != 0)) {
+      /* denorm -- denorm half will fit in non-denorm single */
+      const float half_denorm = 1.0f / 16384.0f; /* 2^-14 */
+      float mantissa = ((float) (m)) / 1024.0f;
+      float sign = s ? -1.0f : 1.0f;
+      return sign * mantissa * half_denorm;
+   }
+   else if ((e == 31) && (m == 0)) {
+      /* infinity */
+      flt_e = 0xff;
+      flt_m = 0;
+   }
+   else if ((e == 31) && (m != 0)) {
+      /* NaN */
+      flt_e = 0xff;
+      flt_m = 1;
+   }
+   else {
+      /* regular */
+      flt_e = e + 112;
+      flt_m = m << 13;
+   }
+
+   fi.i = (flt_s << 31) | (flt_e << 23) | flt_m;
+   result = fi.f;
+   return result;
+}
diff --git a/src/util/half_float.h b/src/util/half_float.h
new file mode 100644
index 00000000000..64f20421018
--- /dev/null
+++ b/src/util/half_float.h
@@ -0,0 +1,41 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _HALF_FLOAT_H_
+#define _HALF_FLOAT_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+uint16_t _mesa_float_to_half(float val);
+float _mesa_half_to_float(uint16_t val);
+
+#ifdef __cplusplus
+} /* extern C */
+#endif
+
+#endif /* _HALF_FLOAT_H_ */
diff --git a/src/vulkan/Makefile.am b/src/vulkan/Makefile.am
index 985864a87fe..5abbd379b54 100644
--- a/src/vulkan/Makefile.am
+++ b/src/vulkan/Makefile.am
@@ -42,6 +42,7 @@ AM_CPPFLAGS = \
 	$(DEFINES) \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src \
+	-I$(top_srcdir)/src/glsl/nir \
 	-I$(top_srcdir)/src/mapi \
 	-I$(top_srcdir)/src/mesa \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
diff --git a/src/vulkan/anv_compiler.cpp b/src/vulkan/anv_compiler.cpp
index a3b8d1cc80c..2b8e7cee9aa 100644
--- a/src/vulkan/anv_compiler.cpp
+++ b/src/vulkan/anv_compiler.cpp
@@ -36,6 +36,7 @@
 #include <brw_gs.h>
 #include <brw_cs.h>
 #include "brw_vec4_gs_visitor.h"
+#include <brw_compiler.h>
 
 #include <mesa/main/shaderobj.h>
 #include <mesa/main/fbobject.h>
@@ -307,8 +308,9 @@ really_do_vs_prog(struct brw_context *brw,
 
    /* Emit GEN4 code.
     */
-   program = brw_vs_emit(brw, mem_ctx, key, prog_data, &vp->program,
-                         prog, -1, &program_size);
+   program = brw_compile_vs(brw->intelScreen->compiler, brw, mem_ctx,
+                            key, prog_data, vs->Program->nir, NULL, false, -1,
+                            &program_size, NULL);
    if (program == NULL) {
       ralloc_free(mem_ctx);
       return false;
@@ -562,8 +564,9 @@ really_do_wm_prog(struct brw_context *brw,
     */
    prog_data->binding_table.render_target_start = 0;
 
-   program = brw_wm_fs_emit(brw, mem_ctx, key, prog_data,
-                            &fp->program, prog, -1, -1, &program_size);
+   program = brw_compile_fs(brw->intelScreen->compiler, brw, mem_ctx, key,
+                            prog_data, fp->program.Base.nir, fs->Program,
+                            -1, -1, brw->use_rep_send, &program_size, NULL);
    if (program == NULL) {
       ralloc_free(mem_ctx);
       return false;
@@ -831,7 +834,8 @@ anv_codegen_gs_prog(struct brw_context *brw,
    void *mem_ctx = ralloc_context(NULL);
    unsigned program_size;
    const unsigned *program =
-      brw_gs_emit(brw, prog, &c, mem_ctx, -1, &program_size);
+      brw_compile_gs(brw->intelScreen->compiler, brw, &c, gp->program.Base.nir,
+                     prog, mem_ctx, -1, &program_size, NULL);
    if (program == NULL) {
       ralloc_free(mem_ctx);
       return false;
@@ -867,8 +871,9 @@ brw_codegen_cs_prog(struct brw_context *brw,
    anv_nir_apply_dynamic_offsets(pipeline, cs->Program->nir, &prog_data->base);
    anv_nir_apply_pipeline_layout(cs->Program->nir, pipeline->layout);
 
-   program = brw_cs_emit(brw, mem_ctx, key, prog_data,
-                         &cp->program, prog, -1, &program_size);
+   program = brw_compile_cs(brw->intelScreen->compiler, brw, mem_ctx, key,
+                            prog_data, cs->Program->nir, -1,
+                            &program_size, NULL);
    if (program == NULL) {
       ralloc_free(mem_ctx);
       return false;
@@ -1142,10 +1147,13 @@ setup_nir_io(struct gl_shader *mesa_shader,
       prog->OutputsWritten |= BITFIELD64_BIT(var->data.location);
    }
 
+   shader->info.system_values_read = 0;
+   foreach_list_typed(nir_variable, var, node, &shader->system_values) {
+      shader->info.system_values_read |= BITFIELD64_BIT(var->data.location);
+   }
+
    shader->info.inputs_read = prog->InputsRead;
    shader->info.outputs_written = prog->OutputsWritten;
-
-   mesa_shader->num_uniform_components = shader->num_uniforms;
 }
 
 static void
@@ -1163,7 +1171,7 @@ anv_compile_shader_spirv(struct anv_compiler *compiler,
            "failed to create %s shader\n", stage_info[stage].name);
 
 #define CREATE_PROGRAM(stage) \
-   _mesa_init_##stage##_program(&brw->ctx, &ralloc(mesa_shader, struct brw_##stage##_program)->program, 0, 0)
+   &ralloc(mesa_shader, struct brw_##stage##_program)->program.Base
 
    bool is_scalar;
    struct gl_program *prog;
@@ -1187,6 +1195,7 @@ anv_compile_shader_spirv(struct anv_compiler *compiler,
    default:
       unreachable("Unsupported shader stage");
    }
+   _mesa_init_gl_program(prog, 0, 0);
    _mesa_reference_program(&brw->ctx, &mesa_shader->Program, prog);
 
    mesa_shader->Program->Parameters =
@@ -1215,11 +1224,14 @@ anv_compile_shader_spirv(struct anv_compiler *compiler,
    }
    nir_validate_shader(mesa_shader->Program->nir);
 
+   setup_nir_io(mesa_shader, mesa_shader->Program->nir);
+
    brw_process_nir(mesa_shader->Program->nir,
                    compiler->screen->devinfo,
                    NULL, mesa_shader->Stage, is_scalar);
 
-   setup_nir_io(mesa_shader, mesa_shader->Program->nir);
+   mesa_shader->num_uniform_components =
+      mesa_shader->Program->nir->num_uniforms;
 
    fail_if(mesa_shader->Program->nir == NULL,
            "failed to translate SPIR-V to NIR\n");
diff --git a/src/vulkan/anv_meta.c b/src/vulkan/anv_meta.c
index 8f6bc421194..76b8c4173e6 100644
--- a/src/vulkan/anv_meta.c
+++ b/src/vulkan/anv_meta.c
@@ -39,13 +39,11 @@ build_nir_vertex_shader(bool attr_flat)
 
    nir_builder_init_simple_shader(&b, MESA_SHADER_VERTEX);
 
-   nir_variable *pos_in = nir_variable_create(b.shader, "a_pos",
-                                              vertex_type,
-                                              nir_var_shader_in);
+   nir_variable *pos_in = nir_variable_create(b.shader, nir_var_shader_in,
+                                              vertex_type, "a_pos");
    pos_in->data.location = VERT_ATTRIB_GENERIC0;
-   nir_variable *pos_out = nir_variable_create(b.shader, "gl_Position",
-                                               vertex_type,
-                                               nir_var_shader_out);
+   nir_variable *pos_out = nir_variable_create(b.shader, nir_var_shader_out,
+                                               vertex_type, "gl_Position");
    pos_in->data.location = VARYING_SLOT_POS;
    nir_copy_var(&b, pos_out, pos_in);
 
@@ -53,11 +51,11 @@ build_nir_vertex_shader(bool attr_flat)
     * to store the color and for blit shaders it's the texture coordinate.
     */
    const struct glsl_type *attr_type = glsl_vec4_type();
-   nir_variable *attr_in = nir_variable_create(b.shader, "a_attr", attr_type,
-                                               nir_var_shader_in);
+   nir_variable *attr_in = nir_variable_create(b.shader, nir_var_shader_in,
+                                               attr_type, "a_attr");
    attr_in->data.location = VERT_ATTRIB_GENERIC1;
-   nir_variable *attr_out = nir_variable_create(b.shader, "v_attr", attr_type,
-                                                nir_var_shader_out);
+   nir_variable *attr_out = nir_variable_create(b.shader, nir_var_shader_out,
+                                                attr_type, "v_attr");
    attr_out->data.location = VARYING_SLOT_VAR0;
    attr_out->data.interpolation = attr_flat ? INTERP_QUALIFIER_FLAT :
                                               INTERP_QUALIFIER_SMOOTH;
@@ -75,14 +73,12 @@ build_nir_clear_fragment_shader(void)
 
    nir_builder_init_simple_shader(&b, MESA_SHADER_FRAGMENT);
 
-   nir_variable *color_in = nir_variable_create(b.shader, "v_attr",
-                                                color_type,
-                                                nir_var_shader_in);
+   nir_variable *color_in = nir_variable_create(b.shader, nir_var_shader_in,
+                                                color_type, "v_attr");
    color_in->data.location = VARYING_SLOT_VAR0;
    color_in->data.interpolation = INTERP_QUALIFIER_FLAT;
-   nir_variable *color_out = nir_variable_create(b.shader, "f_color",
-                                                 color_type,
-                                                 nir_var_shader_out);
+   nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out,
+                                                 color_type, "f_color");
    color_out->data.location = FRAG_RESULT_DATA0;
    nir_copy_var(&b, color_out, color_in);
 
@@ -98,15 +94,14 @@ build_nir_copy_fragment_shader(enum glsl_sampler_dim tex_dim)
 
    const struct glsl_type *color_type = glsl_vec4_type();
 
-   nir_variable *tex_pos_in = nir_variable_create(b.shader, "v_attr",
-                                                  glsl_vec4_type(),
-                                                  nir_var_shader_in);
+   nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in,
+                                                  glsl_vec4_type(), "v_attr");
    tex_pos_in->data.location = VARYING_SLOT_VAR0;
 
    const struct glsl_type *sampler_type =
       glsl_sampler_type(tex_dim, false, false, glsl_get_base_type(color_type));
-   nir_variable *sampler = nir_variable_create(b.shader, "s_tex", sampler_type,
-                                               nir_var_uniform);
+   nir_variable *sampler = nir_variable_create(b.shader, nir_var_uniform,
+                                               sampler_type, "s_tex");
    sampler->data.descriptor_set = 0;
    sampler->data.binding = 0;
 
@@ -133,9 +128,8 @@ build_nir_copy_fragment_shader(enum glsl_sampler_dim tex_dim)
    nir_ssa_dest_init(&tex->instr, &tex->dest, 4, "tex");
    nir_builder_instr_insert(&b, &tex->instr);
 
-   nir_variable *color_out = nir_variable_create(b.shader, "f_color",
-                                                 color_type,
-                                                 nir_var_shader_out);
+   nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out,
+                                                 color_type, "f_color");
    color_out->data.location = FRAG_RESULT_DATA0;
    nir_store_var(&b, color_out, &tex->dest.ssa);
 
diff --git a/src/vulkan/anv_nir_builder.h b/src/vulkan/anv_nir_builder.h
index 299c8c1aad0..f26cb046a6b 100644
--- a/src/vulkan/anv_nir_builder.h
+++ b/src/vulkan/anv_nir_builder.h
@@ -54,49 +54,3 @@ nir_copy_var(nir_builder *build, nir_variable *dest, nir_variable *src)
    copy->variables[1] = nir_deref_var_create(copy, src);
    nir_builder_instr_insert(build, &copy->instr);
 }
-
-static inline nir_variable *
-nir_variable_create(nir_shader *shader, const char *name,
-                    const struct glsl_type *type, nir_variable_mode mode)
-{
-   nir_variable *var = rzalloc(shader, nir_variable);
-   var->name = ralloc_strdup(var, name);
-   var->type = type;
-   var->data.mode = mode;
-
-   if ((mode == nir_var_shader_in && shader->stage != MESA_SHADER_VERTEX) ||
-       (mode == nir_var_shader_out && shader->stage != MESA_SHADER_FRAGMENT))
-      var->data.interpolation = INTERP_QUALIFIER_SMOOTH;
-
-   switch (var->data.mode) {
-   case nir_var_local:
-      assert(!"nir_variable_create cannot be used for local variables");
-      break;
-
-   case nir_var_global:
-      exec_list_push_tail(&shader->globals, &var->node);
-      break;
-
-   case nir_var_shader_in:
-      exec_list_push_tail(&shader->inputs, &var->node);
-      break;
-
-   case nir_var_shader_out:
-      exec_list_push_tail(&shader->outputs, &var->node);
-      break;
-
-   case nir_var_uniform:
-   case nir_var_shader_storage:
-      exec_list_push_tail(&shader->uniforms, &var->node);
-      break;
-
-   case nir_var_system_value:
-      exec_list_push_tail(&shader->system_values, &var->node);
-      break;
-
-   default:
-      unreachable("not reached");
-   }
-
-   return var;
-}