112 files changed, 4137 insertions, 4225 deletions
diff --git a/docs/GL3.txt b/docs/GL3.txt
index 257fc73225c..e2aa52cc388 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -149,7 +149,7 @@ GL 4.2, GLSL 4.20:
 
 GL 4.3, GLSL 4.30:
 
-  GL_ARB_arrays_of_arrays                              DONE (i965)
+  GL_ARB_arrays_of_arrays                              DONE (all drivers that support GLSL 1.30)
   GL_ARB_ES3_compatibility                             DONE (all drivers that support GLSL 3.30)
   GL_ARB_clear_buffer_object                           DONE (all drivers)
   GL_ARB_compute_shader                                DONE (i965)
@@ -209,7 +209,7 @@ GL 4.5, GLSL 4.50:
 
 These are the extensions cherry-picked to make GLES 3.1
 GLES3.1, GLSL ES 3.1
-  GL_ARB_arrays_of_arrays                              DONE (i965)
+  GL_ARB_arrays_of_arrays                              DONE (all drivers that support GLSL 1.30)
   GL_ARB_compute_shader                                DONE (i965)
   GL_ARB_draw_indirect                                 DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_explicit_uniform_location                     DONE (all drivers that support GLSL)
diff --git a/docs/relnotes/11.2.0.html b/docs/relnotes/11.2.0.html
index 0d92ed41ee8..069eca2b70c 100644
--- a/docs/relnotes/11.2.0.html
+++ b/docs/relnotes/11.2.0.html
@@ -44,6 +44,7 @@ Note: some of the new features are only available with certain drivers.
 </p>
 
 <ul>
+<li>GL_ARB_arrays_of_arrays on all gallium drivers that provide GLSL 1.30</li>
 <li>GL_ARB_base_instance on freedreno/a4xx</li>
 <li>GL_ARB_compute_shader on i965</li>
 <li>GL_ARB_copy_image on r600</li>
diff --git a/src/compiler/glsl/ast_function.cpp b/src/compiler/glsl/ast_function.cpp
index 0eb456a2b1f..c7fdcb24379 100644
--- a/src/compiler/glsl/ast_function.cpp
+++ b/src/compiler/glsl/ast_function.cpp
@@ -560,7 +560,8 @@ done:
 	    state->symbols->add_global_function(f);
 	    emit_function(state, f);
 	 }
-	 f->add_signature(sig->clone_prototype(f, NULL));
+	 sig = sig->clone_prototype(f, NULL);
+	 f->add_signature(sig);
       }
    }
    return sig;
diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 7213ad8ebec..a4842400288 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -4211,33 +4211,46 @@ ast_declarator_list::hir(exec_list *instructions,
          _mesa_glsl_error(&loc, state,
                           "invalid type `%s' in empty declaration",
                           type_name);
-      } else if (decl_type->base_type == GLSL_TYPE_ATOMIC_UINT) {
-         /* Empty atomic counter declarations are allowed and useful
-          * to set the default offset qualifier.
-          */
-         return NULL;
-      } else if (this->type->qualifier.precision != ast_precision_none) {
-         if (this->type->specifier->structure != NULL) {
-            _mesa_glsl_error(&loc, state,
-                             "precision qualifiers can't be applied "
-                             "to structures");
-         } else {
-            static const char *const precision_names[] = {
-               "highp",
-               "highp",
-               "mediump",
-               "lowp"
-            };
+      } else {
+         if (decl_type->base_type == GLSL_TYPE_ARRAY) {
+            /* From Section 4.12 (Empty Declarations) of the GLSL 4.5 spec:
+             *
+             *    "The combinations of types and qualifiers that cause
+             *    compile-time or link-time errors are the same whether or not
+             *    the declaration is empty."
+             */
+            validate_array_dimensions(decl_type, state, &loc);
+         }
 
-            _mesa_glsl_warning(&loc, state,
-                               "empty declaration with precision qualifier, "
-                               "to set the default precision, use "
-                               "`precision %s %s;'",
-                               precision_names[this->type->qualifier.precision],
-                               type_name);
+         if (decl_type->base_type == GLSL_TYPE_ATOMIC_UINT) {
+            /* Empty atomic counter declarations are allowed and useful
+             * to set the default offset qualifier.
+             */
+            return NULL;
+         } else if (this->type->qualifier.precision != ast_precision_none) {
+            if (this->type->specifier->structure != NULL) {
+               _mesa_glsl_error(&loc, state,
+                                "precision qualifiers can't be applied "
+                                "to structures");
+            } else {
+               static const char *const precision_names[] = {
+                  "highp",
+                  "highp",
+                  "mediump",
+                  "lowp"
+               };
+
+               _mesa_glsl_warning(&loc, state,
+                                  "empty declaration with precision "
+                                  "qualifier, to set the default precision, "
+                                  "use `precision %s %s;'",
+                                  precision_names[this->type->
+                                     qualifier.precision],
+                                  type_name);
+            }
+         } else if (this->type->specifier->structure == NULL) {
+            _mesa_glsl_warning(&loc, state, "empty declaration");
          }
-      } else if (this->type->specifier->structure == NULL) {
-         _mesa_glsl_warning(&loc, state, "empty declaration");
       }
    }
 
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp
index d7a4b254aa2..73d378c4bc9 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -27,6 +27,7 @@
 
 #include "main/core.h" /* for struct gl_context */
 #include "main/context.h"
+#include "main/debug_output.h"
 #include "main/shaderobj.h"
 #include "util/u_atomic.h" /* for p_atomic_cmpxchg */
 #include "util/ralloc.h"
diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h
index 09e21b22188..bf9b7caffae 100644
--- a/src/compiler/glsl/ir.h
+++ b/src/compiler/glsl/ir.h
@@ -864,6 +864,14 @@ public:
       int location;
 
       /**
+       * For glsl->tgsi/mesa IR we need to store the index into the
+       * parameters for uniforms.  Initially the code overloaded location,
+       * but this causes problems with indirect samplers and arrays of arrays.
+       * This is assigned in _mesa_generate_parameters_list_for_uniforms.
+       */
+      int param_index;
+
+      /**
        * Vertex stream output identifier.
        */
       unsigned stream;
diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index a4c730ffdcf..590de174507 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -1352,7 +1352,7 @@ private:
 
 namespace linker {
 
-bool
+void
 populate_consumer_input_sets(void *mem_ctx, exec_list *ir,
                              hash_table *consumer_inputs,
                              hash_table *consumer_interface_inputs,
@@ -1366,8 +1366,8 @@ populate_consumer_input_sets(void *mem_ctx, exec_list *ir,
       ir_variable *const input_var = node->as_variable();
 
       if ((input_var != NULL) && (input_var->data.mode == ir_var_shader_in)) {
-         if (input_var->type->is_interface())
-            return false;
+         /* All interface blocks should have been lowered by this point */
+         assert(!input_var->type->is_interface());
 
          if (input_var->data.explicit_location) {
             /* assign_varying_locations only cares about finding the
@@ -1401,8 +1401,6 @@ populate_consumer_input_sets(void *mem_ctx, exec_list *ir,
          }
       }
    }
-
-   return true;
 }
 
 /**
@@ -1626,18 +1624,11 @@ assign_varying_locations(struct gl_context *ctx,
    if (producer)
       canonicalize_shader_io(producer->ir, ir_var_shader_out);
 
-   if (consumer
-       && !linker::populate_consumer_input_sets(mem_ctx,
-                                                consumer->ir,
-                                                consumer_inputs,
-                                                consumer_interface_inputs,
-                                                consumer_inputs_with_locations)) {
-      assert(!"populate_consumer_input_sets failed");
-      hash_table_dtor(tfeedback_candidates);
-      hash_table_dtor(consumer_inputs);
-      hash_table_dtor(consumer_interface_inputs);
-      return false;
-   }
+   if (consumer)
+      linker::populate_consumer_input_sets(mem_ctx, consumer->ir,
+                                           consumer_inputs,
+                                           consumer_interface_inputs,
+                                           consumer_inputs_with_locations);
 
    if (producer) {
       foreach_in_list(ir_instruction, node, producer->ir) {
@@ -1652,8 +1643,10 @@ assign_varying_locations(struct gl_context *ctx,
                 (output_var->data.stream < MAX_VERTEX_STREAMS &&
                  producer->Stage == MESA_SHADER_GEOMETRY));
 
-         tfeedback_candidate_generator g(mem_ctx, tfeedback_candidates);
-         g.process(output_var);
+         if (num_tfeedback_decls > 0) {
+            tfeedback_candidate_generator g(mem_ctx, tfeedback_candidates);
+            g.process(output_var);
+         }
 
          ir_variable *const input_var =
             linker::get_matching_input(mem_ctx, output_var, consumer_inputs,
diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp
index 4776ffa6acd..bad1c1742b7 100644
--- a/src/compiler/glsl/linker.cpp
+++ b/src/compiler/glsl/linker.cpp
@@ -82,8 +82,6 @@
 #include "main/enums.h"
 
 
-void linker_error(gl_shader_program *, const char *, ...);
-
 namespace {
 
 /**
@@ -2125,6 +2123,7 @@ link_intrastage_shaders(void *mem_ctx,
 
       if (ok) {
          memcpy(linking_shaders, shader_list, num_shaders * sizeof(gl_shader *));
+         _mesa_glsl_initialize_builtin_functions();
          linking_shaders[num_shaders] = _mesa_glsl_get_builtin_function_shader();
 
          ok = link_function_calls(prog, linked, linking_shaders, num_shaders + 1);
@@ -4105,15 +4104,34 @@ disable_varying_optimizations_for_sso(struct gl_shader_program *prog)
 void
 link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
 {
+   prog->LinkStatus = true; /* All error paths will set this to false */
+   prog->Validated = false;
+   prog->_Used = false;
+
+   /* Section 7.3 (Program Objects) of the OpenGL 4.5 Core Profile spec says:
+    *
+    *     "Linking can fail for a variety of reasons as specified in the
+    *     OpenGL Shading Language Specification, as well as any of the
+    *     following reasons:
+    *
+    *     - No shader objects are attached to program."
+    *
+    * The Compatibility Profile specification does not list the error.  In
+    * Compatibility Profile missing shader stages are replaced by
+    * fixed-function.  This applies to the case where all stages are
+    * missing.
+    */
+   if (prog->NumShaders == 0) {
+      if (ctx->API != API_OPENGL_COMPAT)
+         linker_error(prog, "no shaders attached to the program\n");
+      return;
+   }
+
    tfeedback_decl *tfeedback_decls = NULL;
    unsigned num_tfeedback_decls = prog->TransformFeedback.NumVarying;
 
    void *mem_ctx = ralloc_context(NULL); // temporary linker context
 
-   prog->LinkStatus = true; /* All error paths will set this to false */
-   prog->Validated = false;
-   prog->_Used = false;
-
    prog->ARB_fragment_coord_conventions_enable = false;
 
    /* Separate the shaders into groups based on their type.
@@ -4129,13 +4147,11 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
 
    unsigned min_version = UINT_MAX;
    unsigned max_version = 0;
-   const bool is_es_prog =
-      (prog->NumShaders > 0 && prog->Shaders[0]->IsES) ? true : false;
    for (unsigned i = 0; i < prog->NumShaders; i++) {
       min_version = MIN2(min_version, prog->Shaders[i]->Version);
       max_version = MAX2(max_version, prog->Shaders[i]->Version);
 
-      if (prog->Shaders[i]->IsES != is_es_prog) {
+      if (prog->Shaders[i]->IsES != prog->Shaders[0]->IsES) {
 	 linker_error(prog, "all shaders must use same shading "
 		      "language version\n");
 	 goto done;
@@ -4153,80 +4169,59 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
    /* In desktop GLSL, different shader versions may be linked together.  In
     * GLSL ES, all shader versions must be the same.
     */
-   if (is_es_prog && min_version != max_version) {
+   if (prog->Shaders[0]->IsES && min_version != max_version) {
       linker_error(prog, "all shaders must use same shading "
 		   "language version\n");
       goto done;
    }
 
    prog->Version = max_version;
-   prog->IsES = is_es_prog;
-
-   /* From OpenGL 4.5 Core specification (7.3 Program Objects):
-    *     "Linking can fail for a variety of reasons as specified in the OpenGL
-    *     Shading Language Specification, as well as any of the following
-    *     reasons:
-    *
-    *     * No shader objects are attached to program.
-    *
-    *     ..."
-    *
-    *     Same rule applies for OpenGL ES >= 3.1.
-    */
-
-   if (prog->NumShaders == 0 &&
-       ((ctx->API == API_OPENGL_CORE && ctx->Version >= 45) ||
-        (ctx->API == API_OPENGLES2 && ctx->Version >= 31))) {
-      linker_error(prog, "No shader objects are attached to program.\n");
-      goto done;
-   }
+   prog->IsES = prog->Shaders[0]->IsES;
 
    /* Some shaders have to be linked with some other shaders present.
     */
-   if (num_shaders[MESA_SHADER_GEOMETRY] > 0 &&
-       num_shaders[MESA_SHADER_VERTEX] == 0 &&
-       !prog->SeparateShader) {
-      linker_error(prog, "Geometry shader must be linked with "
-		   "vertex shader\n");
-      goto done;
-   }
-   if (num_shaders[MESA_SHADER_TESS_EVAL] > 0 &&
-       num_shaders[MESA_SHADER_VERTEX] == 0 &&
-       !prog->SeparateShader) {
-      linker_error(prog, "Tessellation evaluation shader must be linked with "
-		   "vertex shader\n");
-      goto done;
-   }
-   if (num_shaders[MESA_SHADER_TESS_CTRL] > 0 &&
-       num_shaders[MESA_SHADER_VERTEX] == 0 &&
-       !prog->SeparateShader) {
-      linker_error(prog, "Tessellation control shader must be linked with "
-		   "vertex shader\n");
-      goto done;
-   }
+   if (!prog->SeparateShader) {
+      if (num_shaders[MESA_SHADER_GEOMETRY] > 0 &&
+          num_shaders[MESA_SHADER_VERTEX] == 0) {
+         linker_error(prog, "Geometry shader must be linked with "
+		      "vertex shader\n");
+         goto done;
+      }
+      if (num_shaders[MESA_SHADER_TESS_EVAL] > 0 &&
+          num_shaders[MESA_SHADER_VERTEX] == 0) {
+         linker_error(prog, "Tessellation evaluation shader must be linked "
+		      "with vertex shader\n");
+         goto done;
+      }
+      if (num_shaders[MESA_SHADER_TESS_CTRL] > 0 &&
+          num_shaders[MESA_SHADER_VERTEX] == 0) {
+         linker_error(prog, "Tessellation control shader must be linked with "
+		      "vertex shader\n");
+         goto done;
+      }
 
-   /* The spec is self-contradictory here. It allows linking without a tess
-    * eval shader, but that can only be used with transform feedback and
-    * rasterization disabled. However, transform feedback isn't allowed
-    * with GL_PATCHES, so it can't be used.
-    *
-    * More investigation showed that the idea of transform feedback after
-    * a tess control shader was dropped, because some hw vendors couldn't
-    * support tessellation without a tess eval shader, but the linker section
-    * wasn't updated to reflect that.
-    *
-    * All specifications (ARB_tessellation_shader, GL 4.0-4.5) have this
-    * spec bug.
-    *
-    * Do what's reasonable and always require a tess eval shader if a tess
-    * control shader is present.
-    */
-   if (num_shaders[MESA_SHADER_TESS_CTRL] > 0 &&
-       num_shaders[MESA_SHADER_TESS_EVAL] == 0 &&
-       !prog->SeparateShader) {
-      linker_error(prog, "Tessellation control shader must be linked with "
-		   "tessellation evaluation shader\n");
-      goto done;
+      /* The spec is self-contradictory here. It allows linking without a tess
+       * eval shader, but that can only be used with transform feedback and
+       * rasterization disabled. However, transform feedback isn't allowed
+       * with GL_PATCHES, so it can't be used.
+       *
+       * More investigation showed that the idea of transform feedback after
+       * a tess control shader was dropped, because some hw vendors couldn't
+       * support tessellation without a tess eval shader, but the linker
+       * section wasn't updated to reflect that.
+       *
+       * All specifications (ARB_tessellation_shader, GL 4.0-4.5) have this
+       * spec bug.
+       *
+       * Do what's reasonable and always require a tess eval shader if a tess
+       * control shader is present.
+       */
+      if (num_shaders[MESA_SHADER_TESS_CTRL] > 0 &&
+          num_shaders[MESA_SHADER_TESS_EVAL] == 0) {
+         linker_error(prog, "Tessellation control shader must be linked with "
+		      "tessellation evaluation shader\n");
+         goto done;
+      }
    }
 
    /* Compute shaders have additional restrictions. */
@@ -4362,7 +4357,7 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
     *
     * This rule also applies to GLSL ES 3.00.
     */
-   if (max_version >= (is_es_prog ? 300 : 130)) {
+   if (max_version >= (prog->IsES ? 300 : 130)) {
       struct gl_shader *sh = prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
       if (sh) {
 	 lower_discard_flow(sh->ir);
@@ -4451,9 +4446,10 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
        *     non-zero, but the program object has no vertex or geometry
        *     shader;
        */
-      if (first == MESA_SHADER_FRAGMENT) {
+      if (first >= MESA_SHADER_FRAGMENT) {
          linker_error(prog, "Transform feedback varyings specified, but "
-                      "no vertex or geometry shader is present.\n");
+                      "no vertex, tessellation, or geometry shader is "
+                      "present.\n");
          goto done;
       }
 
@@ -4465,91 +4461,80 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
          goto done;
    }
 
-   /* Linking the stages in the opposite order (from fragment to vertex)
-    * ensures that inter-shader outputs written to in an earlier stage are
-    * eliminated if they are (transitively) not used in a later stage.
+   /* If there is no fragment shader we need to set transform feedback.
+    *
+    * For SSO we also need to assign output locations.  We assign them here
+    * because it must be done for both single stage programs and multi
+    * stage programs.
     */
-   int next;
-
-   if (first < MESA_SHADER_FRAGMENT) {
-      gl_shader *const sh = prog->_LinkedShaders[last];
-
-      if (first != MESA_SHADER_VERTEX) {
-         /* There was no vertex shader, but we still have to assign varying
-          * locations for use by tessellation/geometry shader inputs in SSO.
-          *
-          * If the shader is not separable (i.e., prog->SeparateShader is
-          * false), linking will have already failed when first is not
-          * MESA_SHADER_VERTEX.
-          */
-         if (!assign_varying_locations(ctx, mem_ctx, prog,
-                                       NULL, prog->_LinkedShaders[first],
-                                       num_tfeedback_decls, tfeedback_decls))
-            goto done;
-      }
-
-      if (last != MESA_SHADER_FRAGMENT &&
-         (num_tfeedback_decls != 0 || prog->SeparateShader)) {
-         /* There was no fragment shader, but we still have to assign varying
-          * locations for use by transform feedback.
-          */
-         if (!assign_varying_locations(ctx, mem_ctx, prog,
-                                       sh, NULL,
-                                       num_tfeedback_decls, tfeedback_decls))
-            goto done;
-      }
-
-      do_dead_builtin_varyings(ctx, sh, NULL,
-                               num_tfeedback_decls, tfeedback_decls);
+   if (last < MESA_SHADER_FRAGMENT &&
+       (num_tfeedback_decls != 0 || prog->SeparateShader)) {
+      if (!assign_varying_locations(ctx, mem_ctx, prog,
+                                    prog->_LinkedShaders[last], NULL,
+                                    num_tfeedback_decls, tfeedback_decls))
+         goto done;
+   }
 
-      remove_unused_shader_inputs_and_outputs(prog->SeparateShader, sh,
+   if (last <= MESA_SHADER_FRAGMENT) {
+      /* Remove unused varyings from the first/last stage unless SSO */
+      remove_unused_shader_inputs_and_outputs(prog->SeparateShader,
+                                              prog->_LinkedShaders[first],
+                                              ir_var_shader_in);
+      remove_unused_shader_inputs_and_outputs(prog->SeparateShader,
+                                              prog->_LinkedShaders[last],
                                               ir_var_shader_out);
-   }
-   else if (first == MESA_SHADER_FRAGMENT) {
-      /* If the program only contains a fragment shader...
-       */
-      gl_shader *const sh = prog->_LinkedShaders[first];
 
-      do_dead_builtin_varyings(ctx, NULL, sh,
-                               num_tfeedback_decls, tfeedback_decls);
+      /* If the program is made up of only a single stage */
+      if (first == last) {
 
-      if (prog->SeparateShader) {
-         if (!assign_varying_locations(ctx, mem_ctx, prog,
-                                       NULL /* producer */,
-                                       sh /* consumer */,
-                                       0 /* num_tfeedback_decls */,
-                                       NULL /* tfeedback_decls */))
-            goto done;
-      } else {
-         remove_unused_shader_inputs_and_outputs(false, sh,
-                                                 ir_var_shader_in);
-      }
-   }
+         gl_shader *const sh = prog->_LinkedShaders[last];
+         if (prog->SeparateShader) {
+            /* Assign input locations for SSO, output locations are already
+             * assigned.
+             */
+            if (!assign_varying_locations(ctx, mem_ctx, prog,
+                                          NULL /* producer */,
+                                          sh /* consumer */,
+                                          0 /* num_tfeedback_decls */,
+                                          NULL /* tfeedback_decls */))
+               goto done;
+         }
 
-   next = last;
-   for (int i = next - 1; i >= 0; i--) {
-      if (prog->_LinkedShaders[i] == NULL)
-         continue;
+         do_dead_builtin_varyings(ctx, NULL, sh, 0, NULL);
+         do_dead_builtin_varyings(ctx, sh, NULL, num_tfeedback_decls,
+                                  tfeedback_decls);
+      } else {
+         /* Linking the stages in the opposite order (from fragment to vertex)
+          * ensures that inter-shader outputs written to in an earlier stage
+          * are eliminated if they are (transitively) not used in a later
+          * stage.
+          */
+         int next = last;
+         for (int i = next - 1; i >= 0; i--) {
+            if (prog->_LinkedShaders[i] == NULL)
+               continue;
 
-      gl_shader *const sh_i = prog->_LinkedShaders[i];
-      gl_shader *const sh_next = prog->_LinkedShaders[next];
+            gl_shader *const sh_i = prog->_LinkedShaders[i];
+            gl_shader *const sh_next = prog->_LinkedShaders[next];
 
-      if (!assign_varying_locations(ctx, mem_ctx, prog, sh_i, sh_next,
-                next == MESA_SHADER_FRAGMENT ? num_tfeedback_decls : 0,
-                tfeedback_decls))
-         goto done;
+            if (!assign_varying_locations(ctx, mem_ctx, prog, sh_i, sh_next,
+                      next == MESA_SHADER_FRAGMENT ? num_tfeedback_decls : 0,
+                      tfeedback_decls))
+               goto done;
 
-      do_dead_builtin_varyings(ctx, sh_i, sh_next,
-                next == MESA_SHADER_FRAGMENT ? num_tfeedback_decls : 0,
-                tfeedback_decls);
+            do_dead_builtin_varyings(ctx, sh_i, sh_next,
+                      next == MESA_SHADER_FRAGMENT ? num_tfeedback_decls : 0,
+                      tfeedback_decls);
 
-      /* This must be done after all dead varyings are eliminated. */
-      if (!check_against_output_limit(ctx, prog, sh_i))
-         goto done;
-      if (!check_against_input_limit(ctx, prog, sh_next))
-         goto done;
+            /* This must be done after all dead varyings are eliminated. */
+            if (!check_against_output_limit(ctx, prog, sh_i))
+               goto done;
+            if (!check_against_input_limit(ctx, prog, sh_next))
+               goto done;
 
-      next = i;
+            next = i;
+         }
+      }
    }
 
    if (!store_tfeedback_info(ctx, prog, num_tfeedback_decls, tfeedback_decls))
@@ -4569,38 +4554,38 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
    if (!prog->LinkStatus)
       goto done;
 
-   /* OpenGL ES requires that a vertex shader and a fragment shader both be
-    * present in a linked program. GL_ARB_ES2_compatibility doesn't say
+   /* OpenGL ES < 3.1 requires that a vertex shader and a fragment shader both
+    * be present in a linked program. GL_ARB_ES2_compatibility doesn't say
     * anything about shader linking when one of the shaders (vertex or
     * fragment shader) is absent. So, the extension shouldn't change the
     * behavior specified in GLSL specification.
+    *
+    * From OpenGL ES 3.1 specification (7.3 Program Objects):
+    *     "Linking can fail for a variety of reasons as specified in the
+    *     OpenGL ES Shading Language Specification, as well as any of the
+    *     following reasons:
+    *
+    *     ...
+    *
+    *     * program contains objects to form either a vertex shader or
+    *       fragment shader, and program is not separable, and does not
+    *       contain objects to form both a vertex shader and fragment
+    *       shader."
+    *
+    * However, the only scenario in 3.1+ where we don't require them both is
+    * when we have a compute shader, because:
+    *
+    * - Having no shaders is a link error.
+    * - Geom or Tess without a Vertex shader is a link error, which means we
+    *   always require a Vertex shader and hence a Fragment shader.
+    * - Finally, a Compute shader linked with any other stage is a link error.
     */
-   if (!prog->SeparateShader && ctx->API == API_OPENGLES2) {
-      /* With ES < 3.1 one needs to have always vertex + fragment shader. */
-      if (ctx->Version < 31) {
-         if (prog->_LinkedShaders[MESA_SHADER_VERTEX] == NULL) {
-	    linker_error(prog, "program lacks a vertex shader\n");
-         } else if (prog->_LinkedShaders[MESA_SHADER_FRAGMENT] == NULL) {
-	    linker_error(prog, "program lacks a fragment shader\n");
-         }
-      } else {
-         /* From OpenGL ES 3.1 specification (7.3 Program Objects):
-          *     "Linking can fail for a variety of reasons as specified in the
-          *     OpenGL ES Shading Language Specification, as well as any of the
-          *     following reasons:
-          *
-          *     ...
-          *
-          *     * program contains objects to form either a vertex shader or
-          *       fragment shader, and program is not separable, and does not
-          *       contain objects to form both a vertex shader and fragment
-          *       shader."
-          */
-         if (!!prog->_LinkedShaders[MESA_SHADER_VERTEX] ^
-             !!prog->_LinkedShaders[MESA_SHADER_FRAGMENT]) {
-            linker_error(prog, "Program needs to contain both vertex and "
-                         "fragment shaders.\n");
-         }
+   if (!prog->SeparateShader && ctx->API == API_OPENGLES2 &&
+       num_shaders[MESA_SHADER_COMPUTE] == 0) {
+      if (prog->_LinkedShaders[MESA_SHADER_VERTEX] == NULL) {
+	 linker_error(prog, "program lacks a vertex shader\n");
+      } else if (prog->_LinkedShaders[MESA_SHADER_FRAGMENT] == NULL) {
+	 linker_error(prog, "program lacks a fragment shader\n");
       }
    }
 
diff --git a/src/compiler/glsl/tests/varyings_test.cpp b/src/compiler/glsl/tests/varyings_test.cpp
index 0c4e0a471b8..9be5e8344b4 100644
--- a/src/compiler/glsl/tests/varyings_test.cpp
+++ b/src/compiler/glsl/tests/varyings_test.cpp
@@ -156,11 +156,11 @@ TEST_F(link_varyings, single_simple_input)
 
    ir.push_tail(v);
 
-   ASSERT_TRUE(linker::populate_consumer_input_sets(mem_ctx,
-                                                    &ir,
-                                                    consumer_inputs,
-                                                    consumer_interface_inputs,
-                                                    junk));
+   linker::populate_consumer_input_sets(mem_ctx,
+                                        &ir,
+                                        consumer_inputs,
+                                        consumer_interface_inputs,
+                                        junk);
 
    EXPECT_EQ((void *) v, hash_table_find(consumer_inputs, "a"));
    EXPECT_EQ(1u, num_elements(consumer_inputs));
@@ -183,11 +183,11 @@ TEST_F(link_varyings, gl_ClipDistance)
 
    ir.push_tail(clipdistance);
 
-   ASSERT_TRUE(linker::populate_consumer_input_sets(mem_ctx,
-                                                    &ir,
-                                                    consumer_inputs,
-                                                    consumer_interface_inputs,
-                                                    junk));
+   linker::populate_consumer_input_sets(mem_ctx,
+                                        &ir,
+                                        consumer_inputs,
+                                        consumer_interface_inputs,
+                                        junk);
 
    EXPECT_EQ(clipdistance, junk[VARYING_SLOT_CLIP_DIST0]);
    EXPECT_TRUE(is_empty(consumer_inputs));
@@ -205,11 +205,11 @@ TEST_F(link_varyings, single_interface_input)
 
    ir.push_tail(v);
 
-   ASSERT_TRUE(linker::populate_consumer_input_sets(mem_ctx,
-                                                    &ir,
-                                                    consumer_inputs,
-                                                    consumer_interface_inputs,
-                                                    junk));
+   linker::populate_consumer_input_sets(mem_ctx,
+                                        &ir,
+                                        consumer_inputs,
+                                        consumer_interface_inputs,
+                                        junk);
    char *const full_name = interface_field_name(simple_interface);
 
    EXPECT_EQ((void *) v, hash_table_find(consumer_interface_inputs, full_name));
@@ -236,11 +236,11 @@ TEST_F(link_varyings, one_interface_and_one_simple_input)
 
    ir.push_tail(iface);
 
-   ASSERT_TRUE(linker::populate_consumer_input_sets(mem_ctx,
-                                                    &ir,
-                                                    consumer_inputs,
-                                                    consumer_interface_inputs,
-                                                    junk));
+   linker::populate_consumer_input_sets(mem_ctx,
+                                        &ir,
+                                        consumer_inputs,
+                                        consumer_interface_inputs,
+                                        junk);
 
    char *const iface_field_name = interface_field_name(simple_interface);
 
@@ -252,24 +252,6 @@ TEST_F(link_varyings, one_interface_and_one_simple_input)
    EXPECT_EQ(1u, num_elements(consumer_inputs));
 }
 
-TEST_F(link_varyings, invalid_interface_input)
-{
-   ir_variable *const v =
-      new(mem_ctx) ir_variable(simple_interface,
-                               "named_interface",
-                               ir_var_shader_in);
-
-   ASSERT_EQ(simple_interface, v->get_interface_type());
-
-   ir.push_tail(v);
-
-   EXPECT_FALSE(linker::populate_consumer_input_sets(mem_ctx,
-                                                    &ir,
-                                                    consumer_inputs,
-                                                     consumer_interface_inputs,
-                                                     junk));
-}
-
 TEST_F(link_varyings, interface_field_doesnt_match_noninterface)
 {
    char *const iface_field_name = interface_field_name(simple_interface);
@@ -283,11 +265,11 @@ TEST_F(link_varyings, interface_field_doesnt_match_noninterface)
 
    ir.push_tail(in_v);
 
-   ASSERT_TRUE(linker::populate_consumer_input_sets(mem_ctx,
-                                                    &ir,
-                                                    consumer_inputs,
-                                                    consumer_interface_inputs,
-                                                    junk));
+   linker::populate_consumer_input_sets(mem_ctx,
+                                        &ir,
+                                        consumer_inputs,
+                                        consumer_interface_inputs,
+                                        junk);
 
    /* Create an output variable, "v", that is part of an interface block named
     * "a".  They should not match.
@@ -325,11 +307,11 @@ TEST_F(link_varyings, interface_field_doesnt_match_noninterface_vice_versa)
 
    ir.push_tail(in_v);
 
-   ASSERT_TRUE(linker::populate_consumer_input_sets(mem_ctx,
-                                                    &ir,
-                                                    consumer_inputs,
-                                                    consumer_interface_inputs,
-                                                    junk));
+   linker::populate_consumer_input_sets(mem_ctx,
+                                        &ir,
+                                        consumer_inputs,
+                                        consumer_interface_inputs,
+                                        junk);
 
    /* Create an output variable "a.v".  They should not match.
     */
diff --git a/src/compiler/nir/glsl_to_nir.cpp b/src/compiler/nir/glsl_to_nir.cpp
index 2a3047dd33c..6a30023bc53 100644
--- a/src/compiler/nir/glsl_to_nir.cpp
+++ b/src/compiler/nir/glsl_to_nir.cpp
@@ -376,8 +376,6 @@ nir_visitor::visit(ir_variable *ir)
    var->data.explicit_binding = ir->data.explicit_binding;
    var->data.has_initializer = ir->data.has_initializer;
    var->data.location_frac = ir->data.location_frac;
-   var->data.from_named_ifc_block_array = ir->data.from_named_ifc_block_array;
-   var->data.from_named_ifc_block_nonarray = ir->data.from_named_ifc_block_nonarray;
 
    switch (ir->data.depth_layout) {
    case ir_depth_layout_none:
@@ -600,7 +598,7 @@ nir_visitor::visit(ir_emit_vertex *ir)
 {
    nir_intrinsic_instr *instr =
       nir_intrinsic_instr_create(this->shader, nir_intrinsic_emit_vertex);
-   instr->const_index[0] = ir->stream_id();
+   nir_intrinsic_set_stream_id(instr, ir->stream_id());
    nir_builder_instr_insert(&b, &instr->instr);
 }
 
@@ -609,7 +607,7 @@ nir_visitor::visit(ir_end_primitive *ir)
 {
    nir_intrinsic_instr *instr =
       nir_intrinsic_instr_create(this->shader, nir_intrinsic_end_primitive);
-   instr->const_index[0] = ir->stream_id();
+   nir_intrinsic_set_stream_id(instr, ir->stream_id());
    nir_builder_instr_insert(&b, &instr->instr);
 }
 
@@ -889,7 +887,7 @@ nir_visitor::visit(ir_call *ir)
          instr->src[0] = nir_src_for_ssa(evaluate_rvalue(val));
          instr->src[1] = nir_src_for_ssa(evaluate_rvalue(block));
          instr->src[2] = nir_src_for_ssa(evaluate_rvalue(offset));
-         instr->const_index[0] = write_mask->value.u[0];
+         nir_intrinsic_set_write_mask(instr, write_mask->value.u[0]);
          instr->num_components = val->type->vector_elements;
 
          nir_builder_instr_insert(&b, &instr->instr);
@@ -987,7 +985,7 @@ nir_visitor::visit(ir_call *ir)
          exec_node *param = ir->actual_parameters.get_head();
          ir_rvalue *offset = ((ir_instruction *)param)->as_rvalue();
 
-         instr->const_index[0] = 0;
+         nir_intrinsic_set_base(instr, 0);
          instr->src[0] = nir_src_for_ssa(evaluate_rvalue(offset));
 
          const glsl_type *type = ir->return_deref->var->type;
@@ -1011,10 +1009,10 @@ nir_visitor::visit(ir_call *ir)
          ir_constant *write_mask = ((ir_instruction *)param)->as_constant();
          assert(write_mask);
 
-         instr->const_index[0] = 0;
+         nir_intrinsic_set_base(instr, 0);
          instr->src[1] = nir_src_for_ssa(evaluate_rvalue(offset));
 
-         instr->const_index[1] = write_mask->value.u[0];
+         nir_intrinsic_set_write_mask(instr, write_mask->value.u[0]);
 
          instr->src[0] = nir_src_for_ssa(evaluate_rvalue(val));
          instr->num_components = val->type->vector_elements;
@@ -1069,7 +1067,8 @@ nir_visitor::visit(ir_call *ir)
          nir_intrinsic_instr *store_instr =
             nir_intrinsic_instr_create(shader, nir_intrinsic_store_var);
          store_instr->num_components = ir->return_deref->type->vector_elements;
-         store_instr->const_index[0] = (1 << store_instr->num_components) - 1;
+         nir_intrinsic_set_write_mask(store_instr,
+                                      (1 << store_instr->num_components) - 1);
 
          store_instr->variables[0] =
             evaluate_deref(&store_instr->instr, ir->return_deref);
@@ -1147,7 +1146,7 @@ nir_visitor::visit(ir_assignment *ir)
    nir_intrinsic_instr *store =
       nir_intrinsic_instr_create(this->shader, nir_intrinsic_store_var);
    store->num_components = ir->lhs->type->vector_elements;
-   store->const_index[0] = ir->write_mask;
+   nir_intrinsic_set_write_mask(store, ir->write_mask);
    nir_deref *store_deref = nir_copy_deref(store, &lhs_deref->deref);
    store->variables[0] = nir_deref_as_var(store_deref);
    store->src[0] = nir_src_for_ssa(src);
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 7aba195fa69..ca5e2f2b779 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -226,24 +226,6 @@ typedef struct nir_variable {
       unsigned location_frac:2;
 
       /**
-       * Non-zero if this variable was created by lowering a named interface
-       * block which was not an array.
-       *
-       * Note that this variable and \c from_named_ifc_block_array will never
-       * both be non-zero.
-       */
-      unsigned from_named_ifc_block_nonarray:1;
-
-      /**
-       * Non-zero if this variable was created by lowering a named interface
-       * block which was an array.
-       *
-       * Note that this variable and \c from_named_ifc_block_nonarray will never
-       * both be non-zero.
-       */
-      unsigned from_named_ifc_block_array:1;
-
-      /**
        * \brief Layout qualifier for gl_FragDepth.
        *
        * This is not equal to \c ir_depth_layout_none if and only if this
@@ -835,7 +817,7 @@ typedef struct {
 } nir_call_instr;
 
 #define INTRINSIC(name, num_srcs, src_components, has_dest, dest_components, \
-                  num_variables, num_indices, flags) \
+                  num_variables, num_indices, idx0, idx1, idx2, flags) \
    nir_intrinsic_##name,
 
 #define LAST_INTRINSIC(name) nir_last_intrinsic = nir_intrinsic_##name,
@@ -848,6 +830,8 @@ typedef enum {
 #undef INTRINSIC
 #undef LAST_INTRINSIC
 
+#define NIR_INTRINSIC_MAX_CONST_INDEX 3
+
 /** Represents an intrinsic
  *
  * An intrinsic is an instruction type for handling things that are
@@ -891,7 +875,7 @@ typedef struct {
     */
    uint8_t num_components;
 
-   int const_index[3];
+   int const_index[NIR_INTRINSIC_MAX_CONST_INDEX];
 
    nir_deref_var *variables[2];
 
@@ -920,6 +904,55 @@ typedef enum {
    NIR_INTRINSIC_CAN_REORDER = (1 << 1),
 } nir_intrinsic_semantic_flag;
 
+/**
+ * \name NIR intrinsics const-index flag
+ *
+ * Indicates the usage of a const_index slot.  Flags start at 1; 0 means unused.
+ *
+ * \sa nir_intrinsic_info::index_map
+ */
+typedef enum {
+   /**
+    * Generally, instructions that take an offset src argument can encode
+    * a constant 'base' value which is added to the offset.
+    */
+   NIR_INTRINSIC_BASE = 1,
+
+   /**
+    * For store instructions, a writemask for the store.
+    */
+   NIR_INTRINSIC_WRMASK = 2,
+
+   /**
+    * The stream-id for GS emit_vertex/end_primitive intrinsics.
+    */
+   NIR_INTRINSIC_STREAM_ID = 3,
+
+   /**
+    * The clip-plane id for load_user_clip_plane intrinsic.
+    */
+   NIR_INTRINSIC_UCP_ID = 4,
+
+   /**
+    * The range of a load operation.  This specifies the maximum amount of
+    * data starting at the base offset (if any) that can be accessed.
+    */
+   NIR_INTRINSIC_RANGE = 5,
+
+   /**
+    * The Vulkan descriptor set for vulkan_resource_index intrinsic.
+    */
+   NIR_INTRINSIC_DESC_SET = 6,
+
+   /**
+    * The Vulkan descriptor set binding for vulkan_resource_index intrinsic.
+    */
+   NIR_INTRINSIC_BINDING = 7,
+
+   NIR_INTRINSIC_NUM_INDEX_FLAGS,
+
+} nir_intrinsic_index_flag;
+
 #define NIR_INTRINSIC_MAX_INPUTS 4
 
 typedef struct {
@@ -949,12 +982,48 @@ typedef struct {
    /** the number of constant indices used by the intrinsic */
    unsigned num_indices;
 
+   /** maps each index flag to its const_index slot + 1 (0 = not present) */
+   unsigned index_map[NIR_INTRINSIC_NUM_INDEX_FLAGS];
+
    /** semantic flags for calls to this intrinsic */
    nir_intrinsic_semantic_flag flags;
 } nir_intrinsic_info;
 
 extern const nir_intrinsic_info nir_intrinsic_infos[nir_num_intrinsics];
 
+
+/**
+ * Defines a typed getter/setter pair for one kind of const_index slot.
+ *
+ * nir_intrinsic_info::index_map stores, for each NIR_INTRINSIC_* index flag,
+ * the const_index slot number plus one; zero means the intrinsic has no such
+ * index, which the accessors assert against.  The getter takes a const
+ * pointer because it only reads the instruction.
+ */
+#define INTRINSIC_IDX_ACCESSORS(name, flag, type)                             \
+static inline type                                                            \
+nir_intrinsic_##name(const nir_intrinsic_instr *instr)                        \
+{                                                                             \
+   const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];   \
+   assert(info->index_map[NIR_INTRINSIC_##flag] > 0);                         \
+   return instr->const_index[info->index_map[NIR_INTRINSIC_##flag] - 1];      \
+}                                                                             \
+static inline void                                                            \
+nir_intrinsic_set_##name(nir_intrinsic_instr *instr, type val)                \
+{                                                                             \
+   const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];   \
+   assert(info->index_map[NIR_INTRINSIC_##flag] > 0);                         \
+   instr->const_index[info->index_map[NIR_INTRINSIC_##flag] - 1] = val;       \
+}
+
+INTRINSIC_IDX_ACCESSORS(write_mask, WRMASK, unsigned)
+INTRINSIC_IDX_ACCESSORS(base, BASE, int)
+INTRINSIC_IDX_ACCESSORS(stream_id, STREAM_ID, unsigned)
+INTRINSIC_IDX_ACCESSORS(ucp_id, UCP_ID, unsigned)
+INTRINSIC_IDX_ACCESSORS(range, RANGE, unsigned)
+INTRINSIC_IDX_ACCESSORS(desc_set, DESC_SET, unsigned)
+INTRINSIC_IDX_ACCESSORS(binding, BINDING, unsigned)
+
 /**
  * \group texture information
  *
diff --git a/src/compiler/nir/nir_algebraic.py b/src/compiler/nir/nir_algebraic.py
index 14c0e822ad8..2357b57117a 100644
--- a/src/compiler/nir/nir_algebraic.py
+++ b/src/compiler/nir/nir_algebraic.py
@@ -102,13 +102,10 @@ class Constant(Value):
       self.value = val
 
    def __hex__(self):
-      # Even if it's an integer, we still need to unpack as an unsigned
-      # int.  This is because, without C99, we can only assign to the first
-      # element of a union in an initializer.
       if isinstance(self.value, (bool)):
          return 'NIR_TRUE' if self.value else 'NIR_FALSE'
       if isinstance(self.value, (int, long)):
-         return hex(struct.unpack('I', struct.pack('i' if self.value < 0 else 'I', self.value))[0])
+         return hex(self.value)
       elif isinstance(self.value, float):
          return hex(struct.unpack('I', struct.pack('f', self.value))[0])
       else:
@@ -216,7 +213,7 @@ ${pass_name}_block(nir_block *block, void *void_state)
 {
    struct opt_state *state = void_state;
 
-   nir_foreach_instr_safe(block, instr) {
+   nir_foreach_instr_reverse_safe(block, instr) {
       if (instr->type != nir_instr_type_alu)
          continue;
 
@@ -255,7 +252,7 @@ ${pass_name}_impl(nir_function_impl *impl, const bool *condition_flags)
    state.progress = false;
    state.condition_flags = condition_flags;
 
-   nir_foreach_block(impl, ${pass_name}_block, &state);
+   nir_foreach_block_reverse(impl, ${pass_name}_block, &state);
 
    if (state.progress)
       nir_metadata_preserve(impl, nir_metadata_block_index |
diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h
index 1c7c78acae8..b4dde54f7e7 100644
--- a/src/compiler/nir/nir_builder.h
+++ b/src/compiler/nir/nir_builder.h
@@ -374,7 +374,7 @@ nir_store_var(nir_builder *build, nir_variable *var, nir_ssa_def *value,
    nir_intrinsic_instr *store =
       nir_intrinsic_instr_create(build->shader, nir_intrinsic_store_var);
    store->num_components = num_components;
-   store->const_index[0] = writemask;
+   nir_intrinsic_set_write_mask(store, writemask);
    store->variables[0] = nir_deref_var_create(store, var);
    store->src[0] = nir_src_for_ssa(value);
    nir_builder_instr_insert(build, &store->instr);
diff --git a/src/compiler/nir/nir_intrinsics.c b/src/compiler/nir/nir_intrinsics.c
index a7c868c39af..0257b19b348 100644
--- a/src/compiler/nir/nir_intrinsics.c
+++ b/src/compiler/nir/nir_intrinsics.c
@@ -30,7 +30,8 @@
 #define OPCODE(name) nir_intrinsic_##name
 
 #define INTRINSIC(_name, _num_srcs, _src_components, _has_dest, \
-                  _dest_components, _num_variables, _num_indices, _flags) \
+                  _dest_components, _num_variables, _num_indices, \
+                  idx0, idx1, idx2, _flags) \
 { \
    .name = #_name, \
    .num_srcs = _num_srcs, \
@@ -39,9 +40,16 @@
    .dest_components = _dest_components, \
    .num_variables = _num_variables, \
    .num_indices = _num_indices, \
+   .index_map = { \
+      [NIR_INTRINSIC_ ## idx0] = 1, \
+      [NIR_INTRINSIC_ ## idx1] = 2, \
+      [NIR_INTRINSIC_ ## idx2] = 3, \
+   }, \
    .flags = _flags \
 },
 
+#define NIR_INTRINSIC_xx 0
+
 #define LAST_INTRINSIC(name)
 
 const nir_intrinsic_info nir_intrinsic_infos[nir_num_intrinsics] = {
diff --git a/src/compiler/nir/nir_intrinsics.h b/src/compiler/nir/nir_intrinsics.h
index 3e7cf735a1b..fa162f9d126 100644
--- a/src/compiler/nir/nir_intrinsics.h
+++ b/src/compiler/nir/nir_intrinsics.h
@@ -30,7 +30,7 @@
  * expands to a list of macros of the form:
  *
  * INTRINSIC(name, num_srcs, src_components, has_dest, dest_components,
- *              num_variables, num_indices, flags)
+ *              num_variables, num_indices, idx0, idx1, idx2, flags)
  *
  * Which should correspond one-to-one with the nir_intrinsic_info structure. It
  * is included in both ir.h to create the nir_intrinsic enum (with members of
@@ -42,9 +42,9 @@
 #define ARR(...) { __VA_ARGS__ }
 
 
-INTRINSIC(load_var, 0, ARR(), true, 0, 1, 0, NIR_INTRINSIC_CAN_ELIMINATE)
-INTRINSIC(store_var, 1, ARR(0), false, 0, 1, 1, 0)
-INTRINSIC(copy_var, 0, ARR(), false, 0, 2, 0, 0)
+INTRINSIC(load_var, 0, ARR(), true, 0, 1, 0, xx, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE)
+INTRINSIC(store_var, 1, ARR(0), false, 0, 1, 1, WRMASK, xx, xx, 0)
+INTRINSIC(copy_var, 0, ARR(), false, 0, 2, 0, xx, xx, xx, 0)
 
 /*
  * Interpolation of input.  The interp_var_at* intrinsics are similar to the
@@ -54,25 +54,25 @@ INTRINSIC(copy_var, 0, ARR(), false, 0, 2, 0, 0)
  * respectively.
  */
 
-INTRINSIC(interp_var_at_centroid, 0, ARR(0), true, 0, 1, 0,
+INTRINSIC(interp_var_at_centroid, 0, ARR(0), true, 0, 1, 0, xx, xx, xx,
           NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
-INTRINSIC(interp_var_at_sample, 1, ARR(1), true, 0, 1, 0,
+INTRINSIC(interp_var_at_sample, 1, ARR(1), true, 0, 1, 0, xx, xx, xx,
           NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
-INTRINSIC(interp_var_at_offset, 1, ARR(2), true, 0, 1, 0,
+INTRINSIC(interp_var_at_offset, 1, ARR(2), true, 0, 1, 0, xx, xx, xx,
           NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 
 /*
  * Ask the driver for the size of a given buffer. It takes the buffer index
  * as source.
  */
-INTRINSIC(get_buffer_size, 1, ARR(1), true, 1, 0, 0,
+INTRINSIC(get_buffer_size, 1, ARR(1), true, 1, 0, 0, xx, xx, xx,
           NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 
 /*
  * a barrier is an intrinsic with no inputs/outputs but which can't be moved
  * around/optimized in general
  */
-#define BARRIER(name) INTRINSIC(name, 0, ARR(), false, 0, 0, 0, 0)
+#define BARRIER(name) INTRINSIC(name, 0, ARR(), false, 0, 0, 0, xx, xx, xx, 0)
 
 BARRIER(barrier)
 BARRIER(discard)
@@ -89,7 +89,7 @@ BARRIER(memory_barrier)
  * The latter can be used as code motion barrier, which is currently not
  * feasible with NIR.
  */
-INTRINSIC(shader_clock, 0, ARR(), true, 1, 0, 0, NIR_INTRINSIC_CAN_ELIMINATE)
+INTRINSIC(shader_clock, 0, ARR(), true, 1, 0, 0, xx, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE)
 
 /*
  * Memory barrier with semantics analogous to the compute shader
@@ -103,7 +103,7 @@ BARRIER(memory_barrier_image)
 BARRIER(memory_barrier_shared)
 
 /** A conditional discard, with a single boolean source. */
-INTRINSIC(discard_if, 1, ARR(1), false, 0, 0, 0, 0)
+INTRINSIC(discard_if, 1, ARR(1), false, 0, 0, 0, xx, xx, xx, 0)
 
 /**
  * Basic Geometry Shader intrinsics.
@@ -113,8 +113,8 @@ INTRINSIC(discard_if, 1, ARR(1), false, 0, 0, 0, 0)
  *
  * end_primitive implements GLSL's EndPrimitive() built-in.
  */
-INTRINSIC(emit_vertex,   0, ARR(), false, 0, 0, 1, 0)
-INTRINSIC(end_primitive, 0, ARR(), false, 0, 0, 1, 0)
+INTRINSIC(emit_vertex,   0, ARR(), false, 0, 0, 1, STREAM_ID, xx, xx, 0)
+INTRINSIC(end_primitive, 0, ARR(), false, 0, 0, 1, STREAM_ID, xx, xx, 0)
 
 /**
  * Geometry Shader intrinsics with a vertex count.
@@ -125,9 +125,9 @@ INTRINSIC(end_primitive, 0, ARR(), false, 0, 0, 1, 0)
  * These maintain a count of the number of vertices emitted, as an additional
  * unsigned integer source.
  */
-INTRINSIC(emit_vertex_with_counter, 1, ARR(1), false, 0, 0, 1, 0)
-INTRINSIC(end_primitive_with_counter, 1, ARR(1), false, 0, 0, 1, 0)
-INTRINSIC(set_vertex_count, 1, ARR(1), false, 0, 0, 0, 0)
+INTRINSIC(emit_vertex_with_counter, 1, ARR(1), false, 0, 0, 1, STREAM_ID, xx, xx, 0)
+INTRINSIC(end_primitive_with_counter, 1, ARR(1), false, 0, 0, 1, STREAM_ID, xx, xx, 0)
+INTRINSIC(set_vertex_count, 1, ARR(1), false, 0, 0, 0, xx, xx, xx, 0)
 
 /*
  * Atomic counters
@@ -137,8 +137,8 @@ INTRINSIC(set_vertex_count, 1, ARR(1), false, 0, 0, 0, 0)
  */
 
 #define ATOMIC(name, flags) \
-   INTRINSIC(atomic_counter_##name##_var, 0, ARR(), true, 1, 1, 0, flags) \
-   INTRINSIC(atomic_counter_##name, 1, ARR(1), true, 1, 0, 1, flags)
+   INTRINSIC(atomic_counter_##name##_var, 0, ARR(), true, 1, 1, 0, xx, xx, xx, flags) \
+   INTRINSIC(atomic_counter_##name, 1, ARR(1), true, 1, 0, 1, BASE, xx, xx, flags)
 
 ATOMIC(inc, 0)
 ATOMIC(dec, 0)
@@ -159,20 +159,20 @@ ATOMIC(read, NIR_INTRINSIC_CAN_ELIMINATE)
  * either one or two additional scalar arguments with the same meaning as in
  * the ARB_shader_image_load_store specification.
  */
-INTRINSIC(image_load, 2, ARR(4, 1), true, 4, 1, 0,
+INTRINSIC(image_load, 2, ARR(4, 1), true, 4, 1, 0, xx, xx, xx,
           NIR_INTRINSIC_CAN_ELIMINATE)
-INTRINSIC(image_store, 3, ARR(4, 1, 4), false, 0, 1, 0, 0)
-INTRINSIC(image_atomic_add, 3, ARR(4, 1, 1), true, 1, 1, 0, 0)
-INTRINSIC(image_atomic_min, 3, ARR(4, 1, 1), true, 1, 1, 0, 0)
-INTRINSIC(image_atomic_max, 3, ARR(4, 1, 1), true, 1, 1, 0, 0)
-INTRINSIC(image_atomic_and, 3, ARR(4, 1, 1), true, 1, 1, 0, 0)
-INTRINSIC(image_atomic_or, 3, ARR(4, 1, 1), true, 1, 1, 0, 0)
-INTRINSIC(image_atomic_xor, 3, ARR(4, 1, 1), true, 1, 1, 0, 0)
-INTRINSIC(image_atomic_exchange, 3, ARR(4, 1, 1), true, 1, 1, 0, 0)
-INTRINSIC(image_atomic_comp_swap, 4, ARR(4, 1, 1, 1), true, 1, 1, 0, 0)
-INTRINSIC(image_size, 0, ARR(), true, 4, 1, 0,
+INTRINSIC(image_store, 3, ARR(4, 1, 4), false, 0, 1, 0, xx, xx, xx, 0)
+INTRINSIC(image_atomic_add, 3, ARR(4, 1, 1), true, 1, 1, 0, xx, xx, xx, 0)
+INTRINSIC(image_atomic_min, 3, ARR(4, 1, 1), true, 1, 1, 0, xx, xx, xx, 0)
+INTRINSIC(image_atomic_max, 3, ARR(4, 1, 1), true, 1, 1, 0, xx, xx, xx, 0)
+INTRINSIC(image_atomic_and, 3, ARR(4, 1, 1), true, 1, 1, 0, xx, xx, xx, 0)
+INTRINSIC(image_atomic_or, 3, ARR(4, 1, 1), true, 1, 1, 0, xx, xx, xx, 0)
+INTRINSIC(image_atomic_xor, 3, ARR(4, 1, 1), true, 1, 1, 0, xx, xx, xx, 0)
+INTRINSIC(image_atomic_exchange, 3, ARR(4, 1, 1), true, 1, 1, 0, xx, xx, xx, 0)
+INTRINSIC(image_atomic_comp_swap, 4, ARR(4, 1, 1, 1), true, 1, 1, 0, xx, xx, xx, 0)
+INTRINSIC(image_size, 0, ARR(), true, 4, 1, 0, xx, xx, xx,
           NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
-INTRINSIC(image_samples, 0, ARR(), true, 1, 1, 0,
+INTRINSIC(image_samples, 0, ARR(), true, 1, 1, 0, xx, xx, xx,
           NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 
 /*
@@ -191,7 +191,8 @@ INTRINSIC(image_samples, 0, ARR(), true, 1, 1, 0,
  * The intended usage is that the shader will call vulkan_surface_index to
  * get an index and then pass that as the buffer index ubo/ssbo calls.
  */
-INTRINSIC(vulkan_resource_index, 1, ARR(1), true, 1, 0, 3,
+INTRINSIC(vulkan_resource_index, 1, ARR(1), true, 1, 0, 2,
+          DESC_SET, BINDING, xx,
           NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 
 /*
@@ -210,16 +211,16 @@ INTRINSIC(vulkan_resource_index, 1, ARR(1), true, 1, 0, 3,
  *
  * All operations take 1 variable deref.
  */
-INTRINSIC(var_atomic_add, 1, ARR(1), true, 1, 1, 0, 0)
-INTRINSIC(var_atomic_imin, 1, ARR(1), true, 1, 1, 0, 0)
-INTRINSIC(var_atomic_umin, 1, ARR(1), true, 1, 1, 0, 0)
-INTRINSIC(var_atomic_imax, 1, ARR(1), true, 1, 1, 0, 0)
-INTRINSIC(var_atomic_umax, 1, ARR(1), true, 1, 1, 0, 0)
-INTRINSIC(var_atomic_and, 1, ARR(1), true, 1, 1, 0, 0)
-INTRINSIC(var_atomic_or, 1, ARR(1), true, 1, 1, 0, 0)
-INTRINSIC(var_atomic_xor, 1, ARR(1), true, 1, 1, 0, 0)
-INTRINSIC(var_atomic_exchange, 1, ARR(1), true, 1, 1, 0, 0)
-INTRINSIC(var_atomic_comp_swap, 2, ARR(1, 1), true, 1, 1, 0, 0)
+INTRINSIC(var_atomic_add, 1, ARR(1), true, 1, 1, 0, xx, xx, xx, 0)
+INTRINSIC(var_atomic_imin, 1, ARR(1), true, 1, 1, 0, xx, xx, xx, 0)
+INTRINSIC(var_atomic_umin, 1, ARR(1), true, 1, 1, 0, xx, xx, xx, 0)
+INTRINSIC(var_atomic_imax, 1, ARR(1), true, 1, 1, 0, xx, xx, xx, 0)
+INTRINSIC(var_atomic_umax, 1, ARR(1), true, 1, 1, 0, xx, xx, xx, 0)
+INTRINSIC(var_atomic_and, 1, ARR(1), true, 1, 1, 0, xx, xx, xx, 0)
+INTRINSIC(var_atomic_or, 1, ARR(1), true, 1, 1, 0, xx, xx, xx, 0)
+INTRINSIC(var_atomic_xor, 1, ARR(1), true, 1, 1, 0, xx, xx, xx, 0)
+INTRINSIC(var_atomic_exchange, 1, ARR(1), true, 1, 1, 0, xx, xx, xx, 0)
+INTRINSIC(var_atomic_comp_swap, 2, ARR(1, 1), true, 1, 1, 0, xx, xx, xx, 0)
 
 /*
  * SSBO atomic intrinsics
@@ -238,16 +239,16 @@ INTRINSIC(var_atomic_comp_swap, 2, ARR(1, 1), true, 1, 1, 0, 0)
  *    in ssbo_atomic_add, etc).
  * 3: For CompSwap only: the second data parameter.
  */
-INTRINSIC(ssbo_atomic_add, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
-INTRINSIC(ssbo_atomic_imin, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
-INTRINSIC(ssbo_atomic_umin, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
-INTRINSIC(ssbo_atomic_imax, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
-INTRINSIC(ssbo_atomic_umax, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
-INTRINSIC(ssbo_atomic_and, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
-INTRINSIC(ssbo_atomic_or, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
-INTRINSIC(ssbo_atomic_xor, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
-INTRINSIC(ssbo_atomic_exchange, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
-INTRINSIC(ssbo_atomic_comp_swap, 4, ARR(1, 1, 1, 1), true, 1, 0, 0, 0)
+INTRINSIC(ssbo_atomic_add, 3, ARR(1, 1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(ssbo_atomic_imin, 3, ARR(1, 1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(ssbo_atomic_umin, 3, ARR(1, 1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(ssbo_atomic_imax, 3, ARR(1, 1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(ssbo_atomic_umax, 3, ARR(1, 1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(ssbo_atomic_and, 3, ARR(1, 1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(ssbo_atomic_or, 3, ARR(1, 1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(ssbo_atomic_xor, 3, ARR(1, 1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(ssbo_atomic_exchange, 3, ARR(1, 1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(ssbo_atomic_comp_swap, 4, ARR(1, 1, 1, 1), true, 1, 0, 0, xx, xx, xx, 0)
 
 /*
  * CS shared variable atomic intrinsics
@@ -265,42 +266,43 @@ INTRINSIC(ssbo_atomic_comp_swap, 4, ARR(1, 1, 1, 1), true, 1, 0, 0, 0)
  *    in shared_atomic_add, etc).
  * 2: For CompSwap only: the second data parameter.
  */
-INTRINSIC(shared_atomic_add, 2, ARR(1, 1), true, 1, 0, 0, 0)
-INTRINSIC(shared_atomic_imin, 2, ARR(1, 1), true, 1, 0, 0, 0)
-INTRINSIC(shared_atomic_umin, 2, ARR(1, 1), true, 1, 0, 0, 0)
-INTRINSIC(shared_atomic_imax, 2, ARR(1, 1), true, 1, 0, 0, 0)
-INTRINSIC(shared_atomic_umax, 2, ARR(1, 1), true, 1, 0, 0, 0)
-INTRINSIC(shared_atomic_and, 2, ARR(1, 1), true, 1, 0, 0, 0)
-INTRINSIC(shared_atomic_or, 2, ARR(1, 1), true, 1, 0, 0, 0)
-INTRINSIC(shared_atomic_xor, 2, ARR(1, 1), true, 1, 0, 0, 0)
-INTRINSIC(shared_atomic_exchange, 2, ARR(1, 1), true, 1, 0, 0, 0)
-INTRINSIC(shared_atomic_comp_swap, 3, ARR(1, 1, 1), true, 1, 0, 0, 0)
-
-#define SYSTEM_VALUE(name, components, num_indices) \
+INTRINSIC(shared_atomic_add, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(shared_atomic_imin, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(shared_atomic_umin, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(shared_atomic_imax, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(shared_atomic_umax, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(shared_atomic_and, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(shared_atomic_or, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(shared_atomic_xor, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(shared_atomic_exchange, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(shared_atomic_comp_swap, 3, ARR(1, 1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+
+#define SYSTEM_VALUE(name, components, num_indices, idx0, idx1, idx2) \
    INTRINSIC(load_##name, 0, ARR(), true, components, 0, num_indices, \
+   idx0, idx1, idx2, \
    NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 
-SYSTEM_VALUE(front_face, 1, 0)
-SYSTEM_VALUE(vertex_id, 1, 0)
-SYSTEM_VALUE(vertex_id_zero_base, 1, 0)
-SYSTEM_VALUE(base_vertex, 1, 0)
-SYSTEM_VALUE(instance_id, 1, 0)
-SYSTEM_VALUE(base_instance, 1, 0)
-SYSTEM_VALUE(draw_id, 1, 0)
-SYSTEM_VALUE(sample_id, 1, 0)
-SYSTEM_VALUE(sample_pos, 2, 0)
-SYSTEM_VALUE(sample_mask_in, 1, 0)
-SYSTEM_VALUE(primitive_id, 1, 0)
-SYSTEM_VALUE(invocation_id, 1, 0)
-SYSTEM_VALUE(tess_coord, 3, 0)
-SYSTEM_VALUE(tess_level_outer, 4, 0)
-SYSTEM_VALUE(tess_level_inner, 2, 0)
-SYSTEM_VALUE(patch_vertices_in, 1, 0)
-SYSTEM_VALUE(local_invocation_id, 3, 0)
-SYSTEM_VALUE(work_group_id, 3, 0)
-SYSTEM_VALUE(user_clip_plane, 4, 1) /* const_index[0] is user_clip_plane[idx] */
-SYSTEM_VALUE(num_work_groups, 3, 0)
-SYSTEM_VALUE(helper_invocation, 1, 0)
+SYSTEM_VALUE(front_face, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(vertex_id, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(vertex_id_zero_base, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(base_vertex, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(instance_id, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(base_instance, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(draw_id, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(sample_id, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(sample_pos, 2, 0, xx, xx, xx)
+SYSTEM_VALUE(sample_mask_in, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(primitive_id, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(invocation_id, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(tess_coord, 3, 0, xx, xx, xx)
+SYSTEM_VALUE(tess_level_outer, 4, 0, xx, xx, xx)
+SYSTEM_VALUE(tess_level_inner, 2, 0, xx, xx, xx)
+SYSTEM_VALUE(patch_vertices_in, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(local_invocation_id, 3, 0, xx, xx, xx)
+SYSTEM_VALUE(work_group_id, 3, 0, xx, xx, xx)
+SYSTEM_VALUE(user_clip_plane, 4, 1, UCP_ID, xx, xx)
+SYSTEM_VALUE(num_work_groups, 3, 0, xx, xx, xx)
+SYSTEM_VALUE(helper_invocation, 1, 0, xx, xx, xx)
 
 /*
  * Load operations pull data from some piece of GPU memory.  All load
@@ -323,27 +325,29 @@ SYSTEM_VALUE(helper_invocation, 1, 0)
  * offsets are always in bytes.
  */
 
-#define LOAD(name, srcs, indices, flags) \
-   INTRINSIC(load_##name, srcs, ARR(1, 1, 1, 1), true, 0, 0, indices, flags)
+#define LOAD(name, srcs, num_indices, idx0, idx1, idx2, flags) \
+   INTRINSIC(load_##name, srcs, ARR(1, 1, 1, 1), true, 0, 0, num_indices, idx0, idx1, idx2, flags)
 
-/* src[] = { offset }. const_index[] = { base, size } */
-LOAD(uniform, 1, 2, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
+/* src[] = { offset }. const_index[] = { base, range } */
+LOAD(uniform, 1, 2, BASE, RANGE, xx,
+     NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 /* src[] = { buffer_index, offset }. No const_index */
-LOAD(ubo, 2, 0, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
+LOAD(ubo, 2, 0, xx, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 /* src[] = { offset }. const_index[] = { base } */
-LOAD(input, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
+LOAD(input, 1, 1, BASE, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 /* src[] = { vertex, offset }. const_index[] = { base } */
-LOAD(per_vertex_input, 2, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
+LOAD(per_vertex_input, 2, 1, BASE, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 /* src[] = { buffer_index, offset }. No const_index */
-LOAD(ssbo, 2, 0, NIR_INTRINSIC_CAN_ELIMINATE)
+LOAD(ssbo, 2, 0, xx, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE)
 /* src[] = { offset }. const_index[] = { base } */
-LOAD(output, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE)
+LOAD(output, 1, 1, BASE, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE)
 /* src[] = { vertex, offset }. const_index[] = { base } */
-LOAD(per_vertex_output, 2, 1, NIR_INTRINSIC_CAN_ELIMINATE)
+LOAD(per_vertex_output, 2, 1, BASE, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE)
 /* src[] = { offset }. const_index[] = { base } */
-LOAD(shared, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE)
-/* src[] = { offset }. const_index[] = { base, size } */
-LOAD(push_constant, 1, 2, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
+LOAD(shared, 1, 1, BASE, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE)
+/* src[] = { offset }. const_index[] = { base, range } */
+LOAD(push_constant, 1, 2, BASE, RANGE, xx,
+     NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 
 /*
  * Stores work the same way as loads, except now the first source is the value
@@ -352,16 +356,16 @@ LOAD(push_constant, 1, 2, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDE
  * const_index[0].
  */
 
-#define STORE(name, srcs, indices, flags) \
-   INTRINSIC(store_##name, srcs, ARR(0, 1, 1, 1), false, 0, 0, indices, flags)
+#define STORE(name, srcs, num_indices, idx0, idx1, idx2, flags) \
+   INTRINSIC(store_##name, srcs, ARR(0, 1, 1, 1), false, 0, 0, num_indices, idx0, idx1, idx2, flags)
 
 /* src[] = { value, offset }. const_index[] = { base, write_mask } */
-STORE(output, 2, 2, 0)
+STORE(output, 2, 2, BASE, WRMASK, xx, 0)
 /* src[] = { value, vertex, offset }. const_index[] = { base, write_mask } */
-STORE(per_vertex_output, 3, 2, 0)
+STORE(per_vertex_output, 3, 2, BASE, WRMASK, xx, 0)
 /* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
-STORE(ssbo, 3, 1, 0)
+STORE(ssbo, 3, 1, WRMASK, xx, xx, 0)
 /* src[] = { value, offset }. const_index[] = { base, write_mask } */
-STORE(shared, 2, 2, 0)
+STORE(shared, 2, 2, BASE, WRMASK, xx, 0)
 
 LAST_INTRINSIC(store_shared)
diff --git a/src/compiler/nir/nir_lower_atomics.c b/src/compiler/nir/nir_lower_atomics.c
index b07e199d71b..eefcb55a0a6 100644
--- a/src/compiler/nir/nir_lower_atomics.c
+++ b/src/compiler/nir/nir_lower_atomics.c
@@ -71,8 +71,8 @@ lower_instr(nir_intrinsic_instr *instr,
    unsigned uniform_loc = instr->variables[0]->var->data.location;
 
    nir_intrinsic_instr *new_instr = nir_intrinsic_instr_create(mem_ctx, op);
-   new_instr->const_index[0] =
-      state->shader_program->UniformStorage[uniform_loc].opaque[state->shader->stage].index;
+   nir_intrinsic_set_base(new_instr,
+      state->shader_program->UniformStorage[uniform_loc].opaque[state->shader->stage].index);
 
    nir_load_const_instr *offset_const = nir_load_const_instr_create(mem_ctx, 1);
    offset_const->value.u[0] = instr->variables[0]->var->data.offset;
diff --git a/src/compiler/nir/nir_lower_clip.c b/src/compiler/nir/nir_lower_clip.c
index 0ca6a289396..bcbad536874 100644
--- a/src/compiler/nir/nir_lower_clip.c
+++ b/src/compiler/nir/nir_lower_clip.c
@@ -71,8 +71,8 @@ store_clipdist_output(nir_builder *b, nir_variable *out, nir_ssa_def **val)
 
    store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
    store->num_components = 4;
-   store->const_index[0] = out->data.driver_location;
-   store->const_index[1] = 0xf;   /* wrmask */
+   nir_intrinsic_set_base(store, out->data.driver_location);
+   nir_intrinsic_set_write_mask(store, 0xf);
    store->src[0].ssa = nir_vec4(b, val[0], val[1], val[2], val[3]);
    store->src[0].is_ssa = true;
    store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
@@ -86,7 +86,7 @@ load_clipdist_input(nir_builder *b, nir_variable *in, nir_ssa_def **val)
 
    load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_input);
    load->num_components = 4;
-   load->const_index[0] = in->data.driver_location;
+   nir_intrinsic_set_base(load, in->data.driver_location);
    load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
    nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
    nir_builder_instr_insert(b, &load->instr);
@@ -112,7 +112,7 @@ find_output_in_block(nir_block *block, void *void_state)
       if (instr->type == nir_instr_type_intrinsic) {
          nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
          if ((intr->intrinsic == nir_intrinsic_store_output) &&
-             intr->const_index[0] == state->drvloc) {
+             nir_intrinsic_base(intr) == state->drvloc) {
             assert(state->def == NULL);
             assert(intr->src[0].is_ssa);
             assert(nir_src_as_const_value(intr->src[1]));
diff --git a/src/compiler/nir/nir_lower_gs_intrinsics.c b/src/compiler/nir/nir_lower_gs_intrinsics.c
index fdff1656b4d..14abfe3f509 100644
--- a/src/compiler/nir/nir_lower_gs_intrinsics.c
+++ b/src/compiler/nir/nir_lower_gs_intrinsics.c
@@ -93,7 +93,7 @@ rewrite_emit_vertex(nir_intrinsic_instr *intrin, struct state *state)
    nir_intrinsic_instr *lowered =
       nir_intrinsic_instr_create(b->shader,
                                  nir_intrinsic_emit_vertex_with_counter);
-   lowered->const_index[0] = intrin->const_index[0];
+   nir_intrinsic_set_stream_id(lowered, nir_intrinsic_stream_id(intrin));
    lowered->src[0] = nir_src_for_ssa(count);
    nir_builder_instr_insert(b, &lowered->instr);
 
@@ -121,7 +121,7 @@ rewrite_end_primitive(nir_intrinsic_instr *intrin, struct state *state)
    nir_intrinsic_instr *lowered =
       nir_intrinsic_instr_create(b->shader,
                                  nir_intrinsic_end_primitive_with_counter);
-   lowered->const_index[0] = intrin->const_index[0];
+   nir_intrinsic_set_stream_id(lowered, nir_intrinsic_stream_id(intrin));
    lowered->src[0] = nir_src_for_ssa(count);
    nir_builder_instr_insert(b, &lowered->instr);
 
diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c
index 2c5fa16af5e..84e353775cf 100644
--- a/src/compiler/nir/nir_lower_io.c
+++ b/src/compiler/nir/nir_lower_io.c
@@ -274,8 +274,8 @@ nir_lower_io_block(nir_block *block, void *void_state)
                                        load_op(state, mode, per_vertex));
          load->num_components = intrin->num_components;
 
-         load->const_index[0] =
-            intrin->variables[0]->var->data.driver_location;
+         nir_intrinsic_set_base(load,
+            intrin->variables[0]->var->data.driver_location);
 
          if (load->intrinsic == nir_intrinsic_load_uniform) {
             load->const_index[1] =
@@ -321,11 +321,9 @@ nir_lower_io_block(nir_block *block, void *void_state)
 
          nir_src_copy(&store->src[0], &intrin->src[0], store);
 
-         store->const_index[0] =
-            intrin->variables[0]->var->data.driver_location;
-
-         /* Copy the writemask */
-         store->const_index[1] = intrin->const_index[0];
+         nir_intrinsic_set_base(store,
+            intrin->variables[0]->var->data.driver_location);
+         nir_intrinsic_set_write_mask(store, nir_intrinsic_write_mask(intrin));
 
          if (per_vertex)
             store->src[1] = nir_src_for_ssa(vertex_index);
diff --git a/src/compiler/nir/nir_lower_locals_to_regs.c b/src/compiler/nir/nir_lower_locals_to_regs.c
index 51b0fa733f2..45036fa7787 100644
--- a/src/compiler/nir/nir_lower_locals_to_regs.c
+++ b/src/compiler/nir/nir_lower_locals_to_regs.c
@@ -243,7 +243,7 @@ lower_locals_to_regs_block(nir_block *block, void *void_state)
 
          nir_alu_instr *mov = nir_alu_instr_create(state->shader, nir_op_imov);
          nir_src_copy(&mov->src[0].src, &intrin->src[0], mov);
-         mov->dest.write_mask = intrin->const_index[0];
+         mov->dest.write_mask = nir_intrinsic_write_mask(intrin);
          mov->dest.dest.is_ssa = false;
          mov->dest.dest.reg.reg = reg_src.reg.reg;
          mov->dest.dest.reg.base_offset = reg_src.reg.base_offset;
diff --git a/src/compiler/nir/nir_lower_two_sided_color.c b/src/compiler/nir/nir_lower_two_sided_color.c
index 1294cb89004..fe3507cb7a3 100644
--- a/src/compiler/nir/nir_lower_two_sided_color.c
+++ b/src/compiler/nir/nir_lower_two_sided_color.c
@@ -72,7 +72,7 @@ load_input(nir_builder *b, nir_variable *in)
 
    load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_input);
    load->num_components = 4;
-   load->const_index[0] = in->data.driver_location;
+   nir_intrinsic_set_base(load, in->data.driver_location);
    load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
    nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
    nir_builder_instr_insert(b, &load->instr);
@@ -151,7 +151,7 @@ nir_lower_two_sided_color_block(nir_block *block, void *void_state)
       for (idx = 0; idx < state->colors_count; idx++) {
          unsigned drvloc =
             state->colors[idx].front->data.driver_location;
-         if (intr->const_index[0] == drvloc) {
+         if (nir_intrinsic_base(intr) == drvloc) {
             assert(nir_src_as_const_value(intr->src[0]));
             break;
          }
diff --git a/src/compiler/nir/nir_lower_var_copies.c b/src/compiler/nir/nir_lower_var_copies.c
index 8cb3edd0a84..7db9839c369 100644
--- a/src/compiler/nir/nir_lower_var_copies.c
+++ b/src/compiler/nir/nir_lower_var_copies.c
@@ -128,7 +128,7 @@ emit_copy_load_store(nir_intrinsic_instr *copy_instr,
       nir_intrinsic_instr *store =
          nir_intrinsic_instr_create(mem_ctx, nir_intrinsic_store_var);
       store->num_components = num_components;
-      store->const_index[0] = (1 << num_components) - 1;
+      nir_intrinsic_set_write_mask(store, (1 << num_components) - 1);
       store->variables[0] = nir_deref_as_var(nir_copy_deref(store, &dest_head->deref));
 
       store->src[0].is_ssa = true;
diff --git a/src/compiler/nir/nir_lower_vars_to_ssa.c b/src/compiler/nir/nir_lower_vars_to_ssa.c
index e1f368d2f2b..a3f3fcfd9b4 100644
--- a/src/compiler/nir/nir_lower_vars_to_ssa.c
+++ b/src/compiler/nir/nir_lower_vars_to_ssa.c
@@ -560,7 +560,8 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state)
          nir_ssa_def *new_def;
          b.cursor = nir_before_instr(&intrin->instr);
 
-         if (intrin->const_index[0] == (1 << intrin->num_components) - 1) {
+         unsigned wrmask = nir_intrinsic_write_mask(intrin);
+         if (wrmask == (1 << intrin->num_components) - 1) {
             /* Whole variable store - just copy the source.  Note that
              * intrin->num_components and intrin->src[0].ssa->num_components
              * may differ.
@@ -580,7 +581,7 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state)
              */
             nir_ssa_def *srcs[4];
             for (unsigned i = 0; i < intrin->num_components; i++) {
-               if (intrin->const_index[0] & (1 << i)) {
+               if (wrmask & (1 << i)) {
                   srcs[i] = nir_channel(&b, intrin->src[0].ssa, i);
                } else {
                   srcs[i] = nir_channel(&b, old_def, i);
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index d4f4a3d903c..c9c917b77a5 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -151,6 +151,8 @@ optimizations = [
    (('ior', a, 0), a),
    (('fxor', a, a), 0.0),
    (('ixor', a, a), 0),
+   (('fxor', a, 0.0), a),
+   (('ixor', a, 0), a),
    (('inot', ('inot', a)), a),
    # DeMorgan's Laws
    (('iand', ('inot', a), ('inot', b)), ('inot', ('ior',  a, b))),
@@ -167,6 +169,8 @@ optimizations = [
    (('flog2', ('fexp2', a)), a), # lg2(2^a) = a
    (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b)
    (('fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
+   (('fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
+    ('fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) * d) = a^b * c^d
    (('fpow', a, 1.0), a),
    (('fpow', a, 2.0), ('fmul', a, a)),
    (('fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
@@ -313,6 +317,19 @@ optimizations = [
      'options->lower_unpack_snorm_4x8'),
 ]
 
+# Unreal Engine 4 demo applications open-code bitfieldReverse()
+def bitfield_reverse(u):
+    step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))
+    step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8))
+    step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4))
+    step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2))
+    step5 = ('ior', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1))
+
+    return step5
+
+optimizations += [(bitfield_reverse('x'), ('bitfield_reverse', 'x'))]
+
+
 # Add optimizations to handle the case where the result of a ternary is
 # compared to a constant.  This way we can take things like
 #
diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c
index 276a948460c..f0ac0f21dd0 100644
--- a/src/compiler/nir/nir_print.c
+++ b/src/compiler/nir/nir_print.c
@@ -451,15 +451,16 @@ print_deref(nir_deref_var *deref, print_state *state)
 static void
 print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state)
 {
-   unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
+   const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
+   unsigned num_srcs = info->num_srcs;
    FILE *fp = state->fp;
 
-   if (nir_intrinsic_infos[instr->intrinsic].has_dest) {
+   if (info->has_dest) {
       print_dest(&instr->dest, state);
       fprintf(fp, " = ");
    }
 
-   fprintf(fp, "intrinsic %s (", nir_intrinsic_infos[instr->intrinsic].name);
+   fprintf(fp, "intrinsic %s (", info->name);
 
    for (unsigned i = 0; i < num_srcs; i++) {
       if (i != 0)
@@ -470,9 +471,7 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state)
 
    fprintf(fp, ") (");
 
-   unsigned num_vars = nir_intrinsic_infos[instr->intrinsic].num_variables;
-
-   for (unsigned i = 0; i < num_vars; i++) {
+   for (unsigned i = 0; i < info->num_variables; i++) {
       if (i != 0)
          fprintf(fp, ", ");
 
@@ -481,9 +480,7 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state)
 
    fprintf(fp, ") (");
 
-   unsigned num_indices = nir_intrinsic_infos[instr->intrinsic].num_indices;
-
-   for (unsigned i = 0; i < num_indices; i++) {
+   for (unsigned i = 0; i < info->num_indices; i++) {
       if (i != 0)
          fprintf(fp, ", ");
 
@@ -492,6 +489,34 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state)
 
    fprintf(fp, ")");
 
+   static const char *index_name[NIR_INTRINSIC_NUM_INDEX_FLAGS] = {
+      [NIR_INTRINSIC_BASE] = "base",
+      [NIR_INTRINSIC_WRMASK] = "wrmask",
+      [NIR_INTRINSIC_STREAM_ID] = "stream-id",
+      [NIR_INTRINSIC_UCP_ID] = "ucp-id",
+      [NIR_INTRINSIC_RANGE] = "range",
+      [NIR_INTRINSIC_DESC_SET] = "desc-set",
+      [NIR_INTRINSIC_BINDING] = "binding",
+   };
+   for (unsigned idx = 1; idx < NIR_INTRINSIC_NUM_INDEX_FLAGS; idx++) {
+      if (!info->index_map[idx])
+         continue;
+      fprintf(fp, " /*");
+      if (idx == NIR_INTRINSIC_WRMASK) {
+         /* Special-case wrmask: print it as component letters (xyzw). */
+         unsigned wrmask = nir_intrinsic_write_mask(instr);
+         fprintf(fp, " wrmask=");
+         for (unsigned i = 0; i < 4; i++)
+            if ((wrmask >> i) & 1)
+               fprintf(fp, "%c", "xyzw"[i]);
+      } else {
+         unsigned off = info->index_map[idx] - 1;
+         assert(index_name[idx]);  /* forgot to update index_name table? */
+         fprintf(fp, " %s=%d", index_name[idx], instr->const_index[off]);
+      }
+      fprintf(fp, " */");
+   }
+
    if (!state->shader)
       return;
 
@@ -515,7 +540,7 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state)
    }
 
    nir_foreach_variable(var, var_list) {
-      if ((var->data.driver_location == instr->const_index[0]) &&
+      if ((var->data.driver_location == nir_intrinsic_base(instr)) &&
           var->name) {
          fprintf(fp, "\t/* %s */", var->name);
          break;
diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c
index 00184cabe20..0509d482f0b 100644
--- a/src/compiler/nir/nir_validate.c
+++ b/src/compiler/nir/nir_validate.c
@@ -417,7 +417,7 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
       assert(instr->variables[0]->var->data.mode != nir_var_shader_in &&
              instr->variables[0]->var->data.mode != nir_var_uniform &&
              instr->variables[0]->var->data.mode != nir_var_shader_storage);
-      assert((instr->const_index[0] & ~((1 << instr->num_components) - 1)) == 0);
+      assert((nir_intrinsic_write_mask(instr) & ~((1 << instr->num_components) - 1)) == 0);
       break;
    }
    case nir_intrinsic_copy_var:
diff --git a/src/compiler/nir/spirv/vtn_variables.c b/src/compiler/nir/spirv/vtn_variables.c
index 3ad98aa5310..5ca24201498 100644
--- a/src/compiler/nir/spirv/vtn_variables.c
+++ b/src/compiler/nir/spirv/vtn_variables.c
@@ -319,8 +319,8 @@ get_vulkan_resource_index(struct vtn_builder *b, struct vtn_access_chain *chain,
       nir_intrinsic_instr_create(b->nb.shader,
                                  nir_intrinsic_vulkan_resource_index);
    instr->src[0] = nir_src_for_ssa(array_index);
-   instr->const_index[0] = chain->var->descriptor_set;
-   instr->const_index[1] = chain->var->binding;
+   nir_intrinsic_set_desc_set(instr, chain->var->descriptor_set);
+   nir_intrinsic_set_binding(instr, chain->var->binding);
 
    nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
    nir_builder_instr_insert(&b->nb, &instr->instr);
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 6f50f714c3f..84da85c5b96 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -191,11 +191,13 @@ C_SOURCES := \
 	util/u_cpu_detect.c \
 	util/u_cpu_detect.h \
 	util/u_debug.c \
+	util/u_debug.h \
 	util/u_debug_describe.c \
 	util/u_debug_describe.h \
 	util/u_debug_flush.c \
 	util/u_debug_flush.h \
-	util/u_debug.h \
+	util/u_debug_image.c \
+	util/u_debug_image.h \
 	util/u_debug_memory.c \
 	util/u_debug_refcnt.c \
 	util/u_debug_refcnt.h \
diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
index cf52ca48b26..0298334a28f 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -43,10 +43,10 @@
 #include "util/u_format.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_pstipple.h"
 #include "util/u_sampler.h"
 
 #include "tgsi/tgsi_transform.h"
-#include "tgsi/tgsi_dump.h"
 
 #include "draw_context.h"
 #include "draw_pipe.h"
@@ -114,178 +114,6 @@ struct pstip_stage
 };
 
 
-
-/**
- * Subclass of tgsi_transform_context, used for transforming the
- * user's fragment shader to add the extra texture sample and fragment kill
- * instructions.
- */
-struct pstip_transform_context {
-   struct tgsi_transform_context base;
-   uint tempsUsed;  /**< bitmask */
-   int wincoordInput;
-   int maxInput;
-   uint samplersUsed;  /**< bitfield of samplers used */
-   bool hasSview;
-   int freeSampler;  /** an available sampler for the pstipple */
-   int texTemp;  /**< temp registers */
-   int numImmed;
-};
-
-
-/**
- * TGSI declaration transform callback.
- * Look for a free sampler, a free input attrib, and two free temp regs.
- */
-static void
-pstip_transform_decl(struct tgsi_transform_context *ctx,
-                     struct tgsi_full_declaration *decl)
-{
-   struct pstip_transform_context *pctx = (struct pstip_transform_context *) ctx;
-
-   if (decl->Declaration.File == TGSI_FILE_SAMPLER) {
-      uint i;
-      for (i = decl->Range.First;
-           i <= decl->Range.Last; i++) {
-         pctx->samplersUsed |= 1 << i;
-      }
-   }
-   else if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
-      pctx->hasSview = true;
-   }
-   else if (decl->Declaration.File == TGSI_FILE_INPUT) {
-      pctx->maxInput = MAX2(pctx->maxInput, (int) decl->Range.Last);
-      if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION)
-         pctx->wincoordInput = (int) decl->Range.First;
-   }
-   else if (decl->Declaration.File == TGSI_FILE_TEMPORARY) {
-      uint i;
-      for (i = decl->Range.First;
-           i <= decl->Range.Last; i++) {
-         pctx->tempsUsed |= (1 << i);
-      }
-   }
-
-   ctx->emit_declaration(ctx, decl);
-}
-
-
-/**
- * TGSI immediate declaration transform callback.
- * We're just counting the number of immediates here.
- */
-static void
-pstip_transform_immed(struct tgsi_transform_context *ctx,
-                      struct tgsi_full_immediate *immed)
-{
-   struct pstip_transform_context *pctx = (struct pstip_transform_context *) ctx;
-   ctx->emit_immediate(ctx, immed); /* emit to output shader */
-   pctx->numImmed++;
-}
-
-
-/**
- * Find the lowest zero bit in the given word, or -1 if bitfield is all ones.
- */
-static int
-free_bit(uint bitfield)
-{
-   return ffs(~bitfield) - 1;
-}
-
-
-/**
- * TGSI transform prolog callback.
- */
-static void
-pstip_transform_prolog(struct tgsi_transform_context *ctx)
-{
-   struct pstip_transform_context *pctx = (struct pstip_transform_context *) ctx;
-   uint i;
-   int wincoordInput;
-
-   /* find free sampler */
-   pctx->freeSampler = free_bit(pctx->samplersUsed);
-   if (pctx->freeSampler >= PIPE_MAX_SAMPLERS)
-      pctx->freeSampler = PIPE_MAX_SAMPLERS - 1;
-
-   if (pctx->wincoordInput < 0)
-      wincoordInput = pctx->maxInput + 1;
-   else
-      wincoordInput = pctx->wincoordInput;
-
-   /* find one free temp reg */
-   for (i = 0; i < 32; i++) {
-      if ((pctx->tempsUsed & (1 << i)) == 0) {
-      /* found a free temp */
-      if (pctx->texTemp < 0)
-         pctx->texTemp  = i;
-      else
-         break;
-      }
-   }
-   assert(pctx->texTemp >= 0);
-
-   if (pctx->wincoordInput < 0) {
-      /* declare new position input reg */
-      tgsi_transform_input_decl(ctx, wincoordInput,
-                                TGSI_SEMANTIC_POSITION, 1,
-                                TGSI_INTERPOLATE_LINEAR);
-   }
-
-   /* declare new sampler */
-   tgsi_transform_sampler_decl(ctx, pctx->freeSampler);
-
-   /* if the src shader has SVIEW decl's for each SAMP decl, we
-    * need to continue the trend and ensure there is a matching
-    * SVIEW for the new SAMP we just created
-    */
-   if (pctx->hasSview) {
-      tgsi_transform_sampler_view_decl(ctx,
-                                       pctx->freeSampler,
-                                       TGSI_TEXTURE_2D,
-                                       TGSI_RETURN_TYPE_FLOAT);
-   }
-
-   /* declare new temp regs */
-   tgsi_transform_temp_decl(ctx, pctx->texTemp);
-
-   /* emit immediate = {1/32, 1/32, 1, 1}
-    * The index/position of this immediate will be pctx->numImmed
-    */
-   tgsi_transform_immediate_decl(ctx, 1.0/32.0, 1.0/32.0, 1.0, 1.0);
-
-   /* 
-    * Insert new MUL/TEX/KILL_IF instructions at start of program
-    * Take gl_FragCoord, divide by 32 (stipple size), sample the
-    * texture and kill fragment if needed.
-    *
-    * We'd like to use non-normalized texcoords to index into a RECT
-    * texture, but we can only use GL_REPEAT wrap mode with normalized
-    * texcoords.  Darn.
-    */
-
-   /* MUL texTemp, INPUT[wincoord], 1/32; */
-   tgsi_transform_op2_inst(ctx, TGSI_OPCODE_MUL,
-                           TGSI_FILE_TEMPORARY, pctx->texTemp,
-                           TGSI_WRITEMASK_XYZW,
-                           TGSI_FILE_INPUT, wincoordInput,
-                           TGSI_FILE_IMMEDIATE, pctx->numImmed);
-
-   /* TEX texTemp, texTemp, sampler; */
-   tgsi_transform_tex_2d_inst(ctx,
-                              TGSI_FILE_TEMPORARY, pctx->texTemp,
-                              TGSI_FILE_TEMPORARY, pctx->texTemp,
-                              pctx->freeSampler);
-
-   /* KILL_IF -texTemp.wwww;   # if -texTemp < 0, KILL fragment */
-   tgsi_transform_kill_inst(ctx,
-                            TGSI_FILE_TEMPORARY, pctx->texTemp,
-                            TGSI_SWIZZLE_W, TRUE);
-}
-
-
-
 /**
  * Generate the frag shader we'll use for doing polygon stipple.
  * This will be the user's shader prefixed with a TEX and KIL instruction.
@@ -293,40 +121,27 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
 static boolean
 generate_pstip_fs(struct pstip_stage *pstip)
 {
+   struct pipe_context *pipe = pstip->pipe;
+   struct pipe_screen *screen = pipe->screen;
    const struct pipe_shader_state *orig_fs = &pstip->fs->state;
    /*struct draw_context *draw = pstip->stage.draw;*/
    struct pipe_shader_state pstip_fs;
-   struct pstip_transform_context transform;
-   const uint newLen = tgsi_num_tokens(orig_fs->tokens) + NUM_NEW_TOKENS;
+   enum tgsi_file_type wincoord_file;
+
+   wincoord_file = screen->get_param(screen, PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL) ?
+                   TGSI_FILE_SYSTEM_VALUE : TGSI_FILE_INPUT;
 
    pstip_fs = *orig_fs; /* copy to init */
-   pstip_fs.tokens = tgsi_alloc_tokens(newLen);
+   pstip_fs.tokens = util_pstipple_create_fragment_shader(orig_fs->tokens,
+                                                          &pstip->fs->sampler_unit,
+                                                          0,
+                                                          wincoord_file);
    if (pstip_fs.tokens == NULL)
       return FALSE;
 
-   memset(&transform, 0, sizeof(transform));
-   transform.wincoordInput = -1;
-   transform.maxInput = -1;
-   transform.texTemp = -1;
-   transform.base.prolog = pstip_transform_prolog;
-   transform.base.transform_declaration = pstip_transform_decl;
-   transform.base.transform_immediate = pstip_transform_immed;
-
-   tgsi_transform_shader(orig_fs->tokens,
-                         (struct tgsi_token *) pstip_fs.tokens,
-                         newLen, &transform.base);
-
-#if 0 /* DEBUG */
-   tgsi_dump(orig_fs->tokens, 0);
-   tgsi_dump(pstip_fs.tokens, 0);
-#endif
-
-   assert(pstip->fs);
-
-   pstip->fs->sampler_unit = transform.freeSampler;
    assert(pstip->fs->sampler_unit < PIPE_MAX_SAMPLERS);
 
-   pstip->fs->pstip_fs = pstip->driver_create_fs_state(pstip->pipe, &pstip_fs);
+   pstip->fs->pstip_fs = pstip->driver_create_fs_state(pipe, &pstip_fs);
    
    FREE((void *)pstip_fs.tokens);
 
@@ -338,113 +153,6 @@ generate_pstip_fs(struct pstip_stage *pstip)
 
 
 /**
- * Load texture image with current stipple pattern.
- */
-static void
-pstip_update_texture(struct pstip_stage *pstip)
-{
-   static const uint bit31 = 1 << 31;
-   struct pipe_context *pipe = pstip->pipe;
-   struct pipe_transfer *transfer;
-   const uint *stipple = pstip->state.stipple->stipple;
-   uint i, j;
-   ubyte *data;
-
-   data = pipe_transfer_map(pipe, pstip->texture, 0, 0,
-                                PIPE_TRANSFER_WRITE, 0, 0, 32, 32, &transfer);
-
-   /*
-    * Load alpha texture.
-    * Note: 0 means keep the fragment, 255 means kill it.
-    * We'll negate the texel value and use KILL_IF which kills if value
-    * is negative.
-    */
-   for (i = 0; i < 32; i++) {
-      for (j = 0; j < 32; j++) {
-         if (stipple[i] & (bit31 >> j)) {
-            /* fragment "on" */
-            data[i * transfer->stride + j] = 0;
-         }
-         else {
-            /* fragment "off" */
-            data[i * transfer->stride + j] = 255;
-         }
-      }
-   }
-
-   /* unmap */
-   pipe_transfer_unmap(pipe, transfer);
-}
-
-
-/**
- * Create the texture map we'll use for stippling.
- */
-static boolean
-pstip_create_texture(struct pstip_stage *pstip)
-{
-   struct pipe_context *pipe = pstip->pipe;
-   struct pipe_screen *screen = pipe->screen;
-   struct pipe_resource texTemp;
-   struct pipe_sampler_view viewTempl;
-
-   memset(&texTemp, 0, sizeof(texTemp));
-   texTemp.target = PIPE_TEXTURE_2D;
-   texTemp.format = PIPE_FORMAT_A8_UNORM; /* XXX verify supported by driver! */
-   texTemp.last_level = 0;
-   texTemp.width0 = 32;
-   texTemp.height0 = 32;
-   texTemp.depth0 = 1;
-   texTemp.array_size = 1;
-   texTemp.bind = PIPE_BIND_SAMPLER_VIEW;
-
-   pstip->texture = screen->resource_create(screen, &texTemp);
-   if (pstip->texture == NULL)
-      return FALSE;
-
-   u_sampler_view_default_template(&viewTempl,
-                                   pstip->texture,
-                                   pstip->texture->format);
-   pstip->sampler_view = pipe->create_sampler_view(pipe,
-                                                   pstip->texture,
-                                                   &viewTempl);
-   if (!pstip->sampler_view) {
-      return FALSE;
-   }
-
-   return TRUE;
-}
-
-
-/**
- * Create the sampler CSO that'll be used for stippling.
- */
-static boolean
-pstip_create_sampler(struct pstip_stage *pstip)
-{
-   struct pipe_sampler_state sampler;
-   struct pipe_context *pipe = pstip->pipe;
-
-   memset(&sampler, 0, sizeof(sampler));
-   sampler.wrap_s = PIPE_TEX_WRAP_REPEAT;
-   sampler.wrap_t = PIPE_TEX_WRAP_REPEAT;
-   sampler.wrap_r = PIPE_TEX_WRAP_REPEAT;
-   sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
-   sampler.min_img_filter = PIPE_TEX_FILTER_NEAREST;
-   sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
-   sampler.normalized_coords = 1;
-   sampler.min_lod = 0.0f;
-   sampler.max_lod = 0.0f;
-
-   pstip->sampler_cso = pipe->create_sampler_state(pipe, &sampler);
-   if (pstip->sampler_cso == NULL)
-      return FALSE;
-   
-   return TRUE;
-}
-
-
-/**
  * When we're about to draw our first stipple polygon in a batch, this function
  * is called to tell the driver to bind our modified fragment shader.
  */
@@ -722,7 +430,8 @@ pstip_set_polygon_stipple(struct pipe_context *pipe,
    /* pass-through */
    pstip->driver_set_polygon_stipple(pstip->pipe, stipple);
 
-   pstip_update_texture(pstip);
+   util_pstipple_update_stipple_texture(pstip->pipe, pstip->texture,
+                                        pstip->state.stipple->stipple);
 }
 
 
@@ -758,10 +467,17 @@ draw_install_pstipple_stage(struct draw_context *draw,
    pstip->driver_set_polygon_stipple = pipe->set_polygon_stipple;
 
    /* create special texture, sampler state */
-   if (!pstip_create_texture(pstip))
+   pstip->texture = util_pstipple_create_stipple_texture(pipe, NULL);
+   if (!pstip->texture)
+      goto fail;
+
+   pstip->sampler_view = util_pstipple_create_sampler_view(pipe,
+                                                           pstip->texture);
+   if (!pstip->sampler_view)
       goto fail;
 
-   if (!pstip_create_sampler(pstip))
+   pstip->sampler_cso = util_pstipple_create_sampler(pipe);
+   if (!pstip->sampler_cso)
       goto fail;
 
    /* override the driver's functions */
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 3e7d69f73ed..61ff0a74379 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -614,8 +614,8 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
       }
 
       nir_ssa_def *offset;
-      if (dim) {
-         /* UBO loads don't have a const_index[0] base offset. */
+      if (op == nir_intrinsic_load_ubo) {
+         /* UBO loads don't have a base offset. */
          offset = nir_imm_int(b, index);
          if (indirect) {
             offset = nir_iadd(b, offset, ttn_src_for_indirect(c, indirect));
@@ -623,7 +623,7 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
          /* UBO offsets are in bytes, but TGSI gives them to us in vec4's */
          offset = nir_ishl(b, offset, nir_imm_int(b, 4));
       } else {
-         load->const_index[0] = index;
+         nir_intrinsic_set_base(load, index);
          if (indirect) {
             offset = ttn_src_for_indirect(c, indirect);
          } else {
@@ -1875,7 +1875,7 @@ ttn_emit_instruction(struct ttn_compile *c)
                                            &tgsi_dst->Indirect : NULL;
 
       store->num_components = 4;
-      store->const_index[0] = dest.write_mask;
+      nir_intrinsic_set_write_mask(store, dest.write_mask);
       store->variables[0] = ttn_array_deref(c, store, var, offset, indirect);
       store->src[0] = nir_src_for_reg(dest.dest.reg.reg);
 
@@ -1907,8 +1907,8 @@ ttn_add_output_stores(struct ttn_compile *c)
          store->num_components = 4;
          store->src[0].reg.reg = c->output_regs[loc].reg;
          store->src[0].reg.base_offset = c->output_regs[loc].offset;
-         store->const_index[0] = loc;
-         store->const_index[1] = 0xf;  /* writemask */
+         nir_intrinsic_set_base(store, loc);
+         nir_intrinsic_set_write_mask(store, 0xf);
          store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
          nir_builder_instr_insert(b, &store->instr);
       }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index 687fb54830d..489423d7f12 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -44,6 +44,387 @@
 
 
 
+/**
+ * Fold one instruction into info: opcode count, control-flow nesting depth,
+ * INTERP_* and double usage, input usage masks and indirect/MSAA tracking.
+ */
+static void
+scan_instruction(struct tgsi_shader_info *info,
+                 const struct tgsi_full_instruction *fullinst,
+                 unsigned *current_depth)
+{
+   const unsigned opcode = fullinst->Instruction.Opcode;
+   unsigned i;
+
+   assert(opcode < TGSI_OPCODE_LAST);
+   info->opcode_count[opcode]++;
+
+   switch (opcode) {
+   case TGSI_OPCODE_IF:
+   case TGSI_OPCODE_UIF:
+   case TGSI_OPCODE_BGNLOOP:
+      (*current_depth)++;
+      info->max_depth = MAX2(info->max_depth, *current_depth);
+      break;
+   case TGSI_OPCODE_ENDIF:
+   case TGSI_OPCODE_ENDLOOP:
+      (*current_depth)--;
+      break;
+   default:
+      break;
+   }
+
+   if (opcode == TGSI_OPCODE_INTERP_CENTROID ||
+       opcode == TGSI_OPCODE_INTERP_OFFSET ||
+       opcode == TGSI_OPCODE_INTERP_SAMPLE) {
+      const struct tgsi_full_src_register *interp_src = &fullinst->Src[0];
+      unsigned input_reg;
+
+      if (interp_src->Register.Indirect && interp_src->Indirect.ArrayID)
+         input_reg = info->input_array_first[interp_src->Indirect.ArrayID];
+      else
+         input_reg = interp_src->Register.Index;
+
+      /* The INTERP_* opcodes count as PERSPECTIVE interpolation unless
+       * the input they read was declared LINEAR.
+       */
+      switch (info->input_interpolate[input_reg]) {
+      case TGSI_INTERPOLATE_COLOR:
+      case TGSI_INTERPOLATE_CONSTANT:
+      case TGSI_INTERPOLATE_PERSPECTIVE:
+         switch (opcode) {
+         case TGSI_OPCODE_INTERP_CENTROID:
+            info->uses_persp_opcode_interp_centroid = TRUE;
+            break;
+         case TGSI_OPCODE_INTERP_OFFSET:
+            info->uses_persp_opcode_interp_offset = TRUE;
+            break;
+         case TGSI_OPCODE_INTERP_SAMPLE:
+            info->uses_persp_opcode_interp_sample = TRUE;
+            break;
+         }
+         break;
+
+      case TGSI_INTERPOLATE_LINEAR:
+         switch (opcode) {
+         case TGSI_OPCODE_INTERP_CENTROID:
+            info->uses_linear_opcode_interp_centroid = TRUE;
+            break;
+         case TGSI_OPCODE_INTERP_OFFSET:
+            info->uses_linear_opcode_interp_offset = TRUE;
+            break;
+         case TGSI_OPCODE_INTERP_SAMPLE:
+            info->uses_linear_opcode_interp_sample = TRUE;
+            break;
+         }
+         break;
+      }
+   }
+
+   if (opcode >= TGSI_OPCODE_F2D && opcode <= TGSI_OPCODE_DSSG)
+      info->uses_doubles = TRUE;
+
+   for (i = 0; i < fullinst->Instruction.NumSrcRegs; i++) {
+      const struct tgsi_full_src_register *src = &fullinst->Src[i];
+      int reg = src->Register.Index;
+
+      /* Record which input components this operand actually reads */
+      if (src->Register.File == TGSI_FILE_INPUT) {
+         const unsigned used = tgsi_util_get_inst_usage_mask(fullinst, i);
+         if (src->Register.Indirect) {
+            for (reg = 0; reg < info->num_inputs; ++reg)
+               info->input_usage_mask[reg] |= used;
+         } else {
+            assert(reg >= 0);
+            assert(reg < PIPE_MAX_SHADER_INPUTS);
+            info->input_usage_mask[reg] |= used;
+         }
+
+         if (info->processor == TGSI_PROCESSOR_FRAGMENT &&
+             !src->Register.Indirect) {
+            unsigned sem = info->input_semantic_name[src->Register.Index];
+            unsigned sem_idx = info->input_semantic_index[src->Register.Index];
+
+            if (sem == TGSI_SEMANTIC_POSITION &&
+                (src->Register.SwizzleX == TGSI_SWIZZLE_Z ||
+                 src->Register.SwizzleY == TGSI_SWIZZLE_Z ||
+                 src->Register.SwizzleZ == TGSI_SWIZZLE_Z ||
+                 src->Register.SwizzleW == TGSI_SWIZZLE_Z))
+               info->reads_z = TRUE;
+
+            if (sem == TGSI_SEMANTIC_COLOR) {
+               unsigned swz_mask =
+                  (1 << src->Register.SwizzleX) |
+                  (1 << src->Register.SwizzleY) |
+                  (1 << src->Register.SwizzleZ) |
+                  (1 << src->Register.SwizzleW);
+
+               info->colors_read |= swz_mask << (sem_idx * 4);
+            }
+         }
+      }
+
+      /* Remember every register file that is read through an index */
+      if (src->Register.Indirect) {
+         info->indirect_files |= (1 << src->Register.File);
+         info->indirect_files_read |= (1 << src->Register.File);
+      }
+
+      /* Flag samplers that are used with a multisampled texture target */
+      if (src->Register.File == TGSI_FILE_SAMPLER) {
+         assert(fullinst->Instruction.Texture);
+         assert(src->Register.Index < Elements(info->is_msaa_sampler));
+
+         if (fullinst->Instruction.Texture &&
+             (fullinst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
+              fullinst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA)) {
+            info->is_msaa_sampler[src->Register.Index] = TRUE;
+         }
+      }
+   }
+
+   /* Remember every register file that is written through an index */
+   for (i = 0; i < fullinst->Instruction.NumDstRegs; i++) {
+      const struct tgsi_full_dst_register *dst = &fullinst->Dst[i];
+      if (dst->Register.Indirect) {
+         info->indirect_files |= (1 << dst->Register.File);
+         info->indirect_files_written |= (1 << dst->Register.File);
+      }
+   }
+
+   info->num_instructions++;
+}
+     
+
+/* Fold one declaration into info: array ranges, per-file register stats and
+ * the semantic-derived flags for inputs, system values, outputs, samplers. */
+static void
+scan_declaration(struct tgsi_shader_info *info,
+                 const struct tgsi_full_declaration *fulldecl)
+{
+   const uint file = fulldecl->Declaration.File;
+   const unsigned procType = info->processor;
+   uint reg;
+
+   if (fulldecl->Declaration.Array) {
+      unsigned array_id = fulldecl->Array.ArrayID;
+
+      switch (file) {
+      case TGSI_FILE_INPUT:
+         assert(array_id < ARRAY_SIZE(info->input_array_first));
+         info->input_array_first[array_id] = fulldecl->Range.First;
+         info->input_array_last[array_id] = fulldecl->Range.Last;
+         break;
+      case TGSI_FILE_OUTPUT:
+         assert(array_id < ARRAY_SIZE(info->output_array_first));
+         info->output_array_first[array_id] = fulldecl->Range.First;
+         info->output_array_last[array_id] = fulldecl->Range.Last;
+         break;
+      }
+      info->array_max[file] = MAX2(info->array_max[file], array_id);
+   }
+
+   for (reg = fulldecl->Range.First; reg <= fulldecl->Range.Last; reg++) {
+      unsigned semName = fulldecl->Semantic.Name;
+      unsigned semIndex = fulldecl->Semantic.Index +
+         (reg - fulldecl->Range.First);
+
+      /* file_mask has 32 bits: wrap reg so the shift is never undefined */
+      info->file_mask[file] |= 1u << (reg & 31);
+      info->file_count[file]++;
+      info->file_max[file] = MAX2(info->file_max[file], (int)reg);
+
+      if (file == TGSI_FILE_CONSTANT) {
+         int buffer = 0;
+
+         if (fulldecl->Declaration.Dimension)
+            buffer = fulldecl->Dim.Index2D;
+
+         info->const_file_max[buffer] =
+            MAX2(info->const_file_max[buffer], (int)reg);
+      }
+      else if (file == TGSI_FILE_INPUT) {
+         info->input_semantic_name[reg] = (ubyte) semName;
+         info->input_semantic_index[reg] = (ubyte) semIndex;
+         info->input_interpolate[reg] = (ubyte)fulldecl->Interp.Interpolate;
+         info->input_interpolate_loc[reg] = (ubyte)fulldecl->Interp.Location;
+         info->input_cylindrical_wrap[reg] = (ubyte)fulldecl->Interp.CylindricalWrap;
+         info->num_inputs++;
+
+         /* Only interpolated varyings. Don't include POSITION.
+          * Don't include integer varyings, because they are not
+          * interpolated.
+          */
+         if (semName == TGSI_SEMANTIC_GENERIC ||
+             semName == TGSI_SEMANTIC_TEXCOORD ||
+             semName == TGSI_SEMANTIC_COLOR ||
+             semName == TGSI_SEMANTIC_BCOLOR ||
+             semName == TGSI_SEMANTIC_FOG ||
+             semName == TGSI_SEMANTIC_CLIPDIST ||
+             semName == TGSI_SEMANTIC_CULLDIST) {
+            switch (fulldecl->Interp.Interpolate) {
+            case TGSI_INTERPOLATE_COLOR:
+            case TGSI_INTERPOLATE_PERSPECTIVE:
+               switch (fulldecl->Interp.Location) {
+               case TGSI_INTERPOLATE_LOC_CENTER:
+                  info->uses_persp_center = TRUE;
+                  break;
+               case TGSI_INTERPOLATE_LOC_CENTROID:
+                  info->uses_persp_centroid = TRUE;
+                  break;
+               case TGSI_INTERPOLATE_LOC_SAMPLE:
+                  info->uses_persp_sample = TRUE;
+                  break;
+               }
+               break;
+            case TGSI_INTERPOLATE_LINEAR:
+               switch (fulldecl->Interp.Location) {
+               case TGSI_INTERPOLATE_LOC_CENTER:
+                  info->uses_linear_center = TRUE;
+                  break;
+               case TGSI_INTERPOLATE_LOC_CENTROID:
+                  info->uses_linear_centroid = TRUE;
+                  break;
+               case TGSI_INTERPOLATE_LOC_SAMPLE:
+                  info->uses_linear_sample = TRUE;
+                  break;
+               }
+               break;
+               /* TGSI_INTERPOLATE_CONSTANT doesn't do any interpolation. */
+            }
+         }
+
+         if (semName == TGSI_SEMANTIC_PRIMID)
+            info->uses_primid = TRUE;
+         else if (procType == TGSI_PROCESSOR_FRAGMENT) {
+            if (semName == TGSI_SEMANTIC_POSITION)
+               info->reads_position = TRUE;
+            else if (semName == TGSI_SEMANTIC_FACE)
+               info->uses_frontface = TRUE;
+         }
+      }
+      else if (file == TGSI_FILE_SYSTEM_VALUE) {
+         unsigned index = fulldecl->Range.First;   /* not reg -- TODO confirm */
+
+         info->system_value_semantic_name[index] = semName;
+         info->num_system_values = MAX2(info->num_system_values, index + 1);
+
+         switch (semName) {
+         case TGSI_SEMANTIC_INSTANCEID:
+            info->uses_instanceid = TRUE;
+            break;
+         case TGSI_SEMANTIC_VERTEXID:
+            info->uses_vertexid = TRUE;
+            break;
+         case TGSI_SEMANTIC_VERTEXID_NOBASE:
+            info->uses_vertexid_nobase = TRUE;
+            break;
+         case TGSI_SEMANTIC_BASEVERTEX:
+            info->uses_basevertex = TRUE;
+            break;
+         case TGSI_SEMANTIC_PRIMID:
+            info->uses_primid = TRUE;
+            break;
+         case TGSI_SEMANTIC_INVOCATIONID:
+            info->uses_invocationid = TRUE;
+            break;
+         case TGSI_SEMANTIC_POSITION:
+            info->reads_position = TRUE;
+            break;
+         case TGSI_SEMANTIC_FACE:
+            info->uses_frontface = TRUE;
+            break;
+         case TGSI_SEMANTIC_SAMPLEMASK:
+            info->reads_samplemask = TRUE;
+            break;
+         }
+      }
+      else if (file == TGSI_FILE_OUTPUT) {
+         info->output_semantic_name[reg] = (ubyte) semName;
+         info->output_semantic_index[reg] = (ubyte) semIndex;
+         info->num_outputs++;
+
+         if (semName == TGSI_SEMANTIC_COLOR)
+            info->colors_written |= 1 << semIndex;
+
+         if (procType == TGSI_PROCESSOR_VERTEX ||
+             procType == TGSI_PROCESSOR_GEOMETRY ||
+             procType == TGSI_PROCESSOR_TESS_CTRL ||
+             procType == TGSI_PROCESSOR_TESS_EVAL) {
+            switch (semName) {
+            case TGSI_SEMANTIC_VIEWPORT_INDEX:
+               info->writes_viewport_index = TRUE;
+               break;
+            case TGSI_SEMANTIC_LAYER:
+               info->writes_layer = TRUE;
+               break;
+            case TGSI_SEMANTIC_PSIZE:
+               info->writes_psize = TRUE;
+               break;
+            case TGSI_SEMANTIC_CLIPVERTEX:
+               info->writes_clipvertex = TRUE;
+               break;
+            }
+         }
+
+         if (procType == TGSI_PROCESSOR_FRAGMENT) {
+            switch (semName) {
+            case TGSI_SEMANTIC_POSITION:
+               info->writes_z = TRUE;
+               break;
+            case TGSI_SEMANTIC_STENCIL:
+               info->writes_stencil = TRUE;
+               break;
+            case TGSI_SEMANTIC_SAMPLEMASK:
+               info->writes_samplemask = TRUE;
+               break;
+            }
+         }
+
+         if (procType == TGSI_PROCESSOR_VERTEX &&
+             semName == TGSI_SEMANTIC_EDGEFLAG)
+            info->writes_edgeflag = TRUE;
+      } else if (file == TGSI_FILE_SAMPLER) {
+         info->samplers_declared |= 1u << reg;  /* unsigned: bit 31 is valid */
+      }
+   }
+}
+
+
+/* Count one immediate; file_mask is 32 bits, so the bit index wraps (& 31). */
+static void
+scan_immediate(struct tgsi_shader_info *info)
+{
+   const uint file = TGSI_FILE_IMMEDIATE;
+   const uint reg = info->immediate_count++;   /* index of this immediate */
+   info->file_mask[file] |= 1u << (reg & 31);  /* keeps the shift defined */
+   info->file_count[file]++;
+   info->file_max[file] = MAX2(info->file_max[file], (int)reg);
+}
+
+
+/* Store the property's first data word in info->properties[]; the clip and
+ * cull distance properties also update their count and writemask fields.
+ */
+static void
+scan_property(struct tgsi_shader_info *info,
+              const struct tgsi_full_property *fullprop)
+{
+   const unsigned prop = fullprop->Property.PropertyName;
+   const unsigned data = fullprop->u[0].Data;
+
+   assert(prop < Elements(info->properties));
+   info->properties[prop] = data;
+
+   if (prop == TGSI_PROPERTY_NUM_CLIPDIST_ENABLED) {
+      info->num_written_clipdistance = data;
+      info->clipdist_writemask |= (1 << data) - 1;
+   } else if (prop == TGSI_PROPERTY_NUM_CULLDIST_ENABLED) {
+      info->num_written_culldistance = data;
+      info->culldist_writemask |= (1 << data) - 1;
+   }
+}
+
 
 /**
  * Scan the given TGSI shader to collect information such as number of
@@ -81,390 +462,30 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
           procType == TGSI_PROCESSOR_COMPUTE);
    info->processor = procType;
 
-
    /**
     ** Loop over incoming program tokens/instructions
     */
-   while( !tgsi_parse_end_of_tokens( &parse ) ) {
-
+   while (!tgsi_parse_end_of_tokens(&parse)) {
       info->num_tokens++;
 
       tgsi_parse_token( &parse );
 
       switch( parse.FullToken.Token.Type ) {
       case TGSI_TOKEN_TYPE_INSTRUCTION:
-         {
-            const struct tgsi_full_instruction *fullinst
-               = &parse.FullToken.FullInstruction;
-            uint i;
-
-            assert(fullinst->Instruction.Opcode < TGSI_OPCODE_LAST);
-            info->opcode_count[fullinst->Instruction.Opcode]++;
-
-            switch (fullinst->Instruction.Opcode) {
-            case TGSI_OPCODE_IF:
-            case TGSI_OPCODE_UIF:
-            case TGSI_OPCODE_BGNLOOP:
-               current_depth++;
-               info->max_depth = MAX2(info->max_depth, current_depth);
-               break;
-            case TGSI_OPCODE_ENDIF:
-            case TGSI_OPCODE_ENDLOOP:
-               current_depth--;
-               break;
-            default:
-               break;
-            }
-
-            if (fullinst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID ||
-                fullinst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
-                fullinst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
-               const struct tgsi_full_src_register *src0 = &fullinst->Src[0];
-               unsigned input;
-
-               if (src0->Register.Indirect && src0->Indirect.ArrayID)
-                  input = info->input_array_first[src0->Indirect.ArrayID];
-               else
-                  input = src0->Register.Index;
-
-               /* For the INTERP opcodes, the interpolation is always
-                * PERSPECTIVE unless LINEAR is specified.
-                */
-               switch (info->input_interpolate[input]) {
-               case TGSI_INTERPOLATE_COLOR:
-               case TGSI_INTERPOLATE_CONSTANT:
-               case TGSI_INTERPOLATE_PERSPECTIVE:
-                  switch (fullinst->Instruction.Opcode) {
-                  case TGSI_OPCODE_INTERP_CENTROID:
-                     info->uses_persp_opcode_interp_centroid = true;
-                     break;
-                  case TGSI_OPCODE_INTERP_OFFSET:
-                     info->uses_persp_opcode_interp_offset = true;
-                     break;
-                  case TGSI_OPCODE_INTERP_SAMPLE:
-                     info->uses_persp_opcode_interp_sample = true;
-                     break;
-                  }
-                  break;
-
-               case TGSI_INTERPOLATE_LINEAR:
-                  switch (fullinst->Instruction.Opcode) {
-                  case TGSI_OPCODE_INTERP_CENTROID:
-                     info->uses_linear_opcode_interp_centroid = true;
-                     break;
-                  case TGSI_OPCODE_INTERP_OFFSET:
-                     info->uses_linear_opcode_interp_offset = true;
-                     break;
-                  case TGSI_OPCODE_INTERP_SAMPLE:
-                     info->uses_linear_opcode_interp_sample = true;
-                     break;
-                  }
-                  break;
-               }
-            }
-
-            if (fullinst->Instruction.Opcode >= TGSI_OPCODE_F2D &&
-                fullinst->Instruction.Opcode <= TGSI_OPCODE_DSSG)
-               info->uses_doubles = true;
-
-            for (i = 0; i < fullinst->Instruction.NumSrcRegs; i++) {
-               const struct tgsi_full_src_register *src =
-                  &fullinst->Src[i];
-               int ind = src->Register.Index;
-
-               /* Mark which inputs are effectively used */
-               if (src->Register.File == TGSI_FILE_INPUT) {
-                  unsigned usage_mask;
-                  usage_mask = tgsi_util_get_inst_usage_mask(fullinst, i);
-                  if (src->Register.Indirect) {
-                     for (ind = 0; ind < info->num_inputs; ++ind) {
-                        info->input_usage_mask[ind] |= usage_mask;
-                     }
-                  } else {
-                     assert(ind >= 0);
-                     assert(ind < PIPE_MAX_SHADER_INPUTS);
-                     info->input_usage_mask[ind] |= usage_mask;
-                  }
-
-                  if (procType == TGSI_PROCESSOR_FRAGMENT &&
-                      !src->Register.Indirect) {
-                     unsigned name =
-                        info->input_semantic_name[src->Register.Index];
-                     unsigned index =
-                        info->input_semantic_index[src->Register.Index];
-
-                     if (name == TGSI_SEMANTIC_POSITION &&
-                         (src->Register.SwizzleX == TGSI_SWIZZLE_Z ||
-                          src->Register.SwizzleY == TGSI_SWIZZLE_Z ||
-                          src->Register.SwizzleZ == TGSI_SWIZZLE_Z ||
-                          src->Register.SwizzleW == TGSI_SWIZZLE_Z))
-                        info->reads_z = TRUE;
-
-                     if (name == TGSI_SEMANTIC_COLOR) {
-                        unsigned mask =
-                              (1 << src->Register.SwizzleX) |
-                              (1 << src->Register.SwizzleY) |
-                              (1 << src->Register.SwizzleZ) |
-                              (1 << src->Register.SwizzleW);
-
-                        info->colors_read |= mask << (index * 4);
-                     }
-                  }
-               }
-
-               /* check for indirect register reads */
-               if (src->Register.Indirect) {
-                  info->indirect_files |= (1 << src->Register.File);
-                  info->indirect_files_read |= (1 << src->Register.File);
-               }
-
-               /* MSAA samplers */
-               if (src->Register.File == TGSI_FILE_SAMPLER) {
-                  assert(fullinst->Instruction.Texture);
-                  assert(src->Register.Index < Elements(info->is_msaa_sampler));
-
-                  if (fullinst->Instruction.Texture &&
-                      (fullinst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
-                       fullinst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA)) {
-                     info->is_msaa_sampler[src->Register.Index] = TRUE;
-                  }
-               }
-            }
-
-            /* check for indirect register writes */
-            for (i = 0; i < fullinst->Instruction.NumDstRegs; i++) {
-               const struct tgsi_full_dst_register *dst = &fullinst->Dst[i];
-               if (dst->Register.Indirect) {
-                  info->indirect_files |= (1 << dst->Register.File);
-                  info->indirect_files_written |= (1 << dst->Register.File);
-               }
-            }
-
-            info->num_instructions++;
-         }
+         scan_instruction(info, &parse.FullToken.FullInstruction,
+                          &current_depth);
          break;
-
       case TGSI_TOKEN_TYPE_DECLARATION:
-         {
-            const struct tgsi_full_declaration *fulldecl
-               = &parse.FullToken.FullDeclaration;
-            const uint file = fulldecl->Declaration.File;
-            uint reg;
-
-            if (fulldecl->Declaration.Array) {
-               unsigned array_id = fulldecl->Array.ArrayID;
-
-               switch (file) {
-               case TGSI_FILE_INPUT:
-                  assert(array_id < ARRAY_SIZE(info->input_array_first));
-                  info->input_array_first[array_id] = fulldecl->Range.First;
-                  info->input_array_last[array_id] = fulldecl->Range.Last;
-                  break;
-               case TGSI_FILE_OUTPUT:
-                  assert(array_id < ARRAY_SIZE(info->output_array_first));
-                  info->output_array_first[array_id] = fulldecl->Range.First;
-                  info->output_array_last[array_id] = fulldecl->Range.Last;
-                  break;
-               }
-               info->array_max[file] = MAX2(info->array_max[file], array_id);
-            }
-
-            for (reg = fulldecl->Range.First;
-                 reg <= fulldecl->Range.Last;
-                 reg++) {
-               unsigned semName = fulldecl->Semantic.Name;
-               unsigned semIndex =
-                  fulldecl->Semantic.Index + (reg - fulldecl->Range.First);
-
-               /* only first 32 regs will appear in this bitfield */
-               info->file_mask[file] |= (1 << reg);
-               info->file_count[file]++;
-               info->file_max[file] = MAX2(info->file_max[file], (int)reg);
-
-               if (file == TGSI_FILE_CONSTANT) {
-                  int buffer = 0;
-
-                  if (fulldecl->Declaration.Dimension)
-                     buffer = fulldecl->Dim.Index2D;
-
-                  info->const_file_max[buffer] =
-                        MAX2(info->const_file_max[buffer], (int)reg);
-               }
-               else if (file == TGSI_FILE_INPUT) {
-                  info->input_semantic_name[reg] = (ubyte) semName;
-                  info->input_semantic_index[reg] = (ubyte) semIndex;
-                  info->input_interpolate[reg] = (ubyte)fulldecl->Interp.Interpolate;
-                  info->input_interpolate_loc[reg] = (ubyte)fulldecl->Interp.Location;
-                  info->input_cylindrical_wrap[reg] = (ubyte)fulldecl->Interp.CylindricalWrap;
-                  info->num_inputs++;
-
-                  /* Only interpolated varyings. Don't include POSITION.
-                   * Don't include integer varyings, because they are not
-                   * interpolated.
-                   */
-                  if (semName == TGSI_SEMANTIC_GENERIC ||
-                      semName == TGSI_SEMANTIC_TEXCOORD ||
-                      semName == TGSI_SEMANTIC_COLOR ||
-                      semName == TGSI_SEMANTIC_BCOLOR ||
-                      semName == TGSI_SEMANTIC_FOG ||
-                      semName == TGSI_SEMANTIC_CLIPDIST ||
-                      semName == TGSI_SEMANTIC_CULLDIST) {
-                     switch (fulldecl->Interp.Interpolate) {
-                     case TGSI_INTERPOLATE_COLOR:
-                     case TGSI_INTERPOLATE_PERSPECTIVE:
-                        switch (fulldecl->Interp.Location) {
-                        case TGSI_INTERPOLATE_LOC_CENTER:
-                           info->uses_persp_center = true;
-                           break;
-                        case TGSI_INTERPOLATE_LOC_CENTROID:
-                           info->uses_persp_centroid = true;
-                           break;
-                        case TGSI_INTERPOLATE_LOC_SAMPLE:
-                           info->uses_persp_sample = true;
-                           break;
-                        }
-                        break;
-                     case TGSI_INTERPOLATE_LINEAR:
-                        switch (fulldecl->Interp.Location) {
-                        case TGSI_INTERPOLATE_LOC_CENTER:
-                           info->uses_linear_center = true;
-                           break;
-                        case TGSI_INTERPOLATE_LOC_CENTROID:
-                           info->uses_linear_centroid = true;
-                           break;
-                        case TGSI_INTERPOLATE_LOC_SAMPLE:
-                           info->uses_linear_sample = true;
-                           break;
-                        }
-                        break;
-                     /* TGSI_INTERPOLATE_CONSTANT doesn't do any interpolation. */
-                     }
-                  }
-
-                  if (semName == TGSI_SEMANTIC_PRIMID)
-                     info->uses_primid = TRUE;
-                  else if (procType == TGSI_PROCESSOR_FRAGMENT) {
-                     if (semName == TGSI_SEMANTIC_POSITION)
-                        info->reads_position = TRUE;
-                     else if (semName == TGSI_SEMANTIC_FACE)
-                        info->uses_frontface = TRUE;
-                  }
-               }
-               else if (file == TGSI_FILE_SYSTEM_VALUE) {
-                  unsigned index = fulldecl->Range.First;
-
-                  info->system_value_semantic_name[index] = semName;
-                  info->num_system_values = MAX2(info->num_system_values,
-                                                 index + 1);
-
-                  if (semName == TGSI_SEMANTIC_INSTANCEID) {
-                     info->uses_instanceid = TRUE;
-                  }
-                  else if (semName == TGSI_SEMANTIC_VERTEXID) {
-                     info->uses_vertexid = TRUE;
-                  }
-                  else if (semName == TGSI_SEMANTIC_VERTEXID_NOBASE) {
-                     info->uses_vertexid_nobase = TRUE;
-                  }
-                  else if (semName == TGSI_SEMANTIC_BASEVERTEX) {
-                     info->uses_basevertex = TRUE;
-                  }
-                  else if (semName == TGSI_SEMANTIC_PRIMID) {
-                     info->uses_primid = TRUE;
-                  } else if (semName == TGSI_SEMANTIC_INVOCATIONID) {
-                     info->uses_invocationid = TRUE;
-                  } else if (semName == TGSI_SEMANTIC_POSITION)
-                     info->reads_position = TRUE;
-                  else if (semName == TGSI_SEMANTIC_FACE)
-                     info->uses_frontface = TRUE;
-                  else if (semName == TGSI_SEMANTIC_SAMPLEMASK)
-                     info->reads_samplemask = TRUE;
-               }
-               else if (file == TGSI_FILE_OUTPUT) {
-                  info->output_semantic_name[reg] = (ubyte) semName;
-                  info->output_semantic_index[reg] = (ubyte) semIndex;
-                  info->num_outputs++;
-
-                  if (semName == TGSI_SEMANTIC_COLOR)
-                     info->colors_written |= 1 << semIndex;
-
-                  if (procType == TGSI_PROCESSOR_VERTEX ||
-                      procType == TGSI_PROCESSOR_GEOMETRY ||
-                      procType == TGSI_PROCESSOR_TESS_CTRL ||
-                      procType == TGSI_PROCESSOR_TESS_EVAL) {
-                     if (semName == TGSI_SEMANTIC_VIEWPORT_INDEX) {
-                        info->writes_viewport_index = TRUE;
-                     }
-                     else if (semName == TGSI_SEMANTIC_LAYER) {
-                        info->writes_layer = TRUE;
-                     }
-                     else if (semName == TGSI_SEMANTIC_PSIZE) {
-                        info->writes_psize = TRUE;
-                     }
-                     else if (semName == TGSI_SEMANTIC_CLIPVERTEX) {
-                        info->writes_clipvertex = TRUE;
-                     }
-                  }
-
-                  if (procType == TGSI_PROCESSOR_FRAGMENT) {
-                     if (semName == TGSI_SEMANTIC_POSITION) {
-                        info->writes_z = TRUE;
-                     }
-                     else if (semName == TGSI_SEMANTIC_STENCIL) {
-                        info->writes_stencil = TRUE;
-                     } else if (semName == TGSI_SEMANTIC_SAMPLEMASK) {
-                        info->writes_samplemask = TRUE;
-                     }
-                  }
-
-                  if (procType == TGSI_PROCESSOR_VERTEX) {
-                     if (semName == TGSI_SEMANTIC_EDGEFLAG) {
-                        info->writes_edgeflag = TRUE;
-                     }
-                  }
-               } else if (file == TGSI_FILE_SAMPLER) {
-                  info->samplers_declared |= 1 << reg;
-               }
-            }
-         }
+         scan_declaration(info, &parse.FullToken.FullDeclaration);
          break;
-
       case TGSI_TOKEN_TYPE_IMMEDIATE:
-         {
-            uint reg = info->immediate_count++;
-            uint file = TGSI_FILE_IMMEDIATE;
-
-            info->file_mask[file] |= (1 << reg);
-            info->file_count[file]++;
-            info->file_max[file] = MAX2(info->file_max[file], (int)reg);
-         }
+         scan_immediate(info);
          break;
-
       case TGSI_TOKEN_TYPE_PROPERTY:
-         {
-            const struct tgsi_full_property *fullprop
-               = &parse.FullToken.FullProperty;
-            unsigned name = fullprop->Property.PropertyName;
-            unsigned value = fullprop->u[0].Data;
-
-            assert(name < Elements(info->properties));
-            info->properties[name] = value;
-
-            switch (name) {
-            case TGSI_PROPERTY_NUM_CLIPDIST_ENABLED:
-               info->num_written_clipdistance = value;
-               info->clipdist_writemask |= (1 << value) - 1;
-               break;
-            case TGSI_PROPERTY_NUM_CULLDIST_ENABLED:
-               info->num_written_culldistance = value;
-               info->culldist_writemask |= (1 << value) - 1;
-               break;
-            }
-         }
+         scan_property(info, &parse.FullToken.FullProperty);
          break;
-
       default:
-         assert( 0 );
+         assert(!"Unexpected TGSI token type");
       }
    }
 
@@ -487,7 +508,7 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
       }
    }
 
-   tgsi_parse_free (&parse);
+   tgsi_parse_free(&parse);
 }
 
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.h b/src/gallium/auxiliary/tgsi/tgsi_transform.h
index 3bd512b6f3e..27e6179c9ee 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_transform.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.h
@@ -192,7 +192,7 @@ tgsi_transform_sampler_view_decl(struct tgsi_transform_context *ctx,
 
    decl = tgsi_default_full_declaration();
    decl.Declaration.File = TGSI_FILE_SAMPLER_VIEW;
-   decl.Declaration.UsageMask = 0xf;
+   decl.Declaration.UsageMask = TGSI_WRITEMASK_XYZW;
    decl.Range.First =
    decl.Range.Last = index;
    decl.SamplerView.Resource = target;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index d6811501d16..9654ac52bf2 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -1593,7 +1593,7 @@ emit_decl_sampler_view(struct ureg_program *ureg,
    out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
    out[0].decl.NrTokens = 3;
    out[0].decl.File = TGSI_FILE_SAMPLER_VIEW;
-   out[0].decl.UsageMask = 0xf;
+   out[0].decl.UsageMask = TGSI_WRITEMASK_XYZW;
 
    out[1].value = 0;
    out[1].decl_range.First = index;
@@ -1621,7 +1621,7 @@ emit_decl_image(struct ureg_program *ureg,
    out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
    out[0].decl.NrTokens = 3;
    out[0].decl.File = TGSI_FILE_IMAGE;
-   out[0].decl.UsageMask = 0xf;
+   out[0].decl.UsageMask = TGSI_WRITEMASK_XYZW;
 
    out[1].value = 0;
    out[1].decl_range.First = index;
@@ -1645,7 +1645,7 @@ emit_decl_buffer(struct ureg_program *ureg,
    out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
    out[0].decl.NrTokens = 2;
    out[0].decl.File = TGSI_FILE_BUFFER;
-   out[0].decl.UsageMask = 0xf;
+   out[0].decl.UsageMask = TGSI_WRITEMASK_XYZW;
    out[0].decl.Atomic = atomic;
 
    out[1].value = 0;
diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index 2b605594a2e..db6635713e5 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -1,9 +1,9 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2008 VMware, Inc.
  * Copyright (c) 2008 VMware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -11,11 +11,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -23,24 +23,22 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 
-#include "pipe/p_config.h" 
+#include "pipe/p_config.h"
 
 #include "pipe/p_compiler.h"
-#include "util/u_debug.h" 
-#include "pipe/p_format.h" 
-#include "pipe/p_state.h" 
-#include "util/u_inlines.h" 
+#include "util/u_debug.h"
+#include "pipe/p_format.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
 #include "util/u_format.h"
-#include "util/u_memory.h" 
-#include "util/u_string.h" 
-#include "util/u_math.h" 
-#include "util/u_tile.h" 
+#include "util/u_memory.h"
+#include "util/u_string.h"
+#include "util/u_math.h"
 #include "util/u_prim.h"
-#include "util/u_surface.h"
 #include <inttypes.h>
 
 #include <stdio.h>
@@ -53,14 +51,15 @@
 #endif
 
 
-void _debug_vprintf(const char *format, va_list ap)
+void
+_debug_vprintf(const char *format, va_list ap)
 {
    static char buf[4096] = {'\0'};
 #if defined(PIPE_OS_WINDOWS) || defined(PIPE_SUBSYSTEM_EMBEDDED)
    /* We buffer until we find a newline. */
    size_t len = strlen(buf);
    int ret = util_vsnprintf(buf + len, sizeof(buf) - len, format, ap);
-   if(ret > (int)(sizeof(buf) - len - 1) || util_strchr(buf + len, '\n')) {
+   if (ret > (int)(sizeof(buf) - len - 1) || util_strchr(buf + len, '\n')) {
       os_log_message(buf);
       buf[0] = '\0';
    }
@@ -70,12 +69,12 @@ void _debug_vprintf(const char *format, va_list ap)
 #endif
 }
 
+
 void
-_pipe_debug_message(
-   struct pipe_debug_callback *cb,
-   unsigned *id,
-   enum pipe_debug_type type,
-   const char *fmt, ...)
+_pipe_debug_message(struct pipe_debug_callback *cb,
+                    unsigned *id,
+                    enum pipe_debug_type type,
+                    const char *fmt, ...)
 {
    va_list args;
    va_start(args, fmt);
@@ -112,9 +111,8 @@ debug_disable_error_message_boxes(void)
 
 
 #ifdef DEBUG
-void debug_print_blob( const char *name,
-                       const void *blob,
-                       unsigned size )
+void
+debug_print_blob(const char *name, const void *blob, unsigned size)
 {
    const unsigned *ublob = (const unsigned *)blob;
    unsigned i;
@@ -147,6 +145,7 @@ debug_get_option_should_print(void)
    return value;
 }
 
+
 const char *
 debug_get_option(const char *name, const char *dfault)
 {
@@ -157,39 +156,42 @@ debug_get_option(const char *name, const char *dfault)
       result = dfault;
 
    if (debug_get_option_should_print())
-      debug_printf("%s: %s = %s\n", __FUNCTION__, name, result ? result : "(null)");
-   
+      debug_printf("%s: %s = %s\n", __FUNCTION__, name,
+                   result ? result : "(null)");
+
    return result;
 }
 
+
 boolean
 debug_get_bool_option(const char *name, boolean dfault)
 {
    const char *str = os_get_option(name);
    boolean result;
-   
-   if(str == NULL)
+
+   if (str == NULL)
       result = dfault;
-   else if(!util_strcmp(str, "n"))
+   else if (!util_strcmp(str, "n"))
       result = FALSE;
-   else if(!util_strcmp(str, "no"))
+   else if (!util_strcmp(str, "no"))
       result = FALSE;
-   else if(!util_strcmp(str, "0"))
+   else if (!util_strcmp(str, "0"))
       result = FALSE;
-   else if(!util_strcmp(str, "f"))
+   else if (!util_strcmp(str, "f"))
       result = FALSE;
-   else if(!util_strcmp(str, "F"))
+   else if (!util_strcmp(str, "F"))
       result = FALSE;
-   else if(!util_strcmp(str, "false"))
+   else if (!util_strcmp(str, "false"))
       result = FALSE;
-   else if(!util_strcmp(str, "FALSE"))
+   else if (!util_strcmp(str, "FALSE"))
       result = FALSE;
    else
       result = TRUE;
 
    if (debug_get_option_should_print())
-      debug_printf("%s: %s = %s\n", __FUNCTION__, name, result ? "TRUE" : "FALSE");
-   
+      debug_printf("%s: %s = %s\n", __FUNCTION__, name,
+                   result ? "TRUE" : "FALSE");
+
    return result;
 }
 
@@ -199,23 +201,23 @@ debug_get_num_option(const char *name, long dfault)
 {
    long result;
    const char *str;
-   
+
    str = os_get_option(name);
-   if(!str)
+   if (!str)
       result = dfault;
    else {
       long sign;
       char c;
       c = *str++;
-      if(c == '-') {
+      if (c == '-') {
 	 sign = -1;
 	 c = *str++;
-      } 
+      }
       else {
 	 sign = 1;
       }
       result = 0;
-      while('0' <= c && c <= '9') {
+      while ('0' <= c && c <= '9') {
 	 result = result*10 + (c - '0');
 	 c = *str++;
       }
@@ -228,7 +230,9 @@ debug_get_num_option(const char *name, long dfault)
    return result;
 }
 
-static boolean str_has_option(const char *str, const char *name)
+
+static boolean
+str_has_option(const char *str, const char *name)
 {
    /* Empty string. */
    if (!*str) {
@@ -271,8 +275,9 @@ static boolean str_has_option(const char *str, const char *name)
    return FALSE;
 }
 
+
 uint64_t
-debug_get_flags_option(const char *name, 
+debug_get_flags_option(const char *name,
                        const struct debug_named_value *flags,
                        uint64_t dfault)
 {
@@ -280,9 +285,9 @@ debug_get_flags_option(const char *name,
    const char *str;
    const struct debug_named_value *orig = flags;
    unsigned namealign = 0;
-   
+
    str = os_get_option(name);
-   if(!str)
+   if (!str)
       result = dfault;
    else if (!util_strcmp(str, "help")) {
       result = dfault;
@@ -296,7 +301,7 @@ debug_get_flags_option(const char *name,
    }
    else {
       result = 0;
-      while( flags->name ) {
+      while (flags->name) {
 	 if (str_has_option(str, flags->name))
 	    result |= flags->value;
 	 ++flags;
@@ -305,7 +310,8 @@ debug_get_flags_option(const char *name,
 
    if (debug_get_option_should_print()) {
       if (str) {
-         debug_printf("%s: %s = 0x%"PRIx64" (%s)\n", __FUNCTION__, name, result, str);
+         debug_printf("%s: %s = 0x%"PRIx64" (%s)\n",
+                      __FUNCTION__, name, result, str);
       } else {
          debug_printf("%s: %s = 0x%"PRIx64"\n", __FUNCTION__, name, result);
       }
@@ -315,24 +321,24 @@ debug_get_flags_option(const char *name,
 }
 
 
-void _debug_assert_fail(const char *expr, 
-                        const char *file, 
-                        unsigned line, 
-                        const char *function) 
+void
+_debug_assert_fail(const char *expr, const char *file, unsigned line,
+                   const char *function)
 {
-   _debug_printf("%s:%u:%s: Assertion `%s' failed.\n", file, line, function, expr);
+   _debug_printf("%s:%u:%s: Assertion `%s' failed.\n",
+                 file, line, function, expr);
    os_abort();
 }
 
 
 const char *
-debug_dump_enum(const struct debug_named_value *names, 
+debug_dump_enum(const struct debug_named_value *names,
                 unsigned long value)
 {
    static char rest[64];
-   
-   while(names->name) {
-      if(names->value == value)
+
+   while (names->name) {
+      if (names->value == value)
 	 return names->name;
       ++names;
    }
@@ -343,14 +349,14 @@ debug_dump_enum(const struct debug_named_value *names,
 
 
 const char *
-debug_dump_enum_noprefix(const struct debug_named_value *names, 
+debug_dump_enum_noprefix(const struct debug_named_value *names,
                          const char *prefix,
                          unsigned long value)
 {
    static char rest[64];
-   
-   while(names->name) {
-      if(names->value == value) {
+
+   while (names->name) {
+      if (names->value == value) {
          const char *name = names->name;
          while (*name == *prefix) {
             name++;
@@ -361,16 +367,13 @@ debug_dump_enum_noprefix(const struct debug_named_value *names,
       ++names;
    }
 
-   
-
    util_snprintf(rest, sizeof(rest), "0x%08lx", value);
    return rest;
 }
 
 
 const char *
-debug_dump_flags(const struct debug_named_value *names, 
-                 unsigned long value)
+debug_dump_flags(const struct debug_named_value *names, unsigned long value)
 {
    static char output[4096];
    static char rest[256];
@@ -378,8 +381,8 @@ debug_dump_flags(const struct debug_named_value *names,
 
    output[0] = '\0';
 
-   while(names->name) {
-      if((names->value & value) == names->value) {
+   while (names->name) {
+      if ((names->value & value) == names->value) {
 	 if (!first)
 	    util_strncat(output, "|", sizeof(output) - strlen(output) - 1);
 	 else
@@ -390,27 +393,28 @@ debug_dump_flags(const struct debug_named_value *names,
       }
       ++names;
    }
-   
+
    if (value) {
       if (!first)
 	 util_strncat(output, "|", sizeof(output) - strlen(output) - 1);
       else
 	 first = 0;
-      
+
       util_snprintf(rest, sizeof(rest), "0x%08lx", value);
       util_strncat(output, rest, sizeof(output) - strlen(output) - 1);
       output[sizeof(output) - 1] = '\0';
    }
-   
-   if(first)
+
+   if (first)
       return "0";
-   
+
    return output;
 }
 
 
 #ifdef DEBUG
-void debug_print_format(const char *msg, unsigned fmt )
+void
+debug_print_format(const char *msg, unsigned fmt )
 {
    debug_printf("%s: %s\n", msg, util_format_name(fmt));
 }
@@ -447,7 +451,8 @@ u_prim_name(unsigned prim)
 int fl_indent = 0;
 const char* fl_function[1024];
 
-int debug_funclog_enter(const char* f, const int line, const char* file)
+int
+debug_funclog_enter(const char* f, const int line, const char* file)
 {
    int i;
 
@@ -461,14 +466,16 @@ int debug_funclog_enter(const char* f, const int line, const char* file)
    return 0;
 }
 
-void debug_funclog_exit(const char* f, const int line, const char* file)
+void
+debug_funclog_exit(const char* f, const int line, const char* file)
 {
    --fl_indent;
    assert(fl_indent >= 0);
    assert(fl_function[fl_indent] == f);
 }
 
-void debug_funclog_enter_exit(const char* f, const int line, const char* file)
+void
+debug_funclog_enter_exit(const char* f, const int line, const char* file)
 {
    int i;
    for (i = 0; i < fl_indent; i++)
@@ -481,313 +488,6 @@ void debug_funclog_enter_exit(const char* f, const int line, const char* file)
 
 #ifdef DEBUG
 /**
- * Dump an image to .ppm file.
- * \param format  PIPE_FORMAT_x
- * \param cpp  bytes per pixel
- * \param width  width in pixels
- * \param height height in pixels
- * \param stride  row stride in bytes
- */
-void debug_dump_image(const char *prefix,
-                      enum pipe_format format, unsigned cpp,
-                      unsigned width, unsigned height,
-                      unsigned stride,
-                      const void *data)     
-{
-   /* write a ppm file */
-   char filename[256];
-   unsigned char *rgb8;
-   FILE *f;
-
-   util_snprintf(filename, sizeof(filename), "%s.ppm", prefix);
-
-   rgb8 = MALLOC(height * width * 3);
-   if (!rgb8) {
-      return;
-   }
-
-   util_format_translate(
-         PIPE_FORMAT_R8G8B8_UNORM,
-         rgb8, width * 3,
-         0, 0,
-         format,
-         data, stride,
-         0, 0, width, height);
-
-   /* Must be opened in binary mode or DOS line ending causes data
-    * to be read with one byte offset.
-    */
-   f = fopen(filename, "wb");
-   if (f) {
-      fprintf(f, "P6\n");
-      fprintf(f, "# ppm-file created by gallium\n");
-      fprintf(f, "%i %i\n", width, height);
-      fprintf(f, "255\n");
-      fwrite(rgb8, 1, height * width * 3, f);
-      fclose(f);
-   }
-   else {
-      fprintf(stderr, "Can't open %s for writing\n", filename);
-   }
-
-   FREE(rgb8);
-}
-
-/* FIXME: dump resources, not surfaces... */
-void debug_dump_surface(struct pipe_context *pipe,
-                        const char *prefix,
-                        struct pipe_surface *surface)
-{
-   struct pipe_resource *texture;
-   struct pipe_transfer *transfer;
-   void *data;
-
-   if (!surface)
-      return;
-
-   /* XXX: this doesn't necessarily work, as the driver may be using
-    * temporary storage for the surface which hasn't been propagated
-    * back into the texture.  Need to nail down the semantics of views
-    * and transfers a bit better before we can say if extra work needs
-    * to be done here:
-    */
-   texture = surface->texture;
-
-   data = pipe_transfer_map(pipe, texture, surface->u.tex.level,
-                            surface->u.tex.first_layer,
-                            PIPE_TRANSFER_READ,
-                            0, 0, surface->width, surface->height, &transfer);
-   if (!data)
-      return;
-
-   debug_dump_image(prefix,
-                    texture->format,
-                    util_format_get_blocksize(texture->format),
-                    util_format_get_nblocksx(texture->format, surface->width),
-                    util_format_get_nblocksy(texture->format, surface->height),
-                    transfer->stride,
-                    data);
-
-   pipe->transfer_unmap(pipe, transfer);
-}
-
-
-void debug_dump_texture(struct pipe_context *pipe,
-                        const char *prefix,
-                        struct pipe_resource *texture)
-{
-   struct pipe_surface *surface, surf_tmpl;
-
-   if (!texture)
-      return;
-
-   /* XXX for now, just dump image for layer=0, level=0 */
-   u_surface_default_template(&surf_tmpl, texture);
-   surface = pipe->create_surface(pipe, texture, &surf_tmpl);
-   if (surface) {
-      debug_dump_surface(pipe, prefix, surface);
-      pipe->surface_destroy(pipe, surface);
-   }
-}
-
-
-#pragma pack(push,2)
-struct bmp_file_header {
-   uint16_t bfType;
-   uint32_t bfSize;
-   uint16_t bfReserved1;
-   uint16_t bfReserved2;
-   uint32_t bfOffBits;
-};
-#pragma pack(pop)
-
-struct bmp_info_header {
-   uint32_t biSize;
-   int32_t biWidth;
-   int32_t biHeight;
-   uint16_t biPlanes;
-   uint16_t biBitCount;
-   uint32_t biCompression;
-   uint32_t biSizeImage;
-   int32_t biXPelsPerMeter;
-   int32_t biYPelsPerMeter;
-   uint32_t biClrUsed;
-   uint32_t biClrImportant;
-};
-
-struct bmp_rgb_quad {
-   uint8_t rgbBlue;
-   uint8_t rgbGreen;
-   uint8_t rgbRed;
-   uint8_t rgbAlpha;
-};
-
-void
-debug_dump_surface_bmp(struct pipe_context *pipe,
-                       const char *filename,
-                       struct pipe_surface *surface)
-{
-   struct pipe_transfer *transfer;
-   struct pipe_resource *texture = surface->texture;
-   void *ptr;
-
-   ptr = pipe_transfer_map(pipe, texture, surface->u.tex.level,
-                           surface->u.tex.first_layer, PIPE_TRANSFER_READ,
-                           0, 0, surface->width, surface->height, &transfer);
-
-   debug_dump_transfer_bmp(pipe, filename, transfer, ptr);
-
-   pipe->transfer_unmap(pipe, transfer);
-}
-
-void
-debug_dump_transfer_bmp(struct pipe_context *pipe,
-                        const char *filename,
-                        struct pipe_transfer *transfer, void *ptr)
-{
-   float *rgba;
-
-   if (!transfer)
-      goto error1;
-
-   rgba = MALLOC(transfer->box.width *
-		 transfer->box.height *
-		 transfer->box.depth *
-		 4*sizeof(float));
-   if (!rgba)
-      goto error1;
-
-   pipe_get_tile_rgba(transfer, ptr, 0, 0,
-                      transfer->box.width, transfer->box.height,
-                      rgba);
-
-   debug_dump_float_rgba_bmp(filename,
-                             transfer->box.width, transfer->box.height,
-                             rgba, transfer->box.width);
-
-   FREE(rgba);
-error1:
-   ;
-}
-
-void
-debug_dump_float_rgba_bmp(const char *filename,
-                          unsigned width, unsigned height,
-                          float *rgba, unsigned stride)
-{
-   FILE *stream;
-   struct bmp_file_header bmfh;
-   struct bmp_info_header bmih;
-   unsigned x, y;
-
-   if (!rgba)
-      goto error1;
-
-   bmfh.bfType = 0x4d42;
-   bmfh.bfSize = 14 + 40 + height*width*4;
-   bmfh.bfReserved1 = 0;
-   bmfh.bfReserved2 = 0;
-   bmfh.bfOffBits = 14 + 40;
-
-   bmih.biSize = 40;
-   bmih.biWidth = width;
-   bmih.biHeight = height;
-   bmih.biPlanes = 1;
-   bmih.biBitCount = 32;
-   bmih.biCompression = 0;
-   bmih.biSizeImage = height*width*4;
-   bmih.biXPelsPerMeter = 0;
-   bmih.biYPelsPerMeter = 0;
-   bmih.biClrUsed = 0;
-   bmih.biClrImportant = 0;
-
-   stream = fopen(filename, "wb");
-   if (!stream)
-      goto error1;
-
-   fwrite(&bmfh, 14, 1, stream);
-   fwrite(&bmih, 40, 1, stream);
-
-   y = height;
-   while(y--) {
-      float *ptr = rgba + (stride * y * 4);
-      for(x = 0; x < width; ++x)
-      {
-         struct bmp_rgb_quad pixel;
-         pixel.rgbRed   = float_to_ubyte(ptr[x*4 + 0]);
-         pixel.rgbGreen = float_to_ubyte(ptr[x*4 + 1]);
-         pixel.rgbBlue  = float_to_ubyte(ptr[x*4 + 2]);
-         pixel.rgbAlpha = float_to_ubyte(ptr[x*4 + 3]);
-         fwrite(&pixel, 1, 4, stream);
-      }
-   }
-
-   fclose(stream);
-error1:
-   ;
-}
-
-void
-debug_dump_ubyte_rgba_bmp(const char *filename,
-                          unsigned width, unsigned height,
-                          const ubyte *rgba, unsigned stride)
-{
-   FILE *stream;
-   struct bmp_file_header bmfh;
-   struct bmp_info_header bmih;
-   unsigned x, y;
-
-   assert(rgba);
-   if(!rgba)
-      goto error1;
-
-   bmfh.bfType = 0x4d42;
-   bmfh.bfSize = 14 + 40 + height*width*4;
-   bmfh.bfReserved1 = 0;
-   bmfh.bfReserved2 = 0;
-   bmfh.bfOffBits = 14 + 40;
-
-   bmih.biSize = 40;
-   bmih.biWidth = width;
-   bmih.biHeight = height;
-   bmih.biPlanes = 1;
-   bmih.biBitCount = 32;
-   bmih.biCompression = 0;
-   bmih.biSizeImage = height*width*4;
-   bmih.biXPelsPerMeter = 0;
-   bmih.biYPelsPerMeter = 0;
-   bmih.biClrUsed = 0;
-   bmih.biClrImportant = 0;
-
-   stream = fopen(filename, "wb");
-   assert(stream);
-   if(!stream)
-      goto error1;
-
-   fwrite(&bmfh, 14, 1, stream);
-   fwrite(&bmih, 40, 1, stream);
-
-   y = height;
-   while(y--) {
-      const ubyte *ptr = rgba + (stride * y * 4);
-      for(x = 0; x < width; ++x)
-      {
-         struct bmp_rgb_quad pixel;
-         pixel.rgbRed   = ptr[x*4 + 0];
-         pixel.rgbGreen = ptr[x*4 + 1];
-         pixel.rgbBlue  = ptr[x*4 + 2];
-         pixel.rgbAlpha = ptr[x*4 + 3];
-         fwrite(&pixel, 1, 4, stream);
-      }
-   }
-
-   fclose(stream);
-error1:
-   ;
-}
-
-
-/**
  * Print PIPE_TRANSFER_x flags with a message.
  */
 void
diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h
index 671bd37a085..c2707b402cb 100644
--- a/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@ -464,45 +464,6 @@ void
 debug_memory_end(unsigned long beginning);
 
 
-#ifdef DEBUG
-struct pipe_context;
-struct pipe_surface;
-struct pipe_transfer;
-struct pipe_resource;
-
-void debug_dump_image(const char *prefix,
-                      enum pipe_format format, unsigned cpp,
-                      unsigned width, unsigned height,
-                      unsigned stride,
-                      const void *data);
-void debug_dump_surface(struct pipe_context *pipe,
-			const char *prefix,
-                        struct pipe_surface *surface);   
-void debug_dump_texture(struct pipe_context *pipe,
-			const char *prefix,
-                        struct pipe_resource *texture);
-void debug_dump_surface_bmp(struct pipe_context *pipe,
-                            const char *filename,
-                            struct pipe_surface *surface);
-void debug_dump_transfer_bmp(struct pipe_context *pipe,
-                             const char *filename,
-                             struct pipe_transfer *transfer, void *ptr);
-void debug_dump_float_rgba_bmp(const char *filename,
-                               unsigned width, unsigned height,
-                               float *rgba, unsigned stride);
-void debug_dump_ubyte_rgba_bmp(const char *filename,
-                               unsigned width, unsigned height,
-                               const ubyte *rgba, unsigned stride);
-#else
-#define debug_dump_image(prefix, format, cpp, width, height, stride, data) ((void)0)
-#define debug_dump_surface(pipe, prefix, surface) ((void)0)
-#define debug_dump_surface_bmp(pipe, filename, surface) ((void)0)
-#define debug_dump_transfer_bmp(filename, transfer, ptr) ((void)0)
-#define debug_dump_float_rgba_bmp(filename, width, height, rgba, stride) ((void)0)
-#define debug_dump_ubyte_rgba_bmp(filename, width, height, rgba, stride) ((void)0)
-#endif
-
-
 void
 debug_print_transfer_flags(const char *msg, unsigned usage);
 
diff --git a/src/gallium/auxiliary/util/u_debug_image.c b/src/gallium/auxiliary/util/u_debug_image.c
new file mode 100644
index 00000000000..98d73a63de2
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_debug_image.c
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) 2008-2016 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+#include "util/u_debug_image.h"
+#include "util/u_format.h"
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_string.h"
+#include "util/u_surface.h"
+#include "util/u_tile.h"
+
+#include <stdio.h>
+
+
+#ifdef DEBUG
+
+/**
+ * Dump an image to the file "<prefix>.ppm" (binary "P6" PPM, 8-bit RGB).
+ * \param prefix  output file name without the ".ppm" extension
+ * \param format  PIPE_FORMAT_x of \p data
+ * \param cpp  bytes per pixel (not used by this function)
+ * \param width  width in pixels
+ * \param height height in pixels
+ * \param stride  row stride in bytes
+ * \param data  source pixels, translated to R8G8B8_UNORM before writing
+ */
+void
+debug_dump_image(const char *prefix,
+                 enum pipe_format format, unsigned cpp,
+                 unsigned width, unsigned height,
+                 unsigned stride,
+                 const void *data)
+{
+   char filename[256];
+   unsigned char *rgb8;
+   FILE *f;
+
+   util_snprintf(filename, sizeof(filename), "%s.ppm", prefix);
+
+   rgb8 = MALLOC(height * width * 3);
+   if (!rgb8)
+      return;
+
+   util_format_translate(
+         PIPE_FORMAT_R8G8B8_UNORM,
+         rgb8, width * 3,
+         0, 0,
+         format,
+         data, stride,
+         0, 0, width, height);
+
+   /* Must be opened in binary mode, otherwise DOS line-ending translation
+    * corrupts the pixel data (it is read back with a one-byte offset).
+    */
+   f = fopen(filename, "wb");
+   if (f) {
+      fprintf(f, "P6\n");
+      fprintf(f, "# ppm-file created by gallium\n");
+      fprintf(f, "%u %u\n", width, height);   /* both are unsigned */
+      fprintf(f, "255\n");
+      fwrite(rgb8, 1, height * width * 3, f);
+      fclose(f);
+   }
+   else {
+      fprintf(stderr, "Can't open %s for writing\n", filename);
+   }
+
+   FREE(rgb8);
+}
+
+
+/* FIXME: dump resources, not surfaces... */
+void
+debug_dump_surface(struct pipe_context *pipe,
+                   const char *prefix,
+                   struct pipe_surface *surface)
+{
+   struct pipe_resource *texture;
+   struct pipe_transfer *transfer;
+   void *data;
+
+   if (!surface)
+      return;
+
+   /* XXX: this doesn't necessarily work, as the driver may be using
+    * temporary storage for the surface which hasn't been propagated
+    * back into the texture.  Need to nail down the semantics of views
+    * and transfers a bit better before we can say if extra work needs
+    * to be done here:
+    */
+   texture = surface->texture;
+
+   data = pipe_transfer_map(pipe, texture, surface->u.tex.level,
+                            surface->u.tex.first_layer,
+                            PIPE_TRANSFER_READ,
+                            0, 0, surface->width, surface->height, &transfer);
+   if (!data)
+      return;
+
+   debug_dump_image(prefix,
+                    texture->format,
+                    util_format_get_blocksize(texture->format),
+                    util_format_get_nblocksx(texture->format, surface->width),
+                    util_format_get_nblocksy(texture->format, surface->height),
+                    transfer->stride,
+                    data);
+
+   pipe->transfer_unmap(pipe, transfer);
+}
+
+
+void
+debug_dump_texture(struct pipe_context *pipe,
+                   const char *prefix,
+                   struct pipe_resource *texture)
+{
+   struct pipe_surface *surface, surf_tmpl;
+
+   if (!texture)
+      return;
+
+   /* XXX for now, just dump image for layer=0, level=0 */
+   u_surface_default_template(&surf_tmpl, texture);
+   surface = pipe->create_surface(pipe, texture, &surf_tmpl);
+   if (surface) {
+      debug_dump_surface(pipe, prefix, surface);
+      pipe->surface_destroy(pipe, surface);
+   }
+}
+
+
+#pragma pack(push,2)
+struct bmp_file_header {
+   uint16_t bfType;
+   uint32_t bfSize;
+   uint16_t bfReserved1;
+   uint16_t bfReserved2;
+   uint32_t bfOffBits;
+};
+#pragma pack(pop)
+
+struct bmp_info_header {
+   uint32_t biSize;
+   int32_t biWidth;
+   int32_t biHeight;
+   uint16_t biPlanes;
+   uint16_t biBitCount;
+   uint32_t biCompression;
+   uint32_t biSizeImage;
+   int32_t biXPelsPerMeter;
+   int32_t biYPelsPerMeter;
+   uint32_t biClrUsed;
+   uint32_t biClrImportant;
+};
+
+struct bmp_rgb_quad {
+   uint8_t rgbBlue;
+   uint8_t rgbGreen;
+   uint8_t rgbRed;
+   uint8_t rgbAlpha;
+};
+
+/** Dump \p surface to a .bmp file; does nothing if it cannot be mapped. */
+void
+debug_dump_surface_bmp(struct pipe_context *pipe,
+                       const char *filename,
+                       struct pipe_surface *surface)
+{
+   struct pipe_transfer *transfer;
+   struct pipe_resource *texture = surface->texture;
+   void *ptr;
+   ptr = pipe_transfer_map(pipe, texture, surface->u.tex.level,
+                           surface->u.tex.first_layer, PIPE_TRANSFER_READ,
+                           0, 0, surface->width, surface->height, &transfer);
+   if (!ptr)   /* map failed; 'transfer' may be unset: don't dump or unmap */
+      return;
+   debug_dump_transfer_bmp(pipe, filename, transfer, ptr);
+   pipe->transfer_unmap(pipe, transfer);
+}
+
+/**
+ * Read a mapped transfer into a temporary float RGBA buffer and write it
+ * as a .bmp file of box.width x box.height pixels.  Does nothing if
+ * \p transfer or \p ptr is NULL, or if the buffer cannot be allocated.
+ */
+void
+debug_dump_transfer_bmp(struct pipe_context *pipe,
+                        const char *filename,
+                        struct pipe_transfer *transfer, void *ptr)
+{
+   float *rgba;
+
+   if (!transfer || !ptr)
+      return;
+
+   rgba = MALLOC(transfer->box.width *
+                 transfer->box.height *
+                 transfer->box.depth *
+                 4*sizeof(float));
+   if (!rgba)
+      return;
+
+   pipe_get_tile_rgba(transfer, ptr, 0, 0,
+                      transfer->box.width, transfer->box.height, rgba);
+   debug_dump_float_rgba_bmp(filename,
+                             transfer->box.width, transfer->box.height,
+                             rgba, transfer->box.width);
+   FREE(rgba);
+}
+
+void
+debug_dump_float_rgba_bmp(const char *filename,
+                          unsigned width, unsigned height,
+                          float *rgba, unsigned stride)
+{
+   FILE *stream;
+   struct bmp_file_header bmfh;
+   struct bmp_info_header bmih;
+   unsigned x, y;
+
+   if (!rgba)
+      goto error1;
+
+   bmfh.bfType = 0x4d42;
+   bmfh.bfSize = 14 + 40 + height*width*4;
+   bmfh.bfReserved1 = 0;
+   bmfh.bfReserved2 = 0;
+   bmfh.bfOffBits = 14 + 40;
+
+   bmih.biSize = 40;
+   bmih.biWidth = width;
+   bmih.biHeight = height;
+   bmih.biPlanes = 1;
+   bmih.biBitCount = 32;
+   bmih.biCompression = 0;
+   bmih.biSizeImage = height*width*4;
+   bmih.biXPelsPerMeter = 0;
+   bmih.biYPelsPerMeter = 0;
+   bmih.biClrUsed = 0;
+   bmih.biClrImportant = 0;
+
+   stream = fopen(filename, "wb");
+   if (!stream)
+      goto error1;
+
+   fwrite(&bmfh, 14, 1, stream);
+   fwrite(&bmih, 40, 1, stream);
+
+   y = height;
+   while (y--) {
+      float *ptr = rgba + (stride * y * 4);
+      for (x = 0; x < width; ++x) {
+         struct bmp_rgb_quad pixel;
+         pixel.rgbRed   = float_to_ubyte(ptr[x*4 + 0]);
+         pixel.rgbGreen = float_to_ubyte(ptr[x*4 + 1]);
+         pixel.rgbBlue  = float_to_ubyte(ptr[x*4 + 2]);
+         pixel.rgbAlpha = float_to_ubyte(ptr[x*4 + 3]);
+         fwrite(&pixel, 1, 4, stream);
+      }
+   }
+
+   fclose(stream);
+error1:
+   ;
+}
+
+void
+debug_dump_ubyte_rgba_bmp(const char *filename,
+                          unsigned width, unsigned height,
+                          const ubyte *rgba, unsigned stride)
+{
+   FILE *stream;
+   struct bmp_file_header bmfh;
+   struct bmp_info_header bmih;
+   unsigned x, y;
+
+   assert(rgba);
+   if (!rgba)
+      goto error1;
+
+   bmfh.bfType = 0x4d42;
+   bmfh.bfSize = 14 + 40 + height*width*4;
+   bmfh.bfReserved1 = 0;
+   bmfh.bfReserved2 = 0;
+   bmfh.bfOffBits = 14 + 40;
+
+   bmih.biSize = 40;
+   bmih.biWidth = width;
+   bmih.biHeight = height;
+   bmih.biPlanes = 1;
+   bmih.biBitCount = 32;
+   bmih.biCompression = 0;
+   bmih.biSizeImage = height*width*4;
+   bmih.biXPelsPerMeter = 0;
+   bmih.biYPelsPerMeter = 0;
+   bmih.biClrUsed = 0;
+   bmih.biClrImportant = 0;
+
+   stream = fopen(filename, "wb");
+   assert(stream);
+   if (!stream)
+      goto error1;
+
+   fwrite(&bmfh, 14, 1, stream);
+   fwrite(&bmih, 40, 1, stream);
+
+   y = height;
+   while (y--) {
+      const ubyte *ptr = rgba + (stride * y * 4);
+      for (x = 0; x < width; ++x) {
+         struct bmp_rgb_quad pixel;
+         pixel.rgbRed   = ptr[x*4 + 0];
+         pixel.rgbGreen = ptr[x*4 + 1];
+         pixel.rgbBlue  = ptr[x*4 + 2];
+         pixel.rgbAlpha = ptr[x*4 + 3];
+         fwrite(&pixel, 1, 4, stream);
+      }
+   }
+
+   fclose(stream);
+error1:
+   ;
+}
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_debug_image.h b/src/gallium/auxiliary/util/u_debug_image.h
new file mode 100644
index 00000000000..f190eec5f52
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_debug_image.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2008-2016 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef U_DEBUG_IMAGE_H
+#define U_DEBUG_IMAGE_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_format.h"
+
+
+#ifdef DEBUG
+struct pipe_context;
+struct pipe_surface;
+struct pipe_transfer;
+struct pipe_resource;
+
+void debug_dump_image(const char *prefix,
+                      enum pipe_format format, unsigned cpp,
+                      unsigned width, unsigned height,
+                      unsigned stride,
+                      const void *data);
+void debug_dump_surface(struct pipe_context *pipe,
+			const char *prefix,
+                        struct pipe_surface *surface);
+void debug_dump_texture(struct pipe_context *pipe,
+			const char *prefix,
+                        struct pipe_resource *texture);
+void debug_dump_surface_bmp(struct pipe_context *pipe,
+                            const char *filename,
+                            struct pipe_surface *surface);
+void debug_dump_transfer_bmp(struct pipe_context *pipe,
+                             const char *filename,
+                             struct pipe_transfer *transfer, void *ptr);
+void debug_dump_float_rgba_bmp(const char *filename,
+                               unsigned width, unsigned height,
+                               float *rgba, unsigned stride);
+void debug_dump_ubyte_rgba_bmp(const char *filename,
+                               unsigned width, unsigned height,
+                               const ubyte *rgba, unsigned stride);
+#else
+#define debug_dump_image(prefix, format, cpp, width, height, stride, data) ((void)0)
+#define debug_dump_surface(pipe, prefix, surface) ((void)0)
+#define debug_dump_surface_bmp(pipe, filename, surface) ((void)0)
+#define debug_dump_transfer_bmp(pipe, filename, transfer, ptr) ((void)0)
+#define debug_dump_float_rgba_bmp(filename, width, height, rgba, stride) ((void)0)
+#define debug_dump_ubyte_rgba_bmp(filename, width, height, rgba, stride) ((void)0)
+#endif
+
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_debug_stack.c b/src/gallium/auxiliary/util/u_debug_stack.c
index 68961d3510e..1faa1903a76 100644
--- a/src/gallium/auxiliary/util/u_debug_stack.c
+++ b/src/gallium/auxiliary/util/u_debug_stack.c
@@ -2,7 +2,7 @@
  * 
  * Copyright 2009 VMware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,13 +22,13 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 /**
  * @file
  * Stack backtracing.
- * 
+ *
  * @author Jose Fonseca <[email protected]>
  */
 
@@ -44,12 +44,13 @@
 /**
  * Capture stack backtrace.
  *
- * NOTE: The implementation of this function is quite big, but it is important not to
- * break it down in smaller functions to avoid adding new frames to the calling stack.
+ * NOTE: The implementation of this function is quite big, but it is important
+ * not to break it down in smaller functions to avoid adding new frames to the
+ * calling stack.
  */
 void
 debug_backtrace_capture(struct debug_stack_frame *backtrace,
-                        unsigned start_frame, 
+                        unsigned start_frame,
                         unsigned nr_frames)
 {
    const void **frame_pointer = NULL;
@@ -66,7 +67,8 @@ debug_backtrace_capture(struct debug_stack_frame *backtrace,
     */
 #if defined(PIPE_OS_WINDOWS)
    {
-      typedef USHORT (WINAPI *PFNCAPTURESTACKBACKTRACE)(ULONG, ULONG, PVOID *, PULONG);
+      typedef USHORT (WINAPI *PFNCAPTURESTACKBACKTRACE)(ULONG, ULONG,
+                                                        PVOID *, PULONG);
       static PFNCAPTURESTACKBACKTRACE pfnCaptureStackBackTrace = NULL;
 
       if (!pfnCaptureStackBackTrace) {
@@ -76,8 +78,9 @@ debug_backtrace_capture(struct debug_stack_frame *backtrace,
             assert(hModule);
          }
          if (hModule) {
-            pfnCaptureStackBackTrace = (PFNCAPTURESTACKBACKTRACE)GetProcAddress(hModule,
-                                                                                "RtlCaptureStackBackTrace");
+            pfnCaptureStackBackTrace =
+               (PFNCAPTURESTACKBACKTRACE)GetProcAddress(hModule,
+                                                "RtlCaptureStackBackTrace");
          }
       }
       if (pfnCaptureStackBackTrace) {
@@ -88,7 +91,8 @@ debug_backtrace_capture(struct debug_stack_frame *backtrace,
          start_frame += 1;
 
          assert(start_frame + nr_frames < 63);
-         i = pfnCaptureStackBackTrace(start_frame, nr_frames, (PVOID *) &backtrace->function, NULL);
+         i = pfnCaptureStackBackTrace(start_frame, nr_frames,
+                                      (PVOID *) &backtrace->function, NULL);
 
          /* Pad remaing requested frames with NULL */
          while (i < nr_frames) {
@@ -110,50 +114,49 @@ debug_backtrace_capture(struct debug_stack_frame *backtrace,
 #else
    frame_pointer = NULL;
 #endif
-  
-   
+
 #ifdef PIPE_ARCH_X86
-   while(nr_frames) {
+   while (nr_frames) {
       const void **next_frame_pointer;
 
-      if(!frame_pointer)
+      if (!frame_pointer)
          break;
-      
-      if(start_frame)
+
+      if (start_frame)
          --start_frame;
       else {
          backtrace[i++].function = frame_pointer[1];
          --nr_frames;
       }
-      
+
       next_frame_pointer = (const void **)frame_pointer[0];
-      
+
       /* Limit the stack walk to avoid referencing undefined memory */
-      if((uintptr_t)next_frame_pointer <= (uintptr_t)frame_pointer ||
-         (uintptr_t)next_frame_pointer > (uintptr_t)frame_pointer + 64*1024)
+      if ((uintptr_t)next_frame_pointer <= (uintptr_t)frame_pointer ||
+          (uintptr_t)next_frame_pointer > (uintptr_t)frame_pointer + 64*1024)
          break;
-      
+
       frame_pointer = next_frame_pointer;
    }
 #else
    (void) frame_pointer;
 #endif
 
-   while(nr_frames) {
+   while (nr_frames) {
       backtrace[i++].function = NULL;
       --nr_frames;
    }
 }
-   
+
 
 void
-debug_backtrace_dump(const struct debug_stack_frame *backtrace, 
+debug_backtrace_dump(const struct debug_stack_frame *backtrace,
                      unsigned nr_frames)
 {
    unsigned i;
-   
-   for(i = 0; i < nr_frames; ++i) {
-      if(!backtrace[i].function)
+
+   for (i = 0; i < nr_frames; ++i) {
+      if (!backtrace[i].function)
          break;
       debug_symbol_print(backtrace[i].function);
    }
diff --git a/src/gallium/auxiliary/util/u_pstipple.c b/src/gallium/auxiliary/util/u_pstipple.c
index 3428172203b..74e6f99da67 100644
--- a/src/gallium/auxiliary/util/u_pstipple.c
+++ b/src/gallium/auxiliary/util/u_pstipple.c
@@ -58,7 +58,7 @@
 #define NUM_NEW_TOKENS 53
 
 
-static void
+void
 util_pstipple_update_stipple_texture(struct pipe_context *pipe,
                                      struct pipe_resource *tex,
                                      const uint32_t pattern[32])
@@ -118,7 +118,7 @@ util_pstipple_create_stipple_texture(struct pipe_context *pipe,
 
    tex = screen->resource_create(screen, &templat);
 
-   if (tex)
+   if (tex && pattern)
       util_pstipple_update_stipple_texture(pipe, tex, pattern);
 
    return tex;
diff --git a/src/gallium/auxiliary/util/u_pstipple.h b/src/gallium/auxiliary/util/u_pstipple.h
index ef8396f4318..d1662be2839 100644
--- a/src/gallium/auxiliary/util/u_pstipple.h
+++ b/src/gallium/auxiliary/util/u_pstipple.h
@@ -36,6 +36,11 @@ struct pipe_resource;
 struct pipe_shader_state;
 
 
+extern void
+util_pstipple_update_stipple_texture(struct pipe_context *pipe,
+                                     struct pipe_resource *tex,
+                                     const uint32_t pattern[32]);
+
 extern struct pipe_resource *
 util_pstipple_create_stipple_texture(struct pipe_context *pipe,
                                      const uint32_t pattern[32]);
diff --git a/src/gallium/auxiliary/util/u_staging.c b/src/gallium/auxiliary/util/u_staging.c
index b569c8f9907..caef2a8245c 100644
--- a/src/gallium/auxiliary/util/u_staging.c
+++ b/src/gallium/auxiliary/util/u_staging.c
@@ -29,11 +29,14 @@
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
 
+
 static void
-util_staging_resource_template(struct pipe_resource *pt, unsigned width, unsigned height, unsigned depth, struct pipe_resource *template)
+util_staging_resource_template(struct pipe_resource *pt, unsigned width,
+                               unsigned height, unsigned depth,
+                               struct pipe_resource *template)
 {
    memset(template, 0, sizeof(struct pipe_resource));
-   if(pt->target != PIPE_BUFFER && depth <= 1)
+   if (pt->target != PIPE_BUFFER && depth <= 1)
       template->target = PIPE_TEXTURE_RECT;
    else
       template->target = pt->target;
@@ -49,16 +52,15 @@ util_staging_resource_template(struct pipe_resource *pt, unsigned width, unsigne
    template->flags = 0;
 }
 
+
 struct util_staging_transfer *
 util_staging_transfer_init(struct pipe_context *pipe,
-           struct pipe_resource *pt,
-           unsigned level,
-           unsigned usage,
-           const struct pipe_box *box,
-           boolean direct, struct util_staging_transfer *tx)
+                           struct pipe_resource *pt,
+                           unsigned level, unsigned usage,
+                           const struct pipe_box *box,
+                           boolean direct, struct util_staging_transfer *tx)
 {
    struct pipe_screen *pscreen = pipe->screen;
-
    struct pipe_resource staging_resource_template;
 
    pipe_resource_reference(&tx->base.resource, pt);
@@ -66,23 +68,22 @@ util_staging_transfer_init(struct pipe_context *pipe,
    tx->base.usage = usage;
    tx->base.box = *box;
 
-   if (direct)
-   {
+   if (direct) {
       tx->staging_resource = pt;
       return tx;
    }
 
-   util_staging_resource_template(pt, box->width, box->height, box->depth, &staging_resource_template);
-   tx->staging_resource = pscreen->resource_create(pscreen, &staging_resource_template);
-   if (!tx->staging_resource)
-   {
+   util_staging_resource_template(pt, box->width, box->height,
+                                  box->depth, &staging_resource_template);
+   tx->staging_resource = pscreen->resource_create(pscreen,
+                                                   &staging_resource_template);
+   if (!tx->staging_resource) {
       pipe_resource_reference(&tx->base.resource, NULL);
       FREE(tx);
       return NULL;
    }
 
-   if (usage & PIPE_TRANSFER_READ)
-   {
+   if (usage & PIPE_TRANSFER_READ) {
       /* XXX this looks wrong dst is always the same but looping over src z? */
       int zi;
       struct pipe_box sbox;
@@ -92,7 +93,7 @@ util_staging_transfer_init(struct pipe_context *pipe,
       sbox.width = box->width;
       sbox.height = box->height;
       sbox.depth = 1;
-      for(zi = 0; zi < box->depth; ++zi) {
+      for (zi = 0; zi < box->depth; ++zi) {
          sbox.z = sbox.z + zi;
          pipe->resource_copy_region(pipe, tx->staging_resource, 0, 0, 0, 0,
                                     tx->base.resource, level, &sbox);
@@ -102,14 +103,15 @@ util_staging_transfer_init(struct pipe_context *pipe,
    return tx;
 }
 
+
 void
-util_staging_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx)
+util_staging_transfer_destroy(struct pipe_context *pipe,
+                              struct pipe_transfer *ptx)
 {
    struct util_staging_transfer *tx = (struct util_staging_transfer *)ptx;
 
-   if (tx->staging_resource != tx->base.resource)
-   {
-      if(tx->base.usage & PIPE_TRANSFER_WRITE) {
+   if (tx->staging_resource != tx->base.resource) {
+      if (tx->base.usage & PIPE_TRANSFER_WRITE) {
          /* XXX this looks wrong src is always the same but looping over dst z? */
          int zi;
          struct pipe_box sbox;
@@ -119,8 +121,10 @@ util_staging_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *p
          sbox.width = tx->base.box.width;
          sbox.height = tx->base.box.height;
          sbox.depth = 1;
-         for(zi = 0; zi < tx->base.box.depth; ++zi)
-            pipe->resource_copy_region(pipe, tx->base.resource, tx->base.level, tx->base.box.x, tx->base.box.y, tx->base.box.z + zi,
+         for (zi = 0; zi < tx->base.box.depth; ++zi)
+            pipe->resource_copy_region(pipe, tx->base.resource, tx->base.level,
+                                       tx->base.box.x, tx->base.box.y,
+                                       tx->base.box.z + zi,
                                        tx->staging_resource, 0, &sbox);
       }
 
diff --git a/src/gallium/auxiliary/util/u_staging.h b/src/gallium/auxiliary/util/u_staging.h
index ddbb33443e4..6c468aad161 100644
--- a/src/gallium/auxiliary/util/u_staging.h
+++ b/src/gallium/auxiliary/util/u_staging.h
@@ -42,22 +42,26 @@
 struct util_staging_transfer {
    struct pipe_transfer base;
 
-   /* if direct, same as base.resource, otherwise the temporary staging resource */
+   /* if direct, same as base.resource, otherwise the temporary staging
+    * resource
+    */
    struct pipe_resource *staging_resource;
 };
 
-/* user must be stride, slice_stride and offset */
-/* pt->usage == PIPE_USAGE_DYNAMIC || pt->usage == PIPE_USAGE_STAGING should be a good value to pass for direct */
-/* staging resource is currently created with PIPE_USAGE_STAGING */
+/* user must set stride, slice_stride and offset.
+ * pt->usage == PIPE_USAGE_DYNAMIC || pt->usage == PIPE_USAGE_STAGING
+ * should be a good value to pass for direct.
+ * The staging resource is currently created with PIPE_USAGE_STAGING.
+ */
 struct util_staging_transfer *
 util_staging_transfer_init(struct pipe_context *pipe,
-           struct pipe_resource *pt,
-           unsigned level,
-           unsigned usage,
-           const struct pipe_box *box,
-           boolean direct, struct util_staging_transfer *tx);
+                           struct pipe_resource *pt,
+                           unsigned level, unsigned usage,
+                           const struct pipe_box *box,
+                           boolean direct, struct util_staging_transfer *tx);
 
 void
-util_staging_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx);
+util_staging_transfer_destroy(struct pipe_context *pipe,
+                              struct pipe_transfer *ptx);
 
 #endif
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 6eb6a2d52ef..f38dc8643b4 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1004,7 +1004,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 	nir_const_value *const_offset;
 	/* UBO addresses are the first driver params: */
 	unsigned ubo = regid(ctx->so->first_driver_param + IR3_UBOS_OFF, 0);
-	int off = intr->const_index[0];
+	int off = 0;
 
 	/* First src is ubo index, which could either be an immed or not: */
 	src0 = get_src(ctx, &intr->src[0])[0];
@@ -1092,7 +1092,7 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
 	struct ir3_array *arr = get_var(ctx, dvar->var);
 	struct ir3_instruction *addr, **src;
-	unsigned wrmask = intr->const_index[0];
+	unsigned wrmask = nir_intrinsic_write_mask(intr);
 
 	compile_assert(ctx, dvar->deref.child &&
 		(dvar->deref.child->deref_type == nir_deref_type_array));
@@ -1145,8 +1145,8 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 	const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
 	struct ir3_instruction **dst, **src;
 	struct ir3_block *b = ctx->block;
-	int idx = intr->const_index[0];
 	nir_const_value *const_offset;
+	int idx;
 
 	if (info->has_dest) {
 		dst = get_dst(ctx, &intr->dest, intr->num_components);
@@ -1156,6 +1156,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 
 	switch (intr->intrinsic) {
 	case nir_intrinsic_load_uniform:
+		idx = nir_intrinsic_base(intr);
 		const_offset = nir_src_as_const_value(intr->src[0]);
 		if (const_offset) {
 			idx += const_offset->u[0];
@@ -1182,6 +1183,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		emit_intrinsic_load_ubo(ctx, intr, dst);
 		break;
 	case nir_intrinsic_load_input:
+		idx = nir_intrinsic_base(intr);
 		const_offset = nir_src_as_const_value(intr->src[0]);
 		if (const_offset) {
 			idx += const_offset->u[0];
@@ -1208,6 +1210,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		emit_intrinsic_store_var(ctx, intr);
 		break;
 	case nir_intrinsic_store_output:
+		idx = nir_intrinsic_base(intr);
 		const_offset = nir_src_as_const_value(intr->src[1]);
 		compile_assert(ctx, const_offset != NULL);
 		idx += const_offset->u[0];
@@ -1243,6 +1246,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		dst[0] = ctx->instance_id;
 		break;
 	case nir_intrinsic_load_user_clip_plane:
+		idx = nir_intrinsic_ucp_id(intr);
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = idx * 4 + i;
 			dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
diff --git a/src/gallium/drivers/ilo/ilo_draw.c b/src/gallium/drivers/ilo/ilo_draw.c
index 69f36ae5df6..6831d2c4eff 100644
--- a/src/gallium/drivers/ilo/ilo_draw.c
+++ b/src/gallium/drivers/ilo/ilo_draw.c
@@ -71,6 +71,7 @@ query_process_bo(const struct ilo_context *ilo, struct ilo_query *q)
 
    switch (q->type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
    case PIPE_QUERY_TIME_ELAPSED:
    case PIPE_QUERY_PRIMITIVES_GENERATED:
    case PIPE_QUERY_PRIMITIVES_EMITTED:
@@ -157,6 +158,7 @@ ilo_init_draw_query(struct ilo_context *ilo, struct ilo_query *q)
 
    switch (q->type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
    case PIPE_QUERY_TIME_ELAPSED:
    case PIPE_QUERY_PRIMITIVES_GENERATED:
    case PIPE_QUERY_PRIMITIVES_EMITTED:
diff --git a/src/gallium/drivers/ilo/ilo_query.c b/src/gallium/drivers/ilo/ilo_query.c
index 27d08128ab0..106bd42a335 100644
--- a/src/gallium/drivers/ilo/ilo_query.c
+++ b/src/gallium/drivers/ilo/ilo_query.c
@@ -47,7 +47,7 @@ static const struct {
 #define INFOX(prefix) { NULL, NULL, NULL, NULL, }
 
    [PIPE_QUERY_OCCLUSION_COUNTER]      = INFO(draw),
-   [PIPE_QUERY_OCCLUSION_PREDICATE]    = INFOX(draw),
+   [PIPE_QUERY_OCCLUSION_PREDICATE]    = INFO(draw),
    [PIPE_QUERY_TIMESTAMP]              = INFO(draw),
    [PIPE_QUERY_TIMESTAMP_DISJOINT]     = INFOX(draw),
    [PIPE_QUERY_TIME_ELAPSED]           = INFO(draw),
@@ -75,6 +75,7 @@ ilo_create_query(struct pipe_context *pipe, unsigned query_type, unsigned index)
 
    switch (query_type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
    case PIPE_QUERY_TIMESTAMP:
    case PIPE_QUERY_TIME_ELAPSED:
    case PIPE_QUERY_PRIMITIVES_GENERATED:
@@ -163,6 +164,12 @@ query_serialize(const struct ilo_query *q, void *buf)
          dst[0] = q->result.u64;
       }
       break;
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
+      {
+         uint64_t *dst = buf;
+         dst[0] = !!q->result.u64;
+      }
+      break;
    case PIPE_QUERY_PIPELINE_STATISTICS:
       {
          const struct pipe_query_data_pipeline_statistics *stats =
diff --git a/src/gallium/drivers/ilo/ilo_render.c b/src/gallium/drivers/ilo/ilo_render.c
index 8bc04df4fab..9a47ca80505 100644
--- a/src/gallium/drivers/ilo/ilo_render.c
+++ b/src/gallium/drivers/ilo/ilo_render.c
@@ -202,6 +202,7 @@ ilo_render_get_query_len(const struct ilo_render *render,
 
    switch (query_type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
    case PIPE_QUERY_TIMESTAMP:
    case PIPE_QUERY_TIME_ELAPSED:
       /* no reg */
@@ -268,6 +269,7 @@ ilo_render_emit_query(struct ilo_render *render,
 
    switch (q->type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
       pipe_control_dw1 = GEN6_PIPE_CONTROL_DEPTH_STALL |
                          GEN6_PIPE_CONTROL_WRITE_PS_DEPTH_COUNT;
       break;
diff --git a/src/gallium/drivers/llvmpipe/lp_flush.c b/src/gallium/drivers/llvmpipe/lp_flush.c
index 268aab26c40..241c2ccafb7 100644
--- a/src/gallium/drivers/llvmpipe/lp_flush.c
+++ b/src/gallium/drivers/llvmpipe/lp_flush.c
@@ -32,6 +32,7 @@
 
 #include "pipe/p_defines.h"
 #include "pipe/p_screen.h"
+#include "util/u_debug_image.h"
 #include "util/u_string.h"
 #include "draw/draw_context.h"
 #include "lp_flush.h"
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_query.c b/src/gallium/drivers/nouveau/nv30/nv30_query.c
index 3980be9579a..75a4b0446fe 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_query.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_query.c
@@ -120,6 +120,7 @@ nv30_query_create(struct pipe_context *pipe, unsigned type, unsigned index)
       q->report = 1;
       break;
    case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
       q->enable = NV30_3D_QUERY_ENABLE;
       q->report = 1;
       break;
@@ -203,7 +204,6 @@ nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    struct nv30_query *q = nv30_query(pq);
    volatile uint32_t *ntfy0 = nv30_ntfy(screen, q->qo[0]);
    volatile uint32_t *ntfy1 = nv30_ntfy(screen, q->qo[1]);
-   uint64_t *res64 = &result->u64;
 
    if (ntfy1) {
       while (ntfy1[3] & 0xff000000) {
@@ -227,7 +227,10 @@ nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       nv30_query_object_del(screen, &q->qo[1]);
    }
 
-   *res64 = q->result;
+   if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE)
+      result->b = !!q->result;
+   else
+      result->u64 = q->result;
    return true;
 }
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
index cccd3b71672..727b509372d 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
@@ -156,6 +156,7 @@ nv50_hw_begin_query(struct nv50_context *nv50, struct nv50_query *q)
 
    switch (q->type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
       hq->nesting = nv50->screen->num_occlusion_queries_active++;
       if (hq->nesting) {
          nv50_hw_query_get(push, q, 0x10, 0x0100f002);
@@ -213,6 +214,7 @@ nv50_hw_end_query(struct nv50_context *nv50, struct nv50_query *q)
 
    switch (q->type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
       nv50_hw_query_get(push, q, 0, 0x0100f002);
       if (--nv50->screen->num_occlusion_queries_active == 0) {
          PUSH_SPACE(push, 2);
@@ -304,6 +306,9 @@ nv50_hw_get_query_result(struct nv50_context *nv50, struct nv50_query *q,
    case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
       res64[0] = hq->data[1] - hq->data[5];
       break;
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
+      res8[0] = hq->data[1] != hq->data[5];
+      break;
    case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
    case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */
       res64[0] = data64[0] - data64[2];
@@ -372,6 +377,7 @@ nv50_hw_create_query(struct nv50_context *nv50, unsigned type, unsigned index)
 
    switch (q->type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
       hq->rotate = 32;
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED:
diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index 2cf08897a8d..d92e691fdb8 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -961,8 +961,8 @@ struct pipe_resource *r600_compute_global_buffer_create(
 			templ->array_size);
 
 	result->base.b.vtbl = &r600_global_buffer_vtbl;
-	result->base.b.b.screen = screen;
 	result->base.b.b = *templ;
+	result->base.b.b.screen = screen;
 	pipe_reference_init(&result->base.b.b.reference, 1);
 
 	size_in_dw = (templ->width0+3) / 4;
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index 3d0987624a6..474154e52ff 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -55,6 +55,14 @@ enum radeon_llvm_shader_type {
 	RADEON_LLVM_SHADER_CS = 3,
 };
 
+void radeon_llvm_add_attribute(LLVMValueRef F, const char *name, int value)
+{
+	char str[16];
+
+	snprintf(str, sizeof(str), "%i", value);
+	LLVMAddTargetDependentFunctionAttr(F, name, str);
+}
+
 /**
  * Set the shader type we want to compile
  *
@@ -62,7 +70,6 @@ enum radeon_llvm_shader_type {
  */
 void radeon_llvm_shader_type(LLVMValueRef F, unsigned type)
 {
-	char Str[2];
 	enum radeon_llvm_shader_type llvm_type;
 
 	switch (type) {
@@ -84,9 +91,7 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned type)
 		assert(0);
 	}
 
-	sprintf(Str, "%1d", llvm_type);
-
-	LLVMAddTargetDependentFunctionAttr(F, "ShaderType", Str);
+	radeon_llvm_add_attribute(F, "ShaderType", llvm_type);
 }
 
 static void init_r600_target()
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.h b/src/gallium/drivers/radeon/radeon_llvm_emit.h
index 45f05a9e0e1..84dbd2584a1 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.h
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.h
@@ -34,6 +34,7 @@
 struct pipe_debug_callback;
 struct radeon_shader_binary;
 
+void radeon_llvm_add_attribute(LLVMValueRef F, const char *name, int value);
 void radeon_llvm_shader_type(LLVMValueRef F, unsigned type);
 
 LLVMTargetRef radeon_llvm_get_r600_target(const char *triple);
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 825fbb181ba..4d27e86b414 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -124,7 +124,8 @@ static void *si_create_compute_state(
                                                         code, header->num_bytes);
 			si_compile_llvm(sctx->screen, &program->kernels[i].binary,
 					&program->kernels[i].config, sctx->tm,
-					mod, &sctx->b.debug, TGSI_PROCESSOR_COMPUTE);
+					mod, &sctx->b.debug, TGSI_PROCESSOR_COMPUTE,
+					"Compute Shader");
 			si_shader_dump(sctx->screen, &program->kernels[i],
 				       &sctx->b.debug, TGSI_PROCESSOR_COMPUTE);
 			si_shader_binary_upload(sctx->screen, &program->kernels[i]);
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index d60c4515625..b5a4034cc12 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -182,7 +182,6 @@ void si_begin_new_cs(struct si_context *ctx)
 	si_mark_atom_dirty(ctx, &ctx->db_render_state);
 	si_mark_atom_dirty(ctx, &ctx->stencil_ref.atom);
 	si_mark_atom_dirty(ctx, &ctx->spi_map);
-	si_mark_atom_dirty(ctx, &ctx->spi_ps_input);
 	si_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
 	si_mark_atom_dirty(ctx, &ctx->b.render_cond_atom);
 	si_all_descriptors_begin_new_cs(ctx);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 48947442757..3c963db5078 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -202,7 +202,6 @@ struct si_context {
 	struct si_viewports		viewports;
 	struct si_stencil_ref		stencil_ref;
 	struct r600_atom		spi_map;
-	struct r600_atom		spi_ps_input;
 
 	/* Precomputed states. */
 	struct si_pm4_state		*init_config;
@@ -222,7 +221,6 @@ struct si_context {
 	struct si_vertex_element	*vertex_elements;
 	unsigned			sprite_coord_enable;
 	bool				flatshade;
-	bool				force_persample_interp;
 
 	/* shader descriptors */
 	struct si_descriptors		vertex_buffers;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index d9ed6b234e0..c1d3edc7143 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -833,14 +833,11 @@ static int lookup_interp_param_index(unsigned interpolate, unsigned location)
 }
 
 /* This shouldn't be used by explicit INTERP opcodes. */
-static LLVMValueRef get_interp_param(struct si_shader_context *si_shader_ctx,
-				     unsigned param)
+static unsigned select_interp_param(struct si_shader_context *si_shader_ctx,
+				    unsigned param)
 {
-	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
-	unsigned sample_param = 0;
-	LLVMValueRef default_ij, sample_ij, force_sample;
-
-	default_ij = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, param);
+	if (!si_shader_ctx->shader->key.ps.force_persample_interp)
+		return param;
 
 	/* If the shader doesn't use center/centroid, just return the parameter.
 	 *
@@ -850,79 +847,52 @@ static LLVMValueRef get_interp_param(struct si_shader_context *si_shader_ctx,
 	switch (param) {
 	case SI_PARAM_PERSP_CENTROID:
 	case SI_PARAM_PERSP_CENTER:
-		if (!si_shader_ctx->shader->selector->forces_persample_interp_for_persp)
-			return default_ij;
-
-		sample_param = SI_PARAM_PERSP_SAMPLE;
-		break;
+		return SI_PARAM_PERSP_SAMPLE;
 
 	case SI_PARAM_LINEAR_CENTROID:
 	case SI_PARAM_LINEAR_CENTER:
-		if (!si_shader_ctx->shader->selector->forces_persample_interp_for_linear)
-			return default_ij;
-
-		sample_param = SI_PARAM_LINEAR_SAMPLE;
-		break;
+		return SI_PARAM_LINEAR_SAMPLE;
 
 	default:
-		return default_ij;
+		return param;
 	}
-
-	/* Otherwise, we have to select (i,j) based on a user data SGPR. */
-	sample_ij = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, sample_param);
-
-	/* TODO: this can be done more efficiently by switching between
-	 * 2 prologs.
-	 */
-	force_sample = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-				    SI_PARAM_PS_STATE_BITS);
-	force_sample = LLVMBuildTrunc(gallivm->builder, force_sample,
-				      LLVMInt1TypeInContext(gallivm->context), "");
-	return LLVMBuildSelect(gallivm->builder, force_sample,
-			       sample_ij, default_ij, "");
 }
 
-static void declare_input_fs(
-	struct radeon_llvm_context *radeon_bld,
-	unsigned input_index,
-	const struct tgsi_full_declaration *decl)
+/**
+ * Interpolate a fragment shader input.
+ *
+ * @param si_shader_ctx		context
+ * @param input_index		index of the input in hardware
+ * @param semantic_name		TGSI_SEMANTIC_*
+ * @param semantic_index	semantic index
+ * @param num_interp_inputs	number of all interpolated inputs (= BCOLOR offset)
+ * @param colors_read_mask	color components read (4 bits for each color, 8 bits in total)
+ * @param interp_param		interpolation weights (i,j)
+ * @param prim_mask		SI_PARAM_PRIM_MASK
+ * @param face			SI_PARAM_FRONT_FACE
+ * @param result		the return value (4 components)
+ */
+static void interp_fs_input(struct si_shader_context *si_shader_ctx,
+			    unsigned input_index,
+			    unsigned semantic_name,
+			    unsigned semantic_index,
+			    unsigned num_interp_inputs,
+			    unsigned colors_read_mask,
+			    LLVMValueRef interp_param,
+			    LLVMValueRef prim_mask,
+			    LLVMValueRef face,
+			    LLVMValueRef result[4])
 {
-	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
-	struct si_shader_context *si_shader_ctx =
-		si_shader_context(&radeon_bld->soa.bld_base);
-	struct si_shader *shader = si_shader_ctx->shader;
-	struct lp_build_context *uint =	&radeon_bld->soa.bld_base.uint_bld;
+	struct lp_build_context *base = &si_shader_ctx->radeon_bld.soa.bld_base.base;
+	struct lp_build_context *uint =	&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
 	struct gallivm_state *gallivm = base->gallivm;
 	LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context);
-	LLVMValueRef main_fn = radeon_bld->main_fn;
-
-	LLVMValueRef interp_param = NULL;
-	int interp_param_idx;
 	const char * intr_name;
-
-	/* This value is:
-	 * [15:0] NewPrimMask (Bit mask for each quad.  It is set it the
-	 *                     quad begins a new primitive.  Bit 0 always needs
-	 *                     to be unset)
-	 * [32:16] ParamOffset
-	 *
-	 */
-	LLVMValueRef params = LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK);
 	LLVMValueRef attr_number;
 
 	unsigned chan;
 
-	shader->ps_input_param_offset[input_index] = shader->nparam++;
-	attr_number = lp_build_const_int32(gallivm,
-					   shader->ps_input_param_offset[input_index]);
-
-	shader->ps_input_interpolate[input_index] = decl->Interp.Interpolate;
-	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
-						     decl->Interp.Location);
-	if (interp_param_idx == -1)
-		return;
-	else if (interp_param_idx)
-		interp_param = get_interp_param(si_shader_ctx, interp_param_idx);
+	attr_number = lp_build_const_int32(gallivm, input_index);
 
 	/* fs.constant returns the param from the middle vertex, so it's not
 	 * really useful for flat shading. It's meant to be used for custom
@@ -936,24 +906,28 @@ static void declare_input_fs(
 	 */
 	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
 
-	if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
+	if (semantic_name == TGSI_SEMANTIC_COLOR &&
 	    si_shader_ctx->shader->key.ps.color_two_side) {
 		LLVMValueRef args[4];
-		LLVMValueRef face, is_face_positive;
-		LLVMValueRef back_attr_number =
-			lp_build_const_int32(gallivm,
-					     shader->ps_input_param_offset[input_index] + 1);
+		LLVMValueRef is_face_positive;
+		LLVMValueRef back_attr_number;
 
-		face = LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE);
+		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_interp_inputs + 1",
+		 * otherwise it's at offset "num_interp_inputs".
+		 */
+		unsigned back_attr_offset = num_interp_inputs;
+		if (semantic_index == 1 && colors_read_mask & 0xf)
+			back_attr_offset += 1;
+
+		back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);
 
 		is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
 						 face, uint->zero, "");
 
-		args[2] = params;
+		args[2] = prim_mask;
 		args[3] = interp_param;
 		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 			LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
-			unsigned soa_index = radeon_llvm_reg_index_soa(input_index, chan);
 			LLVMValueRef front, back;
 
 			args[0] = llvm_chan;
@@ -967,48 +941,71 @@ static void declare_input_fs(
 					       input_type, args, args[3] ? 4 : 3,
 					       LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 
-			radeon_bld->inputs[soa_index] =
-				LLVMBuildSelect(gallivm->builder,
+			result[chan] = LLVMBuildSelect(gallivm->builder,
 						is_face_positive,
 						front,
 						back,
 						"");
 		}
-
-		shader->nparam++;
-	} else if (decl->Semantic.Name == TGSI_SEMANTIC_FOG) {
+	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
 		LLVMValueRef args[4];
 
 		args[0] = uint->zero;
 		args[1] = attr_number;
-		args[2] = params;
+		args[2] = prim_mask;
 		args[3] = interp_param;
-		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
-			lp_build_intrinsic(gallivm->builder, intr_name,
+		result[0] = lp_build_intrinsic(gallivm->builder, intr_name,
 					input_type, args, args[3] ? 4 : 3,
 					LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
-		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
-		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
-			lp_build_const_float(gallivm, 0.0f);
-		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
-			lp_build_const_float(gallivm, 1.0f);
+		result[1] =
+		result[2] = lp_build_const_float(gallivm, 0.0f);
+		result[3] = lp_build_const_float(gallivm, 1.0f);
 	} else {
 		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 			LLVMValueRef args[4];
 			LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
-			unsigned soa_index = radeon_llvm_reg_index_soa(input_index, chan);
+
 			args[0] = llvm_chan;
 			args[1] = attr_number;
-			args[2] = params;
+			args[2] = prim_mask;
 			args[3] = interp_param;
-			radeon_bld->inputs[soa_index] =
-				lp_build_intrinsic(gallivm->builder, intr_name,
+			result[chan] = lp_build_intrinsic(gallivm->builder, intr_name,
 						input_type, args, args[3] ? 4 : 3,
 						LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 		}
 	}
 }
 
+static void declare_input_fs(
+	struct radeon_llvm_context *radeon_bld,
+	unsigned input_index,
+	const struct tgsi_full_declaration *decl)
+{
+	struct si_shader_context *si_shader_ctx =
+		si_shader_context(&radeon_bld->soa.bld_base);
+	struct si_shader *shader = si_shader_ctx->shader;
+	LLVMValueRef main_fn = radeon_bld->main_fn;
+	LLVMValueRef interp_param = NULL;
+	int interp_param_idx;
+
+	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
+						     decl->Interp.Location);
+	if (interp_param_idx == -1)
+		return;
+	else if (interp_param_idx) {
+		interp_param_idx = select_interp_param(si_shader_ctx,
+						       interp_param_idx);
+		interp_param = LLVMGetParam(main_fn, interp_param_idx);
+	}
+
+	interp_fs_input(si_shader_ctx, input_index, decl->Semantic.Name,
+			decl->Semantic.Index, shader->selector->info.num_inputs,
+			shader->selector->info.colors_read, interp_param,
+			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
+			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
+			&radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
+}
+
 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
 {
 	return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
@@ -1060,7 +1057,6 @@ static void declare_system_value(
 	struct si_shader_context *si_shader_ctx =
 		si_shader_context(&radeon_bld->soa.bld_base);
 	struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
-	struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
 	struct gallivm_state *gallivm = &radeon_bld->gallivm;
 	LLVMValueRef value = 0;
 
@@ -1136,12 +1132,10 @@ static void declare_system_value(
 	}
 
 	case TGSI_SEMANTIC_SAMPLEMASK:
-		/* Smoothing isn't MSAA in GL, but it's MSAA in hardware.
-		 * Therefore, force gl_SampleMaskIn to 1 for GL. */
-		if (si_shader_ctx->shader->key.ps.poly_line_smoothing)
-			value = uint_bld->one;
-		else
-			value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
+		/* This can only occur with the OpenGL Core profile, which
+		 * doesn't support smoothing.
+		 */
+		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
 		break;
 
 	case TGSI_SEMANTIC_TESSCOORD:
@@ -1965,21 +1959,20 @@ handle_semantic:
 	}
 }
 
-/* This only writes the tessellation factor levels. */
-static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
+static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
+				  LLVMValueRef rel_patch_id,
+				  LLVMValueRef invocation_id,
+				  LLVMValueRef tcs_out_current_patch_data_offset)
 {
 	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	struct si_shader *shader = si_shader_ctx->shader;
 	unsigned tess_inner_index, tess_outer_index;
-	LLVMValueRef lds_base, lds_inner, lds_outer;
-	LLVMValueRef tf_base, rel_patch_id, byteoffset, buffer, rw_buffers;
-	LLVMValueRef out[6], vec0, vec1, invocation_id;
+	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
+	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
 	unsigned stride, outer_comps, inner_comps, i;
 	struct lp_build_if_state if_ctx;
 
-	invocation_id = unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 8, 5);
-
 	/* Do this only for invocation 0, because the tess levels are per-patch,
 	 * not per-vertex.
 	 *
@@ -2018,7 +2011,7 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
 	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
 	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
 
-	lds_base = get_tcs_out_current_patch_data_offset(si_shader_ctx);
+	lds_base = tcs_out_current_patch_data_offset;
 	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
 				 lp_build_const_int32(gallivm,
 						      tess_inner_index * 4), "");
@@ -2047,7 +2040,6 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
 	/* Get the offset. */
 	tf_base = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
 			       SI_PARAM_TESS_FACTOR_OFFSET);
-	rel_patch_id = get_rel_patch_id(si_shader_ctx);
 	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
 				  lp_build_const_int32(gallivm, 4 * stride), "");
 
@@ -2060,6 +2052,20 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
 	lp_build_endif(&if_ctx);
 }
 
+/* This only writes the tessellation factor levels. */
+static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	LLVMValueRef invocation_id;
+
+	invocation_id = unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 8, 5);
+
+	si_write_tess_factors(bld_base,
+			      get_rel_patch_id(si_shader_ctx),
+			      invocation_id,
+			      get_tcs_out_current_patch_data_offset(si_shader_ctx));
+}
+
 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context * bld_base)
 {
 	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
@@ -3253,17 +3259,17 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
 	LLVMValueRef interp_param;
 	const struct tgsi_full_instruction *inst = emit_data->inst;
 	const char *intr_name;
-	int input_index;
+	int input_index = inst->Src[0].Register.Index;
 	int chan;
 	int i;
 	LLVMValueRef attr_number;
 	LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context);
 	LLVMValueRef params = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
 	int interp_param_idx;
+	unsigned interp = shader->selector->info.input_interpolate[input_index];
 	unsigned location;
 
 	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
-	input_index = inst->Src[0].Register.Index;
 
 	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
 	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
@@ -3271,8 +3277,7 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
 	else
 		location = TGSI_INTERPOLATE_LOC_CENTROID;
 
-	interp_param_idx = lookup_interp_param_index(shader->ps_input_interpolate[input_index],
-						     location);
+	interp_param_idx = lookup_interp_param_index(interp, location);
 	if (interp_param_idx == -1)
 		return;
 	else if (interp_param_idx)
@@ -3280,8 +3285,7 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
 	else
 		interp_param = NULL;
 
-	attr_number = lp_build_const_int32(gallivm,
-					   shader->ps_input_param_offset[input_index]);
+	attr_number = lp_build_const_int32(gallivm, input_index);
 
 	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
 	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
@@ -3632,7 +3636,6 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 
 	case TGSI_PROCESSOR_FRAGMENT:
 		params[SI_PARAM_ALPHA_REF] = f32;
-		params[SI_PARAM_PS_STATE_BITS] = i32;
 		params[SI_PARAM_PRIM_MASK] = i32;
 		last_sgpr = SI_PARAM_PRIM_MASK;
 		params[SI_PARAM_PERSP_SAMPLE] = v2i32;
@@ -3663,10 +3666,6 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 	radeon_llvm_create_func(&si_shader_ctx->radeon_bld, params, num_params);
 	radeon_llvm_shader_type(si_shader_ctx->radeon_bld.main_fn, si_shader_ctx->type);
 
-	if (shader->dx10_clamp_mode)
-		LLVMAddTargetDependentFunctionAttr(si_shader_ctx->radeon_bld.main_fn,
-						   "enable-no-nans-fp-math", "true");
-
 	for (i = 0; i <= last_sgpr; ++i) {
 		LLVMValueRef P = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, i);
 
@@ -3884,7 +3883,7 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary,
 			conf->spi_ps_input_ena = value;
 			break;
 		case R_0286D0_SPI_PS_INPUT_ADDR:
-			/* Not used yet, but will be in the future */
+			conf->spi_ps_input_addr = value;
 			break;
 		case R_0286E8_SPI_TMPRING_SIZE:
 		case R_00B860_COMPUTE_TMPRING_SIZE:
@@ -3904,6 +3903,9 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary,
 			}
 			break;
 		}
+
+		if (!conf->spi_ps_input_addr)
+			conf->spi_ps_input_addr = conf->spi_ps_input_ena;
 	}
 }
 
@@ -4045,6 +4047,13 @@ static void si_shader_dump_stats(struct si_screen *sscreen,
 		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
 
 	if (r600_can_dump_shader(&sscreen->b, processor)) {
+		if (processor == TGSI_PROCESSOR_FRAGMENT) {
+			fprintf(stderr, "*** SHADER CONFIG ***\n"
+				"SPI_PS_INPUT_ADDR = 0x%04x\n"
+				"SPI_PS_INPUT_ENA  = 0x%04x\n",
+				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
+		}
+
 		fprintf(stderr, "*** SHADER STATS ***\n"
 			"SGPRS: %d\n"
 			"VGPRS: %d\n"
@@ -4084,7 +4093,8 @@ int si_compile_llvm(struct si_screen *sscreen,
 		    LLVMTargetMachineRef tm,
 		    LLVMModuleRef mod,
 		    struct pipe_debug_callback *debug,
-		    unsigned processor)
+		    unsigned processor,
+		    const char *name)
 {
 	int r = 0;
 	unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
@@ -4092,8 +4102,11 @@ int si_compile_llvm(struct si_screen *sscreen,
 	if (r600_can_dump_shader(&sscreen->b, processor)) {
 		fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
 
-		if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR)))
+		if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
+			fprintf(stderr, "%s LLVM IR:\n\n", name);
 			LLVMDumpModule(mod);
+			fprintf(stderr, "\n");
+		}
 	}
 
 	if (!si_replace_shader(count, binary)) {
@@ -4106,6 +4119,20 @@ int si_compile_llvm(struct si_screen *sscreen,
 
 	si_shader_binary_read_config(binary, conf, 0);
 
+	/* Enable 64-bit and 16-bit denormals, because there is no performance
+	 * cost.
+	 *
+	 * If denormals are enabled, all floating-point output modifiers are
+	 * ignored.
+	 *
+	 * Don't enable denormals for 32-bit floats, because:
+	 * - Floating-point output modifiers would be ignored by the hw.
+	 * - Some opcodes don't support denormals, such as v_mad_f32. We would
+	 *   have to stop using those.
+	 * - SI & CI would be very slow.
+	 */
+	conf->float_mode |= V_00B028_FP_64_DENORMS;
+
 	FREE(binary->config);
 	FREE(binary->global_symbol_offsets);
 	binary->config = NULL;
@@ -4116,7 +4143,7 @@ int si_compile_llvm(struct si_screen *sscreen,
 /* Generate code for the hardware VS shader stage to go with a geometry shader */
 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 				      struct si_shader_context *si_shader_ctx,
-				      struct si_shader *gs, bool dump,
+				      struct si_shader *gs,
 				      struct pipe_debug_callback *debug)
 {
 	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
@@ -4186,14 +4213,14 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 
 	radeon_llvm_finalize_module(&si_shader_ctx->radeon_bld);
 
-	if (dump)
-		fprintf(stderr, "Copy Vertex Shader for Geometry Shader:\n\n");
-
 	r = si_compile_llvm(sscreen, &si_shader_ctx->shader->binary,
 			    &si_shader_ctx->shader->config, si_shader_ctx->tm,
 			    bld_base->base.gallivm->module,
-			    debug, TGSI_PROCESSOR_GEOMETRY);
+			    debug, TGSI_PROCESSOR_GEOMETRY,
+			    "GS Copy Shader");
 	if (!r) {
+		if (r600_can_dump_shader(&sscreen->b, TGSI_PROCESSOR_GEOMETRY))
+			fprintf(stderr, "GS Copy Shader:\n");
 		si_shader_dump(sscreen, si_shader_ctx->shader, debug,
 			       TGSI_PROCESSOR_GEOMETRY);
 		r = si_shader_binary_upload(sscreen, si_shader_ctx->shader);
@@ -4250,47 +4277,26 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 	}
 }
 
-int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
-		     struct si_shader *shader,
-		     struct pipe_debug_callback *debug)
+static void si_init_shader_ctx(struct si_shader_context *ctx,
+			       struct si_screen *sscreen,
+			       struct si_shader *shader,
+			       LLVMTargetMachineRef tm,
+			       struct tgsi_shader_info *info)
 {
-	struct si_shader_selector *sel = shader->selector;
-	struct tgsi_token *tokens = sel->tokens;
-	struct si_shader_context si_shader_ctx;
-	struct lp_build_tgsi_context * bld_base;
-	struct tgsi_shader_info stipple_shader_info;
-	LLVMModuleRef mod;
-	int r = 0;
-	bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT &&
-			    shader->key.ps.poly_stipple;
-	bool dump = r600_can_dump_shader(&sscreen->b, sel->info.processor);
-
-	if (poly_stipple) {
-		tokens = util_pstipple_create_fragment_shader(tokens, NULL,
-						SI_POLY_STIPPLE_SAMPLER,
-						TGSI_FILE_SYSTEM_VALUE);
-		tgsi_scan_shader(tokens, &stipple_shader_info);
-	}
-
-	/* Dump TGSI code before doing TGSI->LLVM conversion in case the
-	 * conversion fails. */
-	if (dump && !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
-		si_dump_shader_key(sel->type, &shader->key, stderr);
-		tgsi_dump(tokens, 0);
-		si_dump_streamout(&sel->so);
-	}
-
-	assert(shader->nparam == 0);
-
-	memset(&si_shader_ctx, 0, sizeof(si_shader_ctx));
-	radeon_llvm_context_init(&si_shader_ctx.radeon_bld);
-	bld_base = &si_shader_ctx.radeon_bld.soa.bld_base;
-
-	if (sel->type != PIPE_SHADER_COMPUTE)
-		shader->dx10_clamp_mode = true;
+	struct lp_build_tgsi_context *bld_base;
+
+	memset(ctx, 0, sizeof(*ctx));
+	radeon_llvm_context_init(&ctx->radeon_bld);
+	ctx->tm = tm;
+	ctx->screen = sscreen;
+	if (shader && shader->selector)
+		ctx->type = shader->selector->info.processor;
+	else
+		ctx->type = -1;
+	ctx->shader = shader;
 
-	shader->uses_instanceid = sel->info.uses_instanceid;
-	bld_base->info = poly_stipple ? &stipple_shader_info : &sel->info;
+	bld_base = &ctx->radeon_bld.soa.bld_base;
+	bld_base->info = info;
 	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
 
 	bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
@@ -4326,12 +4332,45 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
 		bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
 	}
+}
+
+int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
+		     struct si_shader *shader,
+		     struct pipe_debug_callback *debug)
+{
+	struct si_shader_selector *sel = shader->selector;
+	struct tgsi_token *tokens = sel->tokens;
+	struct si_shader_context si_shader_ctx;
+	struct lp_build_tgsi_context * bld_base;
+	struct tgsi_shader_info stipple_shader_info;
+	LLVMModuleRef mod;
+	int r = 0;
+	bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT &&
+			    shader->key.ps.poly_stipple;
+
+	if (poly_stipple) {
+		tokens = util_pstipple_create_fragment_shader(tokens, NULL,
+						SI_POLY_STIPPLE_SAMPLER,
+						TGSI_FILE_SYSTEM_VALUE);
+		tgsi_scan_shader(tokens, &stipple_shader_info);
+	}
 
+	/* Dump TGSI code before doing TGSI->LLVM conversion in case the
+	 * conversion fails. */
+	if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
+	    !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
+		si_dump_shader_key(sel->type, &shader->key, stderr);
+		tgsi_dump(tokens, 0);
+		si_dump_streamout(&sel->so);
+	}
+
+	si_init_shader_ctx(&si_shader_ctx, sscreen, shader, tm,
+			   poly_stipple ? &stipple_shader_info : &sel->info);
+
+	shader->uses_instanceid = sel->info.uses_instanceid;
+
+	bld_base = &si_shader_ctx.radeon_bld.soa.bld_base;
 	si_shader_ctx.radeon_bld.load_system_value = declare_system_value;
-	si_shader_ctx.shader = shader;
-	si_shader_ctx.type = tgsi_get_processor_type(tokens);
-	si_shader_ctx.screen = sscreen;
-	si_shader_ctx.tm = tm;
 
 	switch (si_shader_ctx.type) {
 	case TGSI_PROCESSOR_VERTEX:
@@ -4401,7 +4440,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	radeon_llvm_finalize_module(&si_shader_ctx.radeon_bld);
 
 	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
-			    mod, debug, si_shader_ctx.type);
+			    mod, debug, si_shader_ctx.type, "TGSI shader");
 	if (r) {
 		fprintf(stderr, "LLVM failed to compile shader\n");
 		goto out;
@@ -4422,7 +4461,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		shader->gs_copy_shader->selector = shader->selector;
 		si_shader_ctx.shader = shader->gs_copy_shader;
 		if ((r = si_generate_gs_copy_shader(sscreen, &si_shader_ctx,
-						    shader, dump, debug))) {
+						    shader, debug))) {
 			free(shader->gs_copy_shader);
 			shader->gs_copy_shader = NULL;
 			goto out;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 98bdb890a45..c42c51e0455 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -88,7 +88,6 @@ struct radeon_shader_reloc;
 #define SI_SGPR_TCS_OUT_LAYOUT	9  /* TCS & TES only */
 #define SI_SGPR_TCS_IN_LAYOUT	10 /* TCS only */
 #define SI_SGPR_ALPHA_REF	8  /* PS only */
-#define SI_SGPR_PS_STATE_BITS	9  /* PS only */
 
 #define SI_VS_NUM_USER_SGPR	13 /* API VS */
 #define SI_ES_NUM_USER_SGPR	12 /* API VS */
@@ -97,7 +96,7 @@ struct radeon_shader_reloc;
 #define SI_TES_NUM_USER_SGPR	10
 #define SI_GS_NUM_USER_SGPR	8
 #define SI_GSCOPY_NUM_USER_SGPR	4
-#define SI_PS_NUM_USER_SGPR	10
+#define SI_PS_NUM_USER_SGPR	9
 
 /* LLVM function parameter indices */
 #define SI_PARAM_RW_BUFFERS	0
@@ -152,27 +151,23 @@ struct radeon_shader_reloc;
 
 /* PS only parameters */
 #define SI_PARAM_ALPHA_REF		4
-/* Bits:
- * 0: force_persample_interp
- */
-#define SI_PARAM_PS_STATE_BITS		5
-#define SI_PARAM_PRIM_MASK		6
-#define SI_PARAM_PERSP_SAMPLE		7
-#define SI_PARAM_PERSP_CENTER		8
-#define SI_PARAM_PERSP_CENTROID		9
-#define SI_PARAM_PERSP_PULL_MODEL	10
-#define SI_PARAM_LINEAR_SAMPLE		11
-#define SI_PARAM_LINEAR_CENTER		12
-#define SI_PARAM_LINEAR_CENTROID	13
-#define SI_PARAM_LINE_STIPPLE_TEX	14
-#define SI_PARAM_POS_X_FLOAT		15
-#define SI_PARAM_POS_Y_FLOAT		16
-#define SI_PARAM_POS_Z_FLOAT		17
-#define SI_PARAM_POS_W_FLOAT		18
-#define SI_PARAM_FRONT_FACE		19
-#define SI_PARAM_ANCILLARY		20
-#define SI_PARAM_SAMPLE_COVERAGE	21
-#define SI_PARAM_POS_FIXED_PT		22
+#define SI_PARAM_PRIM_MASK		5
+#define SI_PARAM_PERSP_SAMPLE		6
+#define SI_PARAM_PERSP_CENTER		7
+#define SI_PARAM_PERSP_CENTROID		8
+#define SI_PARAM_PERSP_PULL_MODEL	9
+#define SI_PARAM_LINEAR_SAMPLE		10
+#define SI_PARAM_LINEAR_CENTER		11
+#define SI_PARAM_LINEAR_CENTROID	12
+#define SI_PARAM_LINE_STIPPLE_TEX	13
+#define SI_PARAM_POS_X_FLOAT		14
+#define SI_PARAM_POS_Y_FLOAT		15
+#define SI_PARAM_POS_Z_FLOAT		16
+#define SI_PARAM_POS_W_FLOAT		17
+#define SI_PARAM_FRONT_FACE		18
+#define SI_PARAM_ANCILLARY		19
+#define SI_PARAM_SAMPLE_COVERAGE	20
+#define SI_PARAM_POS_FIXED_PT		21
 
 #define SI_NUM_PARAMS (SI_PARAM_POS_FIXED_PT + 1)
 
@@ -193,14 +188,6 @@ struct si_shader_selector {
 	/* PIPE_SHADER_[VERTEX|FRAGMENT|...] */
 	unsigned	type;
 
-	/* Whether the shader has to use a conditional assignment to
-	 * choose between weights when emulating
-	 * pipe_rasterizer_state::force_persample_interp.
-	 * If false, "si_emit_spi_ps_input" will take care of it instead.
-	 */
-	bool		forces_persample_interp_for_persp;
-	bool		forces_persample_interp_for_linear;
-
 	/* GS parameters. */
 	unsigned	esgs_itemsize;
 	unsigned	gs_input_verts_per_prim;
@@ -245,6 +232,7 @@ union si_shader_key {
 		unsigned	poly_stipple:1;
 		unsigned	poly_line_smoothing:1;
 		unsigned	clamp_color:1;
+		unsigned	force_persample_interp:1;
 	} ps;
 	struct {
 		unsigned	instance_divisors[SI_NUM_VERTEX_BUFFERS];
@@ -272,6 +260,7 @@ struct si_shader_config {
 	unsigned			num_vgprs;
 	unsigned			lds_size;
 	unsigned			spi_ps_input_ena;
+	unsigned			spi_ps_input_addr;
 	unsigned			float_mode;
 	unsigned			scratch_bytes_per_wave;
 	unsigned			rsrc1;
@@ -290,14 +279,10 @@ struct si_shader {
 	struct radeon_shader_binary	binary;
 	struct si_shader_config		config;
 
-	unsigned		nparam;
 	unsigned		vs_output_param_offset[PIPE_MAX_SHADER_OUTPUTS];
-	unsigned		ps_input_param_offset[PIPE_MAX_SHADER_INPUTS];
-	unsigned		ps_input_interpolate[PIPE_MAX_SHADER_INPUTS];
 	bool			uses_instanceid;
 	unsigned		nr_pos_exports;
 	unsigned		nr_param_exports;
-	bool			dx10_clamp_mode; /* convert NaNs to 0 */
 };
 
 static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
@@ -343,7 +328,8 @@ int si_compile_llvm(struct si_screen *sscreen,
 		    LLVMTargetMachineRef tm,
 		    LLVMModuleRef mod,
 		    struct pipe_debug_callback *debug,
-		    unsigned processor);
+		    unsigned processor,
+		    const char *name);
 void si_shader_destroy(struct si_shader *shader);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 507f45938ce..e9a017534d1 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -133,7 +133,6 @@ union si_state_atoms {
 		struct r600_atom *viewports;
 		struct r600_atom *stencil_ref;
 		struct r600_atom *spi_map;
-		struct r600_atom *spi_ps_input;
 	} s;
 	struct r600_atom *array[0];
 };
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index bbef429edc5..77a4e47c809 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -124,7 +124,8 @@ static void si_shader_ls(struct si_shader *shader)
 	shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) |
 			   S_00B528_SGPRS((num_sgprs - 1) / 8) |
 		           S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt) |
-			   S_00B528_DX10_CLAMP(shader->dx10_clamp_mode);
+			   S_00B528_DX10_CLAMP(1) |
+			   S_00B528_FLOAT_MODE(shader->config.float_mode);
 	shader->config.rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
 			   S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
 }
@@ -157,7 +158,8 @@ static void si_shader_hs(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
 		       S_00B428_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B428_SGPRS((num_sgprs - 1) / 8) |
-		       S_00B428_DX10_CLAMP(shader->dx10_clamp_mode));
+		       S_00B428_DX10_CLAMP(1) |
+		       S_00B428_FLOAT_MODE(shader->config.float_mode));
 	si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
 		       S_00B42C_USER_SGPR(num_user_sgprs) |
 		       S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
@@ -203,7 +205,8 @@ static void si_shader_es(struct si_shader *shader)
 		       S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B328_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
-		       S_00B328_DX10_CLAMP(shader->dx10_clamp_mode));
+		       S_00B328_DX10_CLAMP(1) |
+		       S_00B328_FLOAT_MODE(shader->config.float_mode));
 	si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
 		       S_00B32C_USER_SGPR(num_user_sgprs) |
 		       S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
@@ -292,7 +295,8 @@ static void si_shader_gs(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
 		       S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B228_SGPRS((num_sgprs - 1) / 8) |
-		       S_00B228_DX10_CLAMP(shader->dx10_clamp_mode));
+		       S_00B228_DX10_CLAMP(1) |
+		       S_00B228_FLOAT_MODE(shader->config.float_mode));
 	si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
 		       S_00B22C_USER_SGPR(num_user_sgprs) |
 		       S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
@@ -381,7 +385,8 @@ static void si_shader_vs(struct si_shader *shader, struct si_shader *gs)
 		       S_00B128_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B128_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) |
-		       S_00B128_DX10_CLAMP(shader->dx10_clamp_mode));
+		       S_00B128_DX10_CLAMP(1) |
+		       S_00B128_FLOAT_MODE(shader->config.float_mode));
 	si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS,
 		       S_00B12C_USER_SGPR(num_user_sgprs) |
 		       S_00B12C_SO_BASE0_EN(!!shader->selector->so.stride[0]) |
@@ -404,6 +409,18 @@ static void si_shader_vs(struct si_shader *shader, struct si_shader *gs)
 		si_set_tesseval_regs(shader, pm4);
 }
 
+static unsigned si_get_ps_num_interp(struct si_shader *ps)
+{
+	struct tgsi_shader_info *info = &ps->selector->info;
+	unsigned num_colors = !!(info->colors_read & 0x0f) +
+			      !!(info->colors_read & 0xf0);
+	unsigned num_interp = ps->selector->info.num_inputs +
+			      (ps->key.ps.color_two_side ? num_colors : 0);
+
+	assert(num_interp <= 32);
+	return MIN2(num_interp, 32);
+}
+
 static unsigned si_get_spi_shader_col_format(struct si_shader *shader)
 {
 	unsigned value = shader->key.ps.spi_shader_col_format;
@@ -460,6 +477,17 @@ static void si_shader_ps(struct si_shader *shader)
 	unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
 	uint64_t va;
 	bool has_centroid;
+	unsigned input_ena = shader->config.spi_ps_input_ena;
+
+	/* At least one of these inputs must be enabled, otherwise the GPU hangs. */
+	assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) ||
+	       G_0286CC_PERSP_CENTER_ENA(input_ena) ||
+	       G_0286CC_PERSP_CENTROID_ENA(input_ena) ||
+	       G_0286CC_PERSP_PULL_MODEL_ENA(input_ena) ||
+	       G_0286CC_LINEAR_SAMPLE_ENA(input_ena) ||
+	       G_0286CC_LINEAR_CENTER_ENA(input_ena) ||
+	       G_0286CC_LINEAR_CENTROID_ENA(input_ena) ||
+	       G_0286CC_LINE_STIPPLE_TEX_ENA(input_ena));
 
 	pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
 
@@ -503,11 +531,15 @@ static void si_shader_ps(struct si_shader *shader)
 	     shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS))
 		spi_shader_col_format = V_028714_SPI_SHADER_32_R;
 
+	si_pm4_set_reg(pm4, R_0286CC_SPI_PS_INPUT_ENA, input_ena);
+	si_pm4_set_reg(pm4, R_0286D0_SPI_PS_INPUT_ADDR,
+		       shader->config.spi_ps_input_addr);
+
 	/* Set interpolation controls. */
 	has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena) ||
 		       G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena);
 
-	spi_ps_in_control = S_0286D8_NUM_INTERP(shader->nparam) |
+	spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) |
 			    S_0286D8_BC_OPTIMIZE_DISABLE(has_centroid);
 
 	/* Set registers. */
@@ -540,7 +572,8 @@ static void si_shader_ps(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS,
 		       S_00B028_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B028_SGPRS((num_sgprs - 1) / 8) |
-		       S_00B028_DX10_CLAMP(shader->dx10_clamp_mode));
+		       S_00B028_DX10_CLAMP(1) |
+		       S_00B028_FLOAT_MODE(shader->config.float_mode));
 	si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
 		       S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) |
 		       S_00B02C_USER_SGPR(num_user_sgprs) |
@@ -681,7 +714,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 				       sctx->current_rast_prim >= PIPE_PRIM_TRIANGLES_ADJACENCY;
 			bool is_line = !is_poly && sctx->current_rast_prim != PIPE_PRIM_POINTS;
 
-			key->ps.color_two_side = rs->two_side;
+			key->ps.color_two_side = rs->two_side && sel->info.colors_read;
 
 			if (sctx->queued.named.blend) {
 				key->ps.alpha_to_one = sctx->queued.named.blend->alpha_to_one &&
@@ -694,6 +727,15 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 						       (is_line && rs->line_smooth)) &&
 						      sctx->framebuffer.nr_samples <= 1;
 			key->ps.clamp_color = rs->clamp_fragment_color;
+
+			key->ps.force_persample_interp = rs->force_persample_interp &&
+							 rs->multisample_enable &&
+							 sctx->framebuffer.nr_samples > 1 &&
+							 sctx->ps_iter_samples > 1 &&
+							 (sel->info.uses_persp_center ||
+							  sel->info.uses_persp_centroid ||
+							  sel->info.uses_linear_center ||
+							  sel->info.uses_linear_centroid);
 		}
 
 		key->ps.alpha_func = si_get_alpha_test_func(sctx);
@@ -796,7 +838,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 	sel->type = util_pipe_shader_from_tgsi_processor(sel->info.processor);
 	p_atomic_inc(&sscreen->b.num_shaders_created);
 
-	/* First set which opcode uses which (i,j) pair. */
+	/* Set which opcode uses which (i,j) pair. */
 	if (sel->info.uses_persp_opcode_interp_centroid)
 		sel->info.uses_persp_centroid = true;
 
@@ -811,19 +853,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 	    sel->info.uses_linear_opcode_interp_sample)
 		sel->info.uses_linear_center = true;
 
-	/* Determine if the shader has to use a conditional assignment when
-	 * emulating force_persample_interp.
-	 */
-	sel->forces_persample_interp_for_persp =
-		sel->info.uses_persp_center +
-		sel->info.uses_persp_centroid +
-		sel->info.uses_persp_sample >= 2;
-
-	sel->forces_persample_interp_for_linear =
-		sel->info.uses_linear_center +
-		sel->info.uses_linear_centroid +
-		sel->info.uses_linear_sample >= 2;
-
 	switch (sel->type) {
 	case PIPE_SHADER_GEOMETRY:
 		sel->gs_output_prim =
@@ -893,7 +922,8 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 	}
 
 	/* Pre-compilation. */
-	if (sscreen->b.debug_flags & DBG_PRECOMPILE) {
+	if (sel->type == PIPE_SHADER_GEOMETRY ||
+	    sscreen->b.debug_flags & DBG_PRECOMPILE) {
 		struct si_shader_ctx_state state = {sel};
 		union si_shader_key key;
 
@@ -1030,6 +1060,41 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 	si_mark_atom_dirty(sctx, &sctx->cb_render_state);
 }
 
+static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
+{
+	if (shader->pm4) { /* nothing to delete if no PM4 state is attached */
+		switch (shader->selector->type) {
+		case PIPE_SHADER_VERTEX: /* state slot depends on the VS key (LS/ES/VS) */
+			if (shader->key.vs.as_ls)
+				si_pm4_delete_state(sctx, ls, shader->pm4);
+			else if (shader->key.vs.as_es)
+				si_pm4_delete_state(sctx, es, shader->pm4);
+			else
+				si_pm4_delete_state(sctx, vs, shader->pm4);
+			break;
+		case PIPE_SHADER_TESS_CTRL:
+			si_pm4_delete_state(sctx, hs, shader->pm4);
+			break;
+		case PIPE_SHADER_TESS_EVAL: /* ES or VS slot, per the TES key */
+			if (shader->key.tes.as_es)
+				si_pm4_delete_state(sctx, es, shader->pm4);
+			else
+				si_pm4_delete_state(sctx, vs, shader->pm4);
+			break;
+		case PIPE_SHADER_GEOMETRY: /* also deletes the GS copy shader's VS state */
+			si_pm4_delete_state(sctx, gs, shader->pm4);
+			si_pm4_delete_state(sctx, vs, shader->gs_copy_shader->pm4);
+			break;
+		case PIPE_SHADER_FRAGMENT:
+			si_pm4_delete_state(sctx, ps, shader->pm4);
+			break;
+		}
+	}
+	/* Destroy and free the shader whether or not it had PM4 state. */
+	si_shader_destroy(shader);
+	free(shader);
+}
+
 static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
@@ -1050,35 +1115,7 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 
 	while (p) {
 		c = p->next_variant;
-		switch (sel->type) {
-		case PIPE_SHADER_VERTEX:
-			if (p->key.vs.as_ls)
-				si_pm4_delete_state(sctx, ls, p->pm4);
-			else if (p->key.vs.as_es)
-				si_pm4_delete_state(sctx, es, p->pm4);
-			else
-				si_pm4_delete_state(sctx, vs, p->pm4);
-			break;
-		case PIPE_SHADER_TESS_CTRL:
-			si_pm4_delete_state(sctx, hs, p->pm4);
-			break;
-		case PIPE_SHADER_TESS_EVAL:
-			if (p->key.tes.as_es)
-				si_pm4_delete_state(sctx, es, p->pm4);
-			else
-				si_pm4_delete_state(sctx, vs, p->pm4);
-			break;
-		case PIPE_SHADER_GEOMETRY:
-			si_pm4_delete_state(sctx, gs, p->pm4);
-			si_pm4_delete_state(sctx, vs, p->gs_copy_shader->pm4);
-			break;
-		case PIPE_SHADER_FRAGMENT:
-			si_pm4_delete_state(sctx, ps, p->pm4);
-			break;
-		}
-
-		si_shader_destroy(p);
-		free(p);
+		si_delete_shader(sctx, p);
 		p = c;
 	}
 
@@ -1087,132 +1124,86 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 	free(sel);
 }
 
+static unsigned si_get_ps_input_cntl(struct si_context *sctx,
+				     struct si_shader *vs, unsigned name,
+				     unsigned index, unsigned interpolate)
+{
+	struct tgsi_shader_info *vsinfo = &vs->selector->info;
+	unsigned j, ps_input_cntl = 0;
+	/* Build the SPI_PS_INPUT_CNTL value for one PS input (name, index). */
+	if (interpolate == TGSI_INTERPOLATE_CONSTANT ||
+	    (interpolate == TGSI_INTERPOLATE_COLOR && sctx->flatshade))
+		ps_input_cntl |= S_028644_FLAT_SHADE(1);
+
+	if (name == TGSI_SEMANTIC_PCOORD ||
+	    (name == TGSI_SEMANTIC_TEXCOORD &&
+	     sctx->sprite_coord_enable & (1u << index))) {
+		ps_input_cntl |= S_028644_PT_SPRITE_TEX(1);
+	}
+	/* Use the param offset of the VS output with the same semantic. */
+	for (j = 0; j < vsinfo->num_outputs; j++) {
+		if (name == vsinfo->output_semantic_name[j] &&
+		    index == vsinfo->output_semantic_index[j]) {
+			ps_input_cntl |= S_028644_OFFSET(vs->vs_output_param_offset[j]);
+			break;
+		}
+	}
+
+	if (name == TGSI_SEMANTIC_PRIMID)
+		/* PrimID is written after the last output. */
+		ps_input_cntl |= S_028644_OFFSET(vs->vs_output_param_offset[vsinfo->num_outputs]);
+	else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
+		/* No corresponding output found, load defaults into input.
+		 * Don't set any other bits.
+		 * (FLAT_SHADE=1 completely changes behavior) */
+		ps_input_cntl = S_028644_OFFSET(0x20);
+	}
+	return ps_input_cntl;
+}
+
 static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom)
 {
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_shader *ps = sctx->ps_shader.current;
 	struct si_shader *vs = si_get_vs_state(sctx);
-	struct tgsi_shader_info *psinfo;
-	struct tgsi_shader_info *vsinfo = &vs->selector->info;
-	unsigned i, j, tmp, num_written = 0;
+	struct tgsi_shader_info *psinfo = ps ? &ps->selector->info : NULL;
+	unsigned i, num_interp, num_written = 0, bcol_interp[2];
 
-	if (!ps || !ps->nparam)
+	if (!ps || !ps->selector->info.num_inputs)
 		return;
 
-	psinfo = &ps->selector->info;
-
-	radeon_set_context_reg_seq(cs, R_028644_SPI_PS_INPUT_CNTL_0, ps->nparam);
+	num_interp = si_get_ps_num_interp(ps);
+	assert(num_interp > 0);
+	radeon_set_context_reg_seq(cs, R_028644_SPI_PS_INPUT_CNTL_0, num_interp);
 
 	for (i = 0; i < psinfo->num_inputs; i++) {
 		unsigned name = psinfo->input_semantic_name[i];
 		unsigned index = psinfo->input_semantic_index[i];
 		unsigned interpolate = psinfo->input_interpolate[i];
-		unsigned param_offset = ps->ps_input_param_offset[i];
-bcolor:
-		tmp = 0;
-
-		if (interpolate == TGSI_INTERPOLATE_CONSTANT ||
-		    (interpolate == TGSI_INTERPOLATE_COLOR && sctx->flatshade))
-			tmp |= S_028644_FLAT_SHADE(1);
-
-		if (name == TGSI_SEMANTIC_PCOORD ||
-		    (name == TGSI_SEMANTIC_TEXCOORD &&
-		     sctx->sprite_coord_enable & (1 << index))) {
-			tmp |= S_028644_PT_SPRITE_TEX(1);
-		}
 
-		for (j = 0; j < vsinfo->num_outputs; j++) {
-			if (name == vsinfo->output_semantic_name[j] &&
-			    index == vsinfo->output_semantic_index[j]) {
-				tmp |= S_028644_OFFSET(vs->vs_output_param_offset[j]);
-				break;
-			}
-		}
-
-		if (name == TGSI_SEMANTIC_PRIMID)
-			/* PrimID is written after the last output. */
-			tmp |= S_028644_OFFSET(vs->vs_output_param_offset[vsinfo->num_outputs]);
-		else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(tmp)) {
-			/* No corresponding output found, load defaults into input.
-			 * Don't set any other bits.
-			 * (FLAT_SHADE=1 completely changes behavior) */
-			tmp = S_028644_OFFSET(0x20);
-		}
-
-		assert(param_offset == num_written);
-		radeon_emit(cs, tmp);
+		radeon_emit(cs, si_get_ps_input_cntl(sctx, vs, name, index,
+						     interpolate));
 		num_written++;
 
-		if (name == TGSI_SEMANTIC_COLOR &&
-		    ps->key.ps.color_two_side) {
-			name = TGSI_SEMANTIC_BCOLOR;
-			param_offset++;
-			goto bcolor;
+		if (name == TGSI_SEMANTIC_COLOR) {
+			assert(index < ARRAY_SIZE(bcol_interp));
+			bcol_interp[index] = interpolate;
 		}
 	}
-	assert(ps->nparam == num_written);
-}
 
-static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom)
-{
-	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
-	struct si_shader *ps = sctx->ps_shader.current;
-	unsigned input_ena;
-
-	if (!ps)
-		return;
+	if (ps->key.ps.color_two_side) {
+		unsigned bcol = TGSI_SEMANTIC_BCOLOR;
 
-	input_ena = ps->config.spi_ps_input_ena;
+		for (i = 0; i < 2; i++) {
+			if (!(psinfo->colors_read & (0xf << (i * 4))))
+				continue;
 
-	/* we need to enable at least one of them, otherwise we hang the GPU */
-	assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) ||
-	    G_0286CC_PERSP_CENTER_ENA(input_ena) ||
-	    G_0286CC_PERSP_CENTROID_ENA(input_ena) ||
-	    G_0286CC_PERSP_PULL_MODEL_ENA(input_ena) ||
-	    G_0286CC_LINEAR_SAMPLE_ENA(input_ena) ||
-	    G_0286CC_LINEAR_CENTER_ENA(input_ena) ||
-	    G_0286CC_LINEAR_CENTROID_ENA(input_ena) ||
-	    G_0286CC_LINE_STIPPLE_TEX_ENA(input_ena));
-
-	if (sctx->force_persample_interp) {
-		unsigned num_persp = G_0286CC_PERSP_SAMPLE_ENA(input_ena) +
-				     G_0286CC_PERSP_CENTER_ENA(input_ena) +
-				     G_0286CC_PERSP_CENTROID_ENA(input_ena);
-		unsigned num_linear = G_0286CC_LINEAR_SAMPLE_ENA(input_ena) +
-				      G_0286CC_LINEAR_CENTER_ENA(input_ena) +
-				      G_0286CC_LINEAR_CENTROID_ENA(input_ena);
-
-		/* If only one set of (i,j) coordinates is used, we can disable
-		 * CENTER/CENTROID, enable SAMPLE and it will load SAMPLE coordinates
-		 * where CENTER/CENTROID are expected, effectively forcing per-sample
-		 * interpolation.
-		 */
-		if (num_persp == 1) {
-			input_ena &= C_0286CC_PERSP_CENTER_ENA;
-			input_ena &= C_0286CC_PERSP_CENTROID_ENA;
-			input_ena |= G_0286CC_PERSP_SAMPLE_ENA(1);
+			radeon_emit(cs, si_get_ps_input_cntl(sctx, vs, bcol,
+							     i, bcol_interp[i]));
+			num_written++;
 		}
-		if (num_linear == 1) {
-			input_ena &= C_0286CC_LINEAR_CENTER_ENA;
-			input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
-			input_ena |= G_0286CC_LINEAR_SAMPLE_ENA(1);
-		}
-
-		/* If at least 2 sets of coordinates are used, we can't use this
-		 * trick and have to select SAMPLE using a conditional assignment
-		 * in the shader with "force_persample_interp" being a shader constant.
-		 */
 	}
-
-	radeon_set_context_reg_seq(cs, R_0286CC_SPI_PS_INPUT_ENA, 2);
-	radeon_emit(cs, input_ena);
-	radeon_emit(cs, input_ena);
-
-	if (ps->selector->forces_persample_interp_for_persp ||
-	    ps->selector->forces_persample_interp_for_linear)
-		radeon_set_sh_reg(cs, R_00B030_SPI_SHADER_USER_DATA_PS_0 +
-				      SI_SGPR_PS_STATE_BITS * 4,
-				  sctx->force_persample_interp);
+	assert(num_interp == num_written);
 }
 
 /**
@@ -1746,12 +1737,6 @@ bool si_update_shaders(struct si_context *sctx)
 			si_mark_atom_dirty(sctx, &sctx->spi_map);
 		}
 
-		if (si_pm4_state_changed(sctx, ps) ||
-		    sctx->force_persample_interp != rs->force_persample_interp) {
-			sctx->force_persample_interp = rs->force_persample_interp;
-			si_mark_atom_dirty(sctx, &sctx->spi_ps_input);
-		}
-
 		if (sctx->b.family == CHIP_STONEY && si_pm4_state_changed(sctx, ps))
 			si_mark_atom_dirty(sctx, &sctx->cb_render_state);
 
@@ -1784,7 +1769,6 @@ bool si_update_shaders(struct si_context *sctx)
 void si_init_shader_functions(struct si_context *sctx)
 {
 	si_init_atom(sctx, &sctx->spi_map, &sctx->atoms.s.spi_map, si_emit_spi_map);
-	si_init_atom(sctx, &sctx->spi_ps_input, &sctx->atoms.s.spi_ps_input, si_emit_spi_ps_input);
 
 	sctx->b.b.create_vs_state = si_create_shader_selector;
 	sctx->b.b.create_tcs_state = si_create_shader_selector;
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index 9e1e158219f..892084707d2 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -2845,6 +2845,9 @@
 #define   S_00B028_FLOAT_MODE(x)                                      (((x) & 0xFF) << 12)
 #define   G_00B028_FLOAT_MODE(x)                                      (((x) >> 12) & 0xFF)
 #define   C_00B028_FLOAT_MODE                                         0xFFF00FFF
+#define     V_00B028_FP_32_DENORMS					0x30
+#define     V_00B028_FP_64_DENORMS					0xc0
+#define     V_00B028_FP_ALL_DENORMS					0xf0
 #define   S_00B028_PRIV(x)                                            (((x) & 0x1) << 20)
 #define   G_00B028_PRIV(x)                                            (((x) >> 20) & 0x1)
 #define   C_00B028_PRIV                                               0xFFEFFFFF
diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c
index 188347bb4ca..5a29e26517d 100644
--- a/src/gallium/drivers/softpipe/sp_flush.c
+++ b/src/gallium/drivers/softpipe/sp_flush.c
@@ -38,6 +38,7 @@
 #include "sp_state.h"
 #include "sp_tile_cache.h"
 #include "sp_tex_tile_cache.h"
+#include "util/u_debug_image.h"
 #include "util/u_memory.h"
 #include "util/u_string.h"
 
diff --git a/src/gallium/drivers/svga/svga_pipe_flush.c b/src/gallium/drivers/svga/svga_pipe_flush.c
index d593c781680..8e0af12d294 100644
--- a/src/gallium/drivers/svga/svga_pipe_flush.c
+++ b/src/gallium/drivers/svga/svga_pipe_flush.c
@@ -24,6 +24,7 @@
  **********************************************************/
 
 #include "pipe/p_defines.h"
+#include "util/u_debug_image.h"
 #include "util/u_string.h"
 #include "svga_screen.h"
 #include "svga_surface.h"
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index 4d03fe1ee0b..2ce2b3aef75 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -120,18 +120,13 @@ trace_context_draw_vbo(struct pipe_context *_pipe,
    trace_dump_trace_flush();
 
    if (info->indirect) {
-      struct pipe_draw_info *_info = NULL;
+      struct pipe_draw_info _info;
 
-      _info = MALLOC(sizeof(*_info));
-      if (!_info)
-         return;
-
-      memcpy(_info, info, sizeof(*_info));
-      _info->indirect = trace_resource_unwrap(tr_ctx, _info->indirect);
-      _info->indirect_params = trace_resource_unwrap(tr_ctx,
-                                                     _info->indirect_params);
-      pipe->draw_vbo(pipe, _info);
-      FREE(_info);
+      memcpy(&_info, info, sizeof(_info));
+      _info.indirect = trace_resource_unwrap(tr_ctx, _info.indirect);
+      _info.indirect_params = trace_resource_unwrap(tr_ctx,
+                                                    _info.indirect_params);
+      pipe->draw_vbo(pipe, &_info);
    } else {
       pipe->draw_vbo(pipe, info);
    }
@@ -1285,6 +1280,33 @@ trace_context_clear_depth_stencil(struct pipe_context *_pipe,
 }
 
 static inline void
+trace_context_clear_texture(struct pipe_context *_pipe,
+                            struct pipe_resource *res,
+                            unsigned level,
+                            const struct pipe_box *box,
+                            const void *data)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe;
+
+   res = trace_resource_unwrap(tr_ctx, res);
+
+   trace_dump_call_begin("pipe_context", "clear_texture");
+
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, res);
+   trace_dump_arg(uint, level);
+   trace_dump_arg_begin("box");
+   trace_dump_box(box);
+   trace_dump_arg_end();
+   trace_dump_arg(ptr, data);
+
+   pipe->clear_texture(pipe, res, level, box, data);
+
+   trace_dump_call_end();
+}
+
+static inline void
 trace_context_flush(struct pipe_context *_pipe,
                     struct pipe_fence_handle **fence,
                     unsigned flags)
@@ -1709,6 +1731,7 @@ trace_context_create(struct trace_screen *tr_scr,
    TR_CTX_INIT(clear);
    TR_CTX_INIT(clear_render_target);
    TR_CTX_INIT(clear_depth_stencil);
+   TR_CTX_INIT(clear_texture);
    TR_CTX_INIT(flush);
    TR_CTX_INIT(generate_mipmap);
    TR_CTX_INIT(texture_barrier);
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 800f16cd250..b01f6ea3dcb 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -349,6 +349,12 @@ enum pipe_flush_flags
 #define PIPE_CONTEXT_DEBUG             (1 << 1)
 
 /**
+ * Whether out-of-bounds shader loads must return zero and out-of-bounds
+ * shader stores must be dropped.
+ */
+#define PIPE_CONTEXT_ROBUST_BUFFER_ACCESS (1 << 2)
+
+/**
  * Flags for pipe_context::memory_barrier.
  */
 #define PIPE_BARRIER_MAPPED_BUFFER     (1 << 0)
diff --git a/src/gallium/targets/graw-null/graw_util.c b/src/gallium/targets/graw-null/graw_util.c
index 07693e85f6a..03b45d99e9d 100644
--- a/src/gallium/targets/graw-null/graw_util.c
+++ b/src/gallium/targets/graw-null/graw_util.c
@@ -5,6 +5,7 @@
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_text.h"
 #include "util/u_debug.h"
+#include "util/u_debug_image.h"
 #include "util/u_memory.h"
 #include "state_tracker/graw.h"
 
diff --git a/src/gallium/tests/graw/graw_util.h b/src/gallium/tests/graw/graw_util.h
index f09c1eadc9c..3c7dbd061cc 100644
--- a/src/gallium/tests/graw/graw_util.h
+++ b/src/gallium/tests/graw/graw_util.h
@@ -9,6 +9,7 @@
 
 #include "util/u_box.h"    
 #include "util/u_debug.h"
+#include "util/u_debug_image.h"
 #include "util/u_draw_quad.h"
 #include "util/u_format.h"
 #include "util/u_inlines.h"
diff --git a/src/gallium/tests/trivial/quad-tex.c b/src/gallium/tests/trivial/quad-tex.c
index 4c5a9200a52..ddee2942af9 100644
--- a/src/gallium/tests/trivial/quad-tex.c
+++ b/src/gallium/tests/trivial/quad-tex.c
@@ -50,7 +50,7 @@
 /* u_sampler_view_default_template */
 #include "util/u_sampler.h"
 /* debug_dump_surface_bmp */
-#include "util/u_debug.h"
+#include "util/u_debug_image.h"
 /* util_draw_vertex_buffer helper */
 #include "util/u_draw_quad.h"
 /* FREE & CALLOC_STRUCT */
diff --git a/src/gallium/tests/trivial/tri.c b/src/gallium/tests/trivial/tri.c
index c71a63f44e5..914f5e75fa9 100644
--- a/src/gallium/tests/trivial/tri.c
+++ b/src/gallium/tests/trivial/tri.c
@@ -48,7 +48,7 @@
 #include "cso_cache/cso_context.h"
 
 /* debug_dump_surface_bmp */
-#include "util/u_debug.h"
+#include "util/u_debug_image.h"
 /* util_draw_vertex_buffer helper */
 #include "util/u_draw_quad.h"
 /* FREE & CALLOC_STRUCT */
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index 35dc7e69dcf..49c310cfdf7 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -405,6 +405,12 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws)
             radeon_get_drm_value(ws->fd, RADEON_INFO_NUM_TILE_PIPES, NULL,
                                  &ws->info.num_tile_pipes);
 
+            /* The kernel returns 12 for some SI cards for an unknown reason,
+             * though this should presumably be a power of two; use 8 instead.
+             */
+            if (ws->gen == DRV_SI && ws->info.num_tile_pipes == 12)
+                ws->info.num_tile_pipes = 8;
+
             if (radeon_get_drm_value(ws->fd, RADEON_INFO_BACKEND_MAP, NULL,
                                       &ws->info.r600_gb_backend_map))
                 ws->info.r600_gb_backend_map_valid = TRUE;
diff --git a/src/mapi/glapi/gen/gl_genexec.py b/src/mapi/glapi/gen/gl_genexec.py
index 6c66779c222..72d7b6fea1f 100644
--- a/src/mapi/glapi/gen/gl_genexec.py
+++ b/src/mapi/glapi/gen/gl_genexec.py
@@ -66,6 +66,7 @@ header = """/**
 #include "main/convolve.h"
 #include "main/copyimage.h"
 #include "main/depth.h"
+#include "main/debug_output.h"
 #include "main/dlist.h"
 #include "main/drawpix.h"
 #include "main/drawtex.h"
diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
index ffe560faa3d..6669f295399 100644
--- a/src/mesa/Makefile.sources
+++ b/src/mesa/Makefile.sources
@@ -57,6 +57,8 @@ MAIN_FILES = \
 	main/dd.h \
 	main/debug.c \
 	main/debug.h \
+	main/debug_output.c \
+	main/debug_output.h \
 	main/depth.c \
 	main/depth.h \
 	main/dlist.c \
@@ -530,8 +532,6 @@ PROGRAM_FILES = \
 	program/program_parser.h \
 	program/prog_statevars.c \
 	program/prog_statevars.h \
-	program/sampler.cpp \
-	program/sampler.h \
 	program/string_to_uint_map.cpp \
 	program/symbol_table.c \
 	program/symbol_table.h
diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index 5cfa2f8ca4f..a6545084e31 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -46,6 +46,7 @@
 #include "main/mtypes.h"
 #include "main/framebuffer.h"
 #include "main/version.h"
+#include "main/debug_output.h"
 #include "main/errors.h"
 #include "main/macros.h"
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
index f2faceeb579..9b0750026c4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
@@ -1140,7 +1140,7 @@ namespace brw {
                                               dims, rsize, op, pred);
 
          /* An unbound surface access should give zero as result. */
-         if (rsize)
+         if (rsize && pred)
             set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0)));
 
          return tmp;
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 319c2a5669f..ab1a0d7255f 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -919,7 +919,7 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline)
        *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
        */
       const unsigned dc_flush =
-         brw->gen >= 7 ? PIPE_CONTROL_DATA_CACHE_INVALIDATE : 0;
+         brw->gen >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
 
       if (brw->gen == 6) {
          /* Hardware workaround: SNB B-Spec says:
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index ef5b34cc687..0a916c99947 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -490,6 +490,10 @@ brw_preprocess_nir(nir_shader *nir, bool is_scalar)
 
    nir = nir_optimize(nir, is_scalar);
 
+   if (is_scalar) {
+      OPT_V(nir_lower_load_const_to_scalar);
+   }
+
    /* Lower a bunch of stuff */
    OPT_V(nir_lower_var_copies);
 
diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c
index 6c636d26139..b41e28e1ec8 100644
--- a/src/mesa/drivers/dri/i965/brw_pipe_control.c
+++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
@@ -51,7 +51,7 @@ gen8_add_cs_stall_workaround_bits(uint32_t *flags)
                       PIPE_CONTROL_WRITE_TIMESTAMP |
                       PIPE_CONTROL_STALL_AT_SCOREBOARD |
                       PIPE_CONTROL_DEPTH_STALL |
-                      PIPE_CONTROL_DATA_CACHE_INVALIDATE;
+                      PIPE_CONTROL_DATA_CACHE_FLUSH;
 
    /* If we're doing a CS stall, and don't already have one of the
     * workaround bits set, add "Stall at Pixel Scoreboard."
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index c9872b68d75..b093a87bb82 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -209,7 +209,7 @@ static void
 brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
 {
    struct brw_context *brw = brw_context(ctx);
-   unsigned bits = (PIPE_CONTROL_DATA_CACHE_INVALIDATE |
+   unsigned bits = (PIPE_CONTROL_DATA_CACHE_FLUSH |
                     PIPE_CONTROL_NO_WRITE |
                     PIPE_CONTROL_CS_STALL);
    assert(brw->gen >= 7 && brw->gen <= 9);
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 60f7fd9cfcd..4f97577515a 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -939,8 +939,9 @@ fs_instruction_scheduler::calculate_deps()
    foreach_in_list(schedule_node, n, &instructions) {
       fs_inst *inst = (fs_inst *)n->inst;
 
-      if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
-         inst->has_side_effects())
+      if ((inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
+           inst->has_side_effects()) &&
+          inst->opcode != FS_OPCODE_FB_WRITE)
          add_barrier_deps(n);
 
       /* read-after-write deps. */
@@ -1195,7 +1196,7 @@ vec4_instruction_scheduler::calculate_deps()
    foreach_in_list(schedule_node, n, &instructions) {
       vec4_instruction *inst = (vec4_instruction *)n->inst;
 
-      if (inst->has_side_effects())
+      if (inst->has_side_effects() && inst->opcode != FS_OPCODE_FB_WRITE)
          add_barrier_deps(n);
 
       /* read-after-write deps. */
diff --git a/src/mesa/drivers/dri/i965/gen7_l3_state.c b/src/mesa/drivers/dri/i965/gen7_l3_state.c
index b63e61ca8f0..0c1813f9048 100644
--- a/src/mesa/drivers/dri/i965/gen7_l3_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_l3_state.c
@@ -330,23 +330,39 @@ setup_l3_config(struct brw_context *brw, const struct brw_l3_config *cfg)
 
    /* According to the hardware docs, the L3 partitioning can only be changed
     * while the pipeline is completely drained and the caches are flushed,
-    * which involves a first PIPE_CONTROL flush which stalls the pipeline and
-    * initiates invalidation of the relevant caches...
+    * which involves a first PIPE_CONTROL flush which stalls the pipeline...
+    */
+   brw_emit_pipe_control_flush(brw,
+                               PIPE_CONTROL_DATA_CACHE_FLUSH |
+                               PIPE_CONTROL_NO_WRITE |
+                               PIPE_CONTROL_CS_STALL);
+
+   /* ...followed by a second pipelined PIPE_CONTROL that initiates
+    * invalidation of the relevant caches.  Note that because RO invalidation
+    * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
+    * command is processed by the CS) we cannot combine it with the previous
+    * stalling flush as the hardware documentation suggests, because that
+    * would cause the CS to stall on previous rendering *after* RO
+    * invalidation and wouldn't prevent the RO caches from being polluted by
+    * concurrent rendering before the stall completes.  This intentionally
+    * doesn't implement the SKL+ hardware workaround suggesting to enable CS
+    * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
+    * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
+    * already guarantee that there is no concurrent GPGPU kernel execution
+    * (see SKL HSD 2132585).
     */
    brw_emit_pipe_control_flush(brw,
                                PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                PIPE_CONTROL_INSTRUCTION_INVALIDATE |
-                               PIPE_CONTROL_DATA_CACHE_INVALIDATE |
-                               PIPE_CONTROL_NO_WRITE |
-                               PIPE_CONTROL_CS_STALL);
+                               PIPE_CONTROL_STATE_CACHE_INVALIDATE |
+                               PIPE_CONTROL_NO_WRITE);
 
-   /* ...followed by a second stalling flush which guarantees that
-    * invalidation is complete when the L3 configuration registers are
-    * modified.
+   /* Now send a third stalling flush to make sure that invalidation is
+    * complete when the L3 configuration registers are modified.
     */
    brw_emit_pipe_control_flush(brw,
-                               PIPE_CONTROL_DATA_CACHE_INVALIDATE |
+                               PIPE_CONTROL_DATA_CACHE_FLUSH |
                                PIPE_CONTROL_NO_WRITE |
                                PIPE_CONTROL_CS_STALL);
 
diff --git a/src/mesa/drivers/dri/i965/intel_reg.h b/src/mesa/drivers/dri/i965/intel_reg.h
index 8888d6f776c..365c045b8b0 100644
--- a/src/mesa/drivers/dri/i965/intel_reg.h
+++ b/src/mesa/drivers/dri/i965/intel_reg.h
@@ -86,7 +86,7 @@
 #define PIPE_CONTROL_INTERRUPT_ENABLE	(1 << 8)
 #define PIPE_CONTROL_FLUSH_ENABLE	(1 << 7) /* Gen7+ only */
 /* GT */
-#define PIPE_CONTROL_DATA_CACHE_INVALIDATE	(1 << 5)
+#define PIPE_CONTROL_DATA_CACHE_FLUSH   	(1 << 5)
 #define PIPE_CONTROL_VF_CACHE_INVALIDATE	(1 << 4)
 #define PIPE_CONTROL_CONST_CACHE_INVALIDATE	(1 << 3)
 #define PIPE_CONTROL_STATE_CACHE_INVALIDATE	(1 << 2)
diff --git a/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp
index e5e566c60bc..8d4a447a88b 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp
@@ -40,6 +40,7 @@ public:
    struct gl_context *ctx;
    struct gl_shader_program *shader_prog;
    struct brw_vertex_program *vp;
+   struct brw_vue_prog_data *prog_data;
    vec4_visitor *v;
 };
 
@@ -47,9 +48,13 @@ class cmod_propagation_vec4_visitor : public vec4_visitor
 {
 public:
    cmod_propagation_vec4_visitor(struct brw_compiler *compiler,
-                                 nir_shader *shader)
-      : vec4_visitor(compiler, NULL, NULL, NULL, shader, NULL,
-                     false, -1) {}
+                                 nir_shader *shader,
+                                 struct brw_vue_prog_data *prog_data)
+      : vec4_visitor(compiler, NULL, NULL, prog_data, shader, NULL,
+                     false, -1)
+      {
+         prog_data->dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
+      }
 
 protected:
    /* Dummy implementation for pure virtual methods */
@@ -96,13 +101,14 @@ void cmod_propagation_test::SetUp()
    ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
    compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
    devinfo = (struct brw_device_info *)calloc(1, sizeof(*devinfo));
+   prog_data = (struct brw_vue_prog_data *)calloc(1, sizeof(*prog_data));
    compiler->devinfo = devinfo;
 
    vp = ralloc(NULL, struct brw_vertex_program);
 
    nir_shader *shader = nir_shader_create(NULL, MESA_SHADER_VERTEX, NULL);
 
-   v = new cmod_propagation_vec4_visitor(compiler, shader);
+   v = new cmod_propagation_vec4_visitor(compiler, shader, prog_data);
 
    _mesa_init_gl_program(&vp->program.Base, GL_VERTEX_SHADER, 0);
 
diff --git a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
index 12667ffd23c..311f07a7cca 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
@@ -39,6 +39,7 @@ public:
    struct gl_context *ctx;
    struct gl_shader_program *shader_prog;
    struct brw_vertex_program *vp;
+   struct brw_vue_prog_data *prog_data;
    vec4_visitor *v;
 };
 
@@ -46,10 +47,12 @@ class copy_propagation_vec4_visitor : public vec4_visitor
 {
 public:
    copy_propagation_vec4_visitor(struct brw_compiler *compiler,
-                                 nir_shader *shader)
-      : vec4_visitor(compiler, NULL, NULL, NULL, shader, NULL,
+                                 nir_shader *shader,
+                                 struct brw_vue_prog_data *prog_data)
+      : vec4_visitor(compiler, NULL, NULL, prog_data, shader, NULL,
                      false /* no_spills */, -1)
    {
+      prog_data->dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
    }
 
 protected:
@@ -91,13 +94,14 @@ void copy_propagation_test::SetUp()
    ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
    compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
    devinfo = (struct brw_device_info *)calloc(1, sizeof(*devinfo));
+   prog_data = (struct brw_vue_prog_data *)calloc(1, sizeof(*prog_data));
    compiler->devinfo = devinfo;
 
    vp = ralloc(NULL, struct brw_vertex_program);
 
    nir_shader *shader = nir_shader_create(NULL, MESA_SHADER_VERTEX, NULL);
 
-   v = new copy_propagation_vec4_visitor(compiler, shader);
+   v = new copy_propagation_vec4_visitor(compiler, shader, prog_data);
 
    _mesa_init_gl_program(&vp->program.Base, GL_VERTEX_SHADER, 0);
 
diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
index 34dcf95dc48..cc4a2de89d5 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
@@ -41,6 +41,7 @@ public:
    struct gl_context *ctx;
    struct gl_shader_program *shader_prog;
    struct brw_vertex_program *vp;
+   struct brw_vue_prog_data *prog_data;
    vec4_visitor *v;
 };
 
@@ -49,10 +50,12 @@ class register_coalesce_vec4_visitor : public vec4_visitor
 {
 public:
    register_coalesce_vec4_visitor(struct brw_compiler *compiler,
-                                  nir_shader *shader)
-      : vec4_visitor(compiler, NULL, NULL, NULL, shader, NULL,
+                                  nir_shader *shader,
+                                  struct brw_vue_prog_data *prog_data)
+      : vec4_visitor(compiler, NULL, NULL, prog_data, shader, NULL,
                      false /* no_spills */, -1)
    {
+      prog_data->dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
    }
 
 protected:
@@ -94,13 +97,14 @@ void register_coalesce_test::SetUp()
    ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
    compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
    devinfo = (struct brw_device_info *)calloc(1, sizeof(*devinfo));
+   prog_data = (struct brw_vue_prog_data *)calloc(1, sizeof(*prog_data));
    compiler->devinfo = devinfo;
 
    vp = ralloc(NULL, struct brw_vertex_program);
 
    nir_shader *shader = nir_shader_create(NULL, MESA_SHADER_VERTEX, NULL);
 
-   v = new register_coalesce_vec4_visitor(compiler, shader);
+   v = new register_coalesce_vec4_visitor(compiler, shader, prog_data);
 
    _mesa_init_gl_program(&vp->program.Base, GL_VERTEX_SHADER, 0);
 
diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index 8b415ed6019..9388a1ca51d 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -89,6 +89,7 @@
 #include "context.h"
 #include "cpuinfo.h"
 #include "debug.h"
+#include "debug_output.h"
 #include "depth.h"
 #include "dlist.h"
 #include "eval.h"
@@ -814,8 +815,8 @@ init_attrib_groups(struct gl_context *ctx)
    _mesa_init_current( ctx );
    _mesa_init_depth( ctx );
    _mesa_init_debug( ctx );
+   _mesa_init_debug_output( ctx );
    _mesa_init_display_list( ctx );
-   _mesa_init_errors( ctx );
    _mesa_init_eval( ctx );
    _mesa_init_fbobjects( ctx );
    _mesa_init_feedback( ctx );
diff --git a/src/mesa/main/debug_output.c b/src/mesa/main/debug_output.c
new file mode 100644
index 00000000000..10ee6757cc1
--- /dev/null
+++ b/src/mesa/main/debug_output.c
@@ -0,0 +1,1301 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 1999-2016  Brian Paul, et al   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "context.h"
+#include "debug_output.h"
+#include "dispatch.h"
+#include "enums.h"
+#include "imports.h"
+#include "hash.h"
+#include "mtypes.h"
+#include "version.h"
+#include "util/hash_table.h"
+#include "util/simple_list.h"
+
+
+static mtx_t DynamicIDMutex = _MTX_INITIALIZER_NP;
+static GLuint NextDynamicID = 1;
+
+
+/**
+ * A namespace element.
+ */
+struct gl_debug_element
+{
+   struct simple_node link;
+
+   GLuint ID;
+   /* at which severity levels (mesa_debug_severity) is the message enabled */
+   GLbitfield State;
+};
+
+
+struct gl_debug_namespace
+{
+   struct simple_node Elements;
+   GLbitfield DefaultState;
+};
+
+
+struct gl_debug_group {
+   struct gl_debug_namespace Namespaces[MESA_DEBUG_SOURCE_COUNT][MESA_DEBUG_TYPE_COUNT];
+};
+
+
+/**
+ * An error, warning, or other piece of debug information for an application
+ * to consume via GL_ARB_debug_output/GL_KHR_debug.
+ */
+struct gl_debug_message
+{
+   enum mesa_debug_source source;
+   enum mesa_debug_type type;
+   GLuint id;
+   enum mesa_debug_severity severity;
+   /* length as given by the user - if message was explicitly null terminated,
+    * length can be negative */
+   GLsizei length;
+   GLcharARB *message;
+};
+
+
+/**
+ * Debug message log.  It works like a ring buffer.
+ */
+struct gl_debug_log {
+   struct gl_debug_message Messages[MAX_DEBUG_LOGGED_MESSAGES];
+   GLint NextMessage;
+   GLint NumMessages;
+};
+
+
+struct gl_debug_state
+{
+   GLDEBUGPROC Callback;
+   const void *CallbackData;
+   GLboolean SyncOutput;
+   GLboolean DebugOutput;
+
+   struct gl_debug_group *Groups[MAX_DEBUG_GROUP_STACK_DEPTH];
+   struct gl_debug_message GroupMessages[MAX_DEBUG_GROUP_STACK_DEPTH];
+   GLint CurrentGroup; // GroupStackDepth - 1
+
+   struct gl_debug_log Log;
+};
+
+
+static char out_of_memory[] = "Debugging error: out of memory";
+
+static const GLenum debug_source_enums[] = {
+   GL_DEBUG_SOURCE_API,
+   GL_DEBUG_SOURCE_WINDOW_SYSTEM,
+   GL_DEBUG_SOURCE_SHADER_COMPILER,
+   GL_DEBUG_SOURCE_THIRD_PARTY,
+   GL_DEBUG_SOURCE_APPLICATION,
+   GL_DEBUG_SOURCE_OTHER,
+};
+
+static const GLenum debug_type_enums[] = {
+   GL_DEBUG_TYPE_ERROR,
+   GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR,
+   GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR,
+   GL_DEBUG_TYPE_PORTABILITY,
+   GL_DEBUG_TYPE_PERFORMANCE,
+   GL_DEBUG_TYPE_OTHER,
+   GL_DEBUG_TYPE_MARKER,
+   GL_DEBUG_TYPE_PUSH_GROUP,
+   GL_DEBUG_TYPE_POP_GROUP,
+};
+
+static const GLenum debug_severity_enums[] = {
+   GL_DEBUG_SEVERITY_LOW,
+   GL_DEBUG_SEVERITY_MEDIUM,
+   GL_DEBUG_SEVERITY_HIGH,
+   GL_DEBUG_SEVERITY_NOTIFICATION,
+};
+
+
+static enum mesa_debug_source
+gl_enum_to_debug_source(GLenum e)
+{
+   unsigned i;
+
+   for (i = 0; i < ARRAY_SIZE(debug_source_enums); i++) {
+      if (debug_source_enums[i] == e)
+         break;
+   }
+   return i;
+}
+
+static enum mesa_debug_type
+gl_enum_to_debug_type(GLenum e)
+{
+   unsigned i;
+
+   for (i = 0; i < ARRAY_SIZE(debug_type_enums); i++) {
+      if (debug_type_enums[i] == e)
+         break;
+   }
+   return i;
+}
+
+static enum mesa_debug_severity
+gl_enum_to_debug_severity(GLenum e)
+{
+   unsigned i;
+
+   for (i = 0; i < ARRAY_SIZE(debug_severity_enums); i++) {
+      if (debug_severity_enums[i] == e)
+         break;
+   }
+   return i;
+}
+
+
+/**
+ * Assigns a GL_ARB_debug_output message ID to a message emitted by the GL or
+ * GLSL compiler.
+ *
+ * The GL API has this "ID" mechanism, where the intention is to allow a
+ * client to filter in/out messages based on source, type, and ID.  Of course,
+ * building a giant enum list of all debug output messages that Mesa might
+ * generate is ridiculous, so instead we have our caller pass us a pointer to
+ * static storage where the ID should get stored.  This ID will be shared
+ * across all contexts for that message (which seems like a desirable
+ * property, even if it's not expected by the spec), but note that it won't be
+ * the same between executions if messages aren't generated in the same order.
+ */
+void
+_mesa_debug_get_id(GLuint *id)
+{
+   if (!(*id)) {
+      mtx_lock(&DynamicIDMutex);
+      if (!(*id))
+         *id = NextDynamicID++;
+      mtx_unlock(&DynamicIDMutex);
+   }
+}
+
+static void
+debug_message_clear(struct gl_debug_message *msg)
+{
+   if (msg->message != (char*)out_of_memory)
+      free(msg->message);
+   msg->message = NULL;
+   msg->length = 0;
+}
+
+static void
+debug_message_store(struct gl_debug_message *msg,
+                    enum mesa_debug_source source,
+                    enum mesa_debug_type type, GLuint id,
+                    enum mesa_debug_severity severity,
+                    GLsizei len, const char *buf)
+{
+   GLsizei length = len;
+
+   assert(!msg->message && !msg->length);
+
+   if (length < 0)
+      length = strlen(buf);
+
+   msg->message = malloc(length+1);
+   if (msg->message) {
+      (void) strncpy(msg->message, buf, (size_t)length);
+      msg->message[length] = '\0';
+
+      msg->length = len;
+      msg->source = source;
+      msg->type = type;
+      msg->id = id;
+      msg->severity = severity;
+   } else {
+      static GLuint oom_msg_id = 0;
+      _mesa_debug_get_id(&oom_msg_id);
+
+      /* malloc failed! */
+      msg->message = out_of_memory;
+      msg->length = -1;
+      msg->source = MESA_DEBUG_SOURCE_OTHER;
+      msg->type = MESA_DEBUG_TYPE_ERROR;
+      msg->id = oom_msg_id;
+      msg->severity = MESA_DEBUG_SEVERITY_HIGH;
+   }
+}
+
+static void
+debug_namespace_init(struct gl_debug_namespace *ns)
+{
+   make_empty_list(&ns->Elements);
+
+   /* Enable messages of severity HIGH, MEDIUM and NOTIFICATION by default */
+   ns->DefaultState = (1 << MESA_DEBUG_SEVERITY_MEDIUM ) |
+                      (1 << MESA_DEBUG_SEVERITY_HIGH) |
+                      (1 << MESA_DEBUG_SEVERITY_NOTIFICATION);
+}
+
+static void
+debug_namespace_clear(struct gl_debug_namespace *ns)
+{
+   struct simple_node *node, *tmp;
+
+   foreach_s(node, tmp, &ns->Elements)
+      free(node);
+}
+
+static bool
+debug_namespace_copy(struct gl_debug_namespace *dst,
+                     const struct gl_debug_namespace *src)
+{
+   struct simple_node *node;
+
+   dst->DefaultState = src->DefaultState;
+
+   make_empty_list(&dst->Elements);
+   foreach(node, &src->Elements) {
+      const struct gl_debug_element *elem =
+         (const struct gl_debug_element *) node;
+      struct gl_debug_element *copy;
+
+      copy = malloc(sizeof(*copy));
+      if (!copy) {
+         debug_namespace_clear(dst);
+         return false;
+      }
+
+      copy->ID = elem->ID;
+      copy->State = elem->State;
+      insert_at_tail(&dst->Elements, &copy->link);
+   }
+
+   return true;
+}
+
+/**
+ * Set the state of \p id in the namespace.
+ */
+static bool
+debug_namespace_set(struct gl_debug_namespace *ns,
+                    GLuint id, bool enabled)
+{
+   const uint32_t state = (enabled) ?
+      ((1 << MESA_DEBUG_SEVERITY_COUNT) - 1) : 0;
+   struct gl_debug_element *elem = NULL;
+   struct simple_node *node;
+
+   /* find the element */
+   foreach(node, &ns->Elements) {
+      struct gl_debug_element *tmp = (struct gl_debug_element *) node;
+      if (tmp->ID == id) {
+         elem = tmp;
+         break;
+      }
+   }
+
+   /* we do not need the element if it has the default state */
+   if (ns->DefaultState == state) {
+      if (elem) {
+         remove_from_list(&elem->link);
+         free(elem);
+      }
+      return true;
+   }
+
+   if (!elem) {
+      elem = malloc(sizeof(*elem));
+      if (!elem)
+         return false;
+
+      elem->ID = id;
+      insert_at_tail(&ns->Elements, &elem->link);
+   }
+
+   elem->State = state;
+
+   return true;
+}
+
+/**
+ * Set the default state of the namespace for \p severity.  When \p severity
+ * is MESA_DEBUG_SEVERITY_COUNT, the default values for all severities are
+ * updated.
+ */
+static void
+debug_namespace_set_all(struct gl_debug_namespace *ns,
+                        enum mesa_debug_severity severity,
+                        bool enabled)
+{
+   struct simple_node *node, *tmp;
+   uint32_t mask, val;
+
+   /* set all elements to the same state */
+   if (severity == MESA_DEBUG_SEVERITY_COUNT) {
+      ns->DefaultState = (enabled) ? ((1 << severity) - 1) : 0;
+      debug_namespace_clear(ns);
+      make_empty_list(&ns->Elements);
+      return;
+   }
+
+   mask = 1 << severity;
+   val = (enabled) ? mask : 0;
+
+   ns->DefaultState = (ns->DefaultState & ~mask) | val;
+
+   foreach_s(node, tmp, &ns->Elements) {
+      struct gl_debug_element *elem = (struct gl_debug_element *) node;
+
+      elem->State = (elem->State & ~mask) | val;
+      if (elem->State == ns->DefaultState) {
+         remove_from_list(node);
+         free(node);
+      }
+   }
+}
+
+/**
+ * Get the state of \p id in the namespace.
+ */
+static bool
+debug_namespace_get(const struct gl_debug_namespace *ns, GLuint id,
+                    enum mesa_debug_severity severity)
+{
+   struct simple_node *node;
+   uint32_t state;
+
+   state = ns->DefaultState;
+   foreach(node, &ns->Elements) {
+      struct gl_debug_element *elem = (struct gl_debug_element *) node;
+
+      if (elem->ID == id) {
+         state = elem->State;
+         break;
+      }
+   }
+
+   return (state & (1 << severity));
+}
+
+/**
+ * Allocate and initialize context debug state.
+ */
+static struct gl_debug_state *
+debug_create(void)
+{
+   struct gl_debug_state *debug;
+   int s, t;
+
+   debug = CALLOC_STRUCT(gl_debug_state);
+   if (!debug)
+      return NULL;
+
+   debug->Groups[0] = malloc(sizeof(*debug->Groups[0]));
+   if (!debug->Groups[0]) {
+      free(debug);
+      return NULL;
+   }
+
+   /* Initialize state for filtering known debug messages. */
+   for (s = 0; s < MESA_DEBUG_SOURCE_COUNT; s++) {
+      for (t = 0; t < MESA_DEBUG_TYPE_COUNT; t++)
+         debug_namespace_init(&debug->Groups[0]->Namespaces[s][t]);
+   }
+
+   return debug;
+}
+
+/**
+ * Return true if the top debug group points to the group below it.
+ */
+static bool
+debug_is_group_read_only(const struct gl_debug_state *debug)
+{
+   const GLint gstack = debug->CurrentGroup;
+   return (gstack > 0 && debug->Groups[gstack] == debug->Groups[gstack - 1]);
+}
+
+/**
+ * Make the top debug group writable.
+ */
+static bool
+debug_make_group_writable(struct gl_debug_state *debug)
+{
+   const GLint gstack = debug->CurrentGroup;
+   const struct gl_debug_group *src = debug->Groups[gstack];
+   struct gl_debug_group *dst;
+   int s, t;
+
+   if (!debug_is_group_read_only(debug))
+      return true;
+
+   dst = malloc(sizeof(*dst));
+   if (!dst)
+      return false;
+
+   for (s = 0; s < MESA_DEBUG_SOURCE_COUNT; s++) {
+      for (t = 0; t < MESA_DEBUG_TYPE_COUNT; t++) {
+         if (!debug_namespace_copy(&dst->Namespaces[s][t],
+                                   &src->Namespaces[s][t])) {
+            /* error path! */
+            for (t = t - 1; t >= 0; t--)
+               debug_namespace_clear(&dst->Namespaces[s][t]);
+            for (s = s - 1; s >= 0; s--) {
+               for (t = 0; t < MESA_DEBUG_TYPE_COUNT; t++)
+                  debug_namespace_clear(&dst->Namespaces[s][t]);
+            }
+            free(dst);
+            return false;
+         }
+      }
+   }
+
+   debug->Groups[gstack] = dst;
+
+   return true;
+}
+
+/**
+ * Free the top debug group.
+ */
+static void
+debug_clear_group(struct gl_debug_state *debug)
+{
+   const GLint gstack = debug->CurrentGroup;
+
+   if (!debug_is_group_read_only(debug)) {
+      struct gl_debug_group *grp = debug->Groups[gstack];
+      int s, t;
+
+      for (s = 0; s < MESA_DEBUG_SOURCE_COUNT; s++) {
+         for (t = 0; t < MESA_DEBUG_TYPE_COUNT; t++)
+            debug_namespace_clear(&grp->Namespaces[s][t]);
+      }
+
+      free(grp);
+   }
+
+   debug->Groups[gstack] = NULL;
+}
+
+/**
+ * Loop through debug group stack tearing down states for
+ * filtering debug messages.  Then free debug output state.
+ */
+static void
+debug_destroy(struct gl_debug_state *debug)
+{
+   while (debug->CurrentGroup > 0) {
+      debug_clear_group(debug);
+      debug->CurrentGroup--;
+   }
+
+   debug_clear_group(debug);
+   free(debug);
+}
+
+/**
+ * Sets the state of the given message source/type/ID tuple.  Nothing is
+ * changed if the top group cannot be made writable (allocation failure).
+ */
+static void
+debug_set_message_enable(struct gl_debug_state *debug,
+                         enum mesa_debug_source source,
+                         enum mesa_debug_type type,
+                         GLuint id, GLboolean enabled)
+{
+   const GLint gstack = debug->CurrentGroup;
+   /* On failure the top group still aliases its parent: leave it alone. */
+   if (!debug_make_group_writable(debug))
+      return;
+   debug_namespace_set(&debug->Groups[gstack]->Namespaces[source][type],
+                       id, enabled);
+}
+
+/*
+ * Set the state of all message IDs found in the given intersection of
+ * 'source', 'type', and 'severity'.  The _COUNT enum can be used for
+ * GL_DONT_CARE (include all messages in the class).
+ *
+ * This updates both the per-ID elements already recorded in each affected
+ * namespace and the namespace's default state, so that yet-unknown message
+ * IDs used in the future are impacted as if they were already known.
+ *
+ * Nothing is changed if the top group cannot be made writable (OOM).
+ */
+static void
+debug_set_message_enable_all(struct gl_debug_state *debug,
+                             enum mesa_debug_source source,
+                             enum mesa_debug_type type,
+                             enum mesa_debug_severity severity,
+                             GLboolean enabled)
+{
+   const GLint gstack = debug->CurrentGroup;
+   int s, t, smax, tmax;
+
+   if (source == MESA_DEBUG_SOURCE_COUNT) {
+      source = 0;
+      smax = MESA_DEBUG_SOURCE_COUNT;
+   } else {
+      smax = source+1;
+   }
+
+   if (type == MESA_DEBUG_TYPE_COUNT) {
+      type = 0;
+      tmax = MESA_DEBUG_TYPE_COUNT;
+   } else {
+      tmax = type+1;
+   }
+
+   if (!debug_make_group_writable(debug))
+      return;
+
+   for (s = source; s < smax; s++) {
+      for (t = type; t < tmax; t++) {
+         debug_namespace_set_all(&debug->Groups[gstack]->Namespaces[s][t],
+                                 severity, enabled);
+      }
+   }
+}
+
+/**
+ * Returns if the given message source/type/ID tuple is enabled.
+ */
+bool
+_mesa_debug_is_message_enabled(const struct gl_debug_state *debug,
+                               enum mesa_debug_source source,
+                               enum mesa_debug_type type,
+                               GLuint id,
+                               enum mesa_debug_severity severity)
+{
+   const GLint gstack = debug->CurrentGroup;
+   struct gl_debug_group *grp = debug->Groups[gstack];
+   struct gl_debug_namespace *nspace = &grp->Namespaces[source][type];
+
+   if (!debug->DebugOutput)
+      return false;
+
+   return debug_namespace_get(nspace, id, severity);
+}
+
+/**
+ * 'buf' is not necessarily a null-terminated string. When logging, copy
+ * 'len' characters from it (strlen(buf) characters if 'len' is negative),
+ * store them in a new, null-terminated string, and record 'len' exactly
+ * as the caller passed it.
+ */
+static void
+debug_log_message(struct gl_debug_state *debug,
+                  enum mesa_debug_source source,
+                  enum mesa_debug_type type, GLuint id,
+                  enum mesa_debug_severity severity,
+                  GLsizei len, const char *buf)
+{
+   struct gl_debug_log *log = &debug->Log;
+   GLint nextEmpty;
+   struct gl_debug_message *emptySlot;
+
+   assert(len < MAX_DEBUG_MESSAGE_LENGTH);
+
+   if (log->NumMessages == MAX_DEBUG_LOGGED_MESSAGES)
+      return;
+
+   nextEmpty = (log->NextMessage + log->NumMessages)
+      % MAX_DEBUG_LOGGED_MESSAGES;
+   emptySlot = &log->Messages[nextEmpty];
+
+   debug_message_store(emptySlot, source, type,
+                       id, severity, len, buf);
+
+   log->NumMessages++;
+}
+
+/**
+ * Return the oldest debug message out of the log.
+ */
+static const struct gl_debug_message *
+debug_fetch_message(const struct gl_debug_state *debug)
+{
+   const struct gl_debug_log *log = &debug->Log;
+
+   return (log->NumMessages) ? &log->Messages[log->NextMessage] : NULL;
+}
+
+/**
+ * Delete the oldest debug messages out of the log.
+ */
+static void
+debug_delete_messages(struct gl_debug_state *debug, int count)
+{
+   struct gl_debug_log *log = &debug->Log;
+
+   if (count > log->NumMessages)
+      count = log->NumMessages;
+
+   while (count--) {
+      struct gl_debug_message *msg = &log->Messages[log->NextMessage];
+
+      debug_message_clear(msg);
+
+      log->NumMessages--;
+      log->NextMessage++;
+      log->NextMessage %= MAX_DEBUG_LOGGED_MESSAGES;
+   }
+}
+
+static struct gl_debug_message *
+debug_get_group_message(struct gl_debug_state *debug)
+{
+   return &debug->GroupMessages[debug->CurrentGroup];
+}
+
+static void
+debug_push_group(struct gl_debug_state *debug)
+{
+   const GLint gstack = debug->CurrentGroup;
+
+   /* just point to the previous stack */
+   debug->Groups[gstack + 1] = debug->Groups[gstack];
+   debug->CurrentGroup++;
+}
+
+static void
+debug_pop_group(struct gl_debug_state *debug)
+{
+   debug_clear_group(debug);
+   debug->CurrentGroup--;
+}
+
+
+/**
+ * Lock and return debug state for the context.  The debug state will be
+ * allocated and initialized upon the first call.  When NULL is returned, the
+ * debug state is not locked.
+ */
+static struct gl_debug_state *
+_mesa_lock_debug_state(struct gl_context *ctx)
+{
+   mtx_lock(&ctx->DebugMutex);
+
+   if (!ctx->Debug) {
+      ctx->Debug = debug_create();
+      if (!ctx->Debug) {
+         GET_CURRENT_CONTEXT(cur);
+         mtx_unlock(&ctx->DebugMutex);
+
+         /*
+          * This function may be called from other threads.  When that is the
+          * case, we cannot record this OOM error.
+          */
+         if (ctx == cur)
+            _mesa_error(ctx, GL_OUT_OF_MEMORY, "allocating debug state");
+
+         return NULL;
+      }
+   }
+
+   return ctx->Debug;
+}
+
+static void
+_mesa_unlock_debug_state(struct gl_context *ctx)
+{
+   mtx_unlock(&ctx->DebugMutex);
+}
+
+/**
+ * Set the integer debug state specified by \p pname.  This can be called from
+ * _mesa_set_enable for example.
+ */
+bool
+_mesa_set_debug_state_int(struct gl_context *ctx, GLenum pname, GLint val)
+{
+   struct gl_debug_state *debug = _mesa_lock_debug_state(ctx);
+
+   if (!debug)
+      return false;
+
+   switch (pname) {
+   case GL_DEBUG_OUTPUT:
+      debug->DebugOutput = (val != 0);
+      break;
+   case GL_DEBUG_OUTPUT_SYNCHRONOUS_ARB:
+      debug->SyncOutput = (val != 0);
+      break;
+   default:
+      assert(!"unknown debug output param");
+      break;
+   }
+
+   _mesa_unlock_debug_state(ctx);
+
+   return true;
+}
+
+/**
+ * Query the integer debug state specified by \p pname.  This can be called
+ * from _mesa_GetIntegerv for example.
+ */
+GLint
+_mesa_get_debug_state_int(struct gl_context *ctx, GLenum pname)
+{
+   struct gl_debug_state *debug;
+   GLint val;
+
+   mtx_lock(&ctx->DebugMutex);
+   debug = ctx->Debug;
+   if (!debug) {
+      mtx_unlock(&ctx->DebugMutex);
+      return 0;
+   }
+
+   switch (pname) {
+   case GL_DEBUG_OUTPUT:
+      val = debug->DebugOutput;
+      break;
+   case GL_DEBUG_OUTPUT_SYNCHRONOUS_ARB:
+      val = debug->SyncOutput;
+      break;
+   case GL_DEBUG_LOGGED_MESSAGES:
+      val = debug->Log.NumMessages;
+      break;
+   case GL_DEBUG_NEXT_LOGGED_MESSAGE_LENGTH:
+      val = (debug->Log.NumMessages) ?
+         debug->Log.Messages[debug->Log.NextMessage].length : 0;
+      break;
+   case GL_DEBUG_GROUP_STACK_DEPTH:
+      val = debug->CurrentGroup + 1;
+      break;
+   default:
+      assert(!"unknown debug output param");
+      val = 0;
+      break;
+   }
+
+   mtx_unlock(&ctx->DebugMutex);
+
+   return val;
+}
+
+/**
+ * Query the pointer debug state specified by \p pname.  This can be called
+ * from _mesa_GetPointerv for example.
+ */
+void *
+_mesa_get_debug_state_ptr(struct gl_context *ctx, GLenum pname)
+{
+   struct gl_debug_state *debug;
+   void *val;
+
+   mtx_lock(&ctx->DebugMutex);
+   debug = ctx->Debug;
+   if (!debug) {
+      mtx_unlock(&ctx->DebugMutex);
+      return NULL;
+   }
+
+   switch (pname) {
+   case GL_DEBUG_CALLBACK_FUNCTION_ARB:
+      val = (void *) debug->Callback;
+      break;
+   case GL_DEBUG_CALLBACK_USER_PARAM_ARB:
+      val = (void *) debug->CallbackData;
+      break;
+   default:
+      assert(!"unknown debug output param");
+      val = NULL;
+      break;
+   }
+
+   mtx_unlock(&ctx->DebugMutex);
+
+   return val;
+}
+
+/**
+ * Insert a debug message.  The mutex is assumed to be locked, and will be
+ * unlocked by this call.
+ */
+static void
+log_msg_locked_and_unlock(struct gl_context *ctx,
+                          enum mesa_debug_source source,
+                          enum mesa_debug_type type, GLuint id,
+                          enum mesa_debug_severity severity,
+                          GLint len, const char *buf)
+{
+   struct gl_debug_state *debug = ctx->Debug;
+
+   if (!_mesa_debug_is_message_enabled(debug, source, type, id, severity)) {
+      _mesa_unlock_debug_state(ctx);
+      return;
+   }
+
+   if (ctx->Debug->Callback) {
+      GLenum gl_source = debug_source_enums[source];
+      GLenum gl_type = debug_type_enums[type];
+      GLenum gl_severity = debug_severity_enums[severity];
+      GLDEBUGPROC callback = ctx->Debug->Callback;
+      const void *data = ctx->Debug->CallbackData;
+
+      /*
+       * When ctx->Debug->SyncOutput is GL_FALSE, the client is prepared for
+       * asynchronous calls.  When it is GL_TRUE, we will not spawn threads.
+       * In either case, we can call the callback unlocked.
+       */
+      _mesa_unlock_debug_state(ctx);
+      callback(gl_source, gl_type, id, gl_severity, len, buf, data);
+   }
+   else {
+      debug_log_message(ctx->Debug, source, type, id, severity, len, buf);
+      _mesa_unlock_debug_state(ctx);
+   }
+}
+
+/**
+ * Log a client or driver debug message.
+ */
+void
+_mesa_log_msg(struct gl_context *ctx, enum mesa_debug_source source,
+              enum mesa_debug_type type, GLuint id,
+              enum mesa_debug_severity severity, GLint len, const char *buf)
+{
+   struct gl_debug_state *debug = _mesa_lock_debug_state(ctx);
+
+   if (!debug)
+      return;
+
+   log_msg_locked_and_unlock(ctx, source, type, id, severity, len, buf);
+}
+
+
+/**
+ * Verify that source, type, and severity are valid enums.
+ *
+ * The 'caller' param is used for handling values available
+ * only in glDebugMessageInsert or glDebugMessageControl
+ */
+static GLboolean
+validate_params(struct gl_context *ctx, unsigned caller,
+                const char *callerstr, GLenum source, GLenum type,
+                GLenum severity)
+{
+#define INSERT 1
+#define CONTROL 2
+   switch(source) {
+   case GL_DEBUG_SOURCE_APPLICATION_ARB:
+   case GL_DEBUG_SOURCE_THIRD_PARTY_ARB:
+      break;
+   case GL_DEBUG_SOURCE_API_ARB:
+   case GL_DEBUG_SOURCE_SHADER_COMPILER_ARB:
+   case GL_DEBUG_SOURCE_WINDOW_SYSTEM_ARB:
+   case GL_DEBUG_SOURCE_OTHER_ARB:
+      if (caller != INSERT)
+         break;
+      else
+         goto error;
+   case GL_DONT_CARE:
+      if (caller == CONTROL)
+         break;
+      else
+         goto error;
+   default:
+      goto error;
+   }
+
+   switch(type) {
+   case GL_DEBUG_TYPE_ERROR_ARB:
+   case GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR_ARB:
+   case GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR_ARB:
+   case GL_DEBUG_TYPE_PERFORMANCE_ARB:
+   case GL_DEBUG_TYPE_PORTABILITY_ARB:
+   case GL_DEBUG_TYPE_OTHER_ARB:
+   case GL_DEBUG_TYPE_MARKER:
+   case GL_DEBUG_TYPE_PUSH_GROUP:
+   case GL_DEBUG_TYPE_POP_GROUP:
+      break;
+   case GL_DONT_CARE:
+      if (caller == CONTROL)
+         break;
+      else
+         goto error;
+   default:
+      goto error;
+   }
+
+   switch(severity) {
+   case GL_DEBUG_SEVERITY_HIGH_ARB:
+   case GL_DEBUG_SEVERITY_MEDIUM_ARB:
+   case GL_DEBUG_SEVERITY_LOW_ARB:
+   case GL_DEBUG_SEVERITY_NOTIFICATION:
+      break;
+   case GL_DONT_CARE:
+      if (caller == CONTROL)
+         break;
+      else
+         goto error;
+   default:
+      goto error;
+   }
+   return GL_TRUE;
+
+error:
+   _mesa_error(ctx, GL_INVALID_ENUM, "bad values passed to %s"
+               "(source=0x%x, type=0x%x, severity=0x%x)", callerstr,
+               source, type, severity);
+
+   return GL_FALSE;
+}
+
+
+/**
+ * Reject a debug message whose length reaches MAX_DEBUG_MESSAGE_LENGTH.
+ * A negative \p length means \p buf is null terminated and is measured here.
+ */
+static GLboolean
+validate_length(struct gl_context *ctx, const char *callerstr, GLsizei length,
+                const GLchar *buf)
+{
+   if (length < 0) {
+      const GLsizei measured = strlen(buf);
+      if (measured < MAX_DEBUG_MESSAGE_LENGTH)
+         return GL_TRUE;
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(null terminated string length=%d, is not less than "
+                  "GL_MAX_DEBUG_MESSAGE_LENGTH=%d)", callerstr, measured,
+                  MAX_DEBUG_MESSAGE_LENGTH);
+      return GL_FALSE;
+   }
+
+   if (length < MAX_DEBUG_MESSAGE_LENGTH)
+      return GL_TRUE;
+   _mesa_error(ctx, GL_INVALID_VALUE,
+               "%s(length=%d, which is not less than "
+               "GL_MAX_DEBUG_MESSAGE_LENGTH=%d)", callerstr, length,
+               MAX_DEBUG_MESSAGE_LENGTH);
+   return GL_FALSE;
+}
+
+
+/**
+ * Validate and log a caller-supplied debug message.
+ * A negative \p length means \p buf is null terminated.
+ */
+void GLAPIENTRY
+_mesa_DebugMessageInsert(GLenum source, GLenum type, GLuint id,
+                         GLenum severity, GLint length,
+                         const GLchar *buf)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   const char *callerstr = _mesa_is_desktop_gl(ctx) ?
+      "glDebugMessageInsert" : "glDebugMessageInsertKHR";
+
+   /* Either check records its own GL error (INVALID_ENUM / INVALID_VALUE). */
+   if (!validate_params(ctx, INSERT, callerstr, source, type, severity) ||
+       !validate_length(ctx, callerstr, length, buf))
+      return;
+
+   _mesa_log_msg(ctx, gl_enum_to_debug_source(source),
+                 gl_enum_to_debug_type(type), id,
+                 gl_enum_to_debug_severity(severity),
+                 length, buf);
+
+   /* Markers are additionally handed to the driver when it has the hook. */
+   if (type != GL_DEBUG_TYPE_MARKER || !ctx->Driver.EmitStringMarker)
+      return;
+
+   /* the hook takes an explicit length, so measure null-terminated text */
+   ctx->Driver.EmitStringMarker(ctx, buf,
+                                length < 0 ? (GLint) strlen(buf) : length);
+}
+
+/* Fetch up to 'count' messages from the debug log; returns how many. */
+GLuint GLAPIENTRY
+_mesa_GetDebugMessageLog(GLuint count, GLsizei logSize, GLenum *sources,
+                         GLenum *types, GLenum *ids, GLenum *severities,
+                         GLsizei *lengths, GLchar *messageLog)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_debug_state *debug;
+   const char *callerstr;
+   GLuint ret;
+
+   if (_mesa_is_desktop_gl(ctx))
+      callerstr = "glGetDebugMessageLog";
+   else
+      callerstr = "glGetDebugMessageLogKHR";
+   /* with no text buffer, logSize is ignored (and is never an error) */
+   if (!messageLog)
+      logSize = 0;
+
+   if (logSize < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(logSize=%d : logSize must not be negative)",
+                  callerstr, logSize);
+      return 0;
+   }
+
+   debug = _mesa_lock_debug_state(ctx);
+   if (!debug)
+      return 0;
+
+   for (ret = 0; ret < count; ret++) {
+      const struct gl_debug_message *msg = debug_fetch_message(debug);
+      GLsizei len;
+
+      if (!msg)
+         break;
+
+      len = msg->length;
+      if (len < 0)
+         len = strlen(msg->message);
+      /* stop when the text plus its terminator no longer fits */
+      if (logSize < len+1 && messageLog != NULL)
+         break;
+
+      if (messageLog) {
+         assert(msg->message[len] == '\0');
+         (void) strncpy(messageLog, msg->message, (size_t)len+1);
+
+         messageLog += len+1;
+         logSize -= len+1;
+      }
+
+      if (lengths)
+         *lengths++ = len+1;
+      if (severities)
+         *severities++ = debug_severity_enums[msg->severity];
+      if (sources)
+         *sources++ = debug_source_enums[msg->source];
+      if (types)
+         *types++ = debug_type_enums[msg->type];
+      if (ids)
+         *ids++ = msg->id;
+      /* presumably drops the message just fetched from the log */
+      debug_delete_messages(debug, 1);
+   }
+
+   _mesa_unlock_debug_state(ctx);
+
+   return ret;
+}
+
+
+/**
+ * Enable or disable debug messages: either the \p count IDs listed in
+ * \p ids, or (when \p count is 0) everything matching source/type/severity.
+ */
+void GLAPIENTRY
+_mesa_DebugMessageControl(GLenum gl_source, GLenum gl_type,
+                          GLenum gl_severity, GLsizei count,
+                          const GLuint *ids, GLboolean enabled)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   enum mesa_debug_source source = gl_enum_to_debug_source(gl_source);
+   enum mesa_debug_type type = gl_enum_to_debug_type(gl_type);
+   enum mesa_debug_severity severity = gl_enum_to_debug_severity(gl_severity);
+   const char *callerstr = _mesa_is_desktop_gl(ctx) ?
+      "glDebugMessageControl" : "glDebugMessageControlKHR";
+   struct gl_debug_state *debug;
+
+   if (count < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(count=%d : count must not be negative)", callerstr,
+                  count);
+      return;
+   }
+
+   if (!validate_params(ctx, CONTROL, callerstr, gl_source, gl_type,
+                        gl_severity))
+      return; /* GL_INVALID_ENUM */
+
+   /* An ID list only makes sense within one concrete source/type pair. */
+   if (count && (gl_severity != GL_DONT_CARE || gl_type == GL_DONT_CARE
+                 || gl_source == GL_DONT_CARE)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(When passing an array of ids, severity must be"
+                  " GL_DONT_CARE, and source and type must not be"
+                  " GL_DONT_CARE.)", callerstr);
+      return;
+   }
+
+   debug = _mesa_lock_debug_state(ctx);
+   if (!debug)
+      return;
+
+   if (count) {
+      GLsizei i;
+      for (i = 0; i < count; i++)
+         debug_set_message_enable(debug, source, type, ids[i], enabled);
+   } else {
+      debug_set_message_enable_all(debug, source, type, severity, enabled);
+   }
+
+   _mesa_unlock_debug_state(ctx);
+}
+
+/* Store the caller's debug callback and its user pointer in the debug state. */
+void GLAPIENTRY
+_mesa_DebugMessageCallback(GLDEBUGPROC callback, const void *userParam)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_debug_state *state = _mesa_lock_debug_state(ctx);
+   if (!state)
+      return;
+   state->CallbackData = userParam;
+   state->Callback = callback;
+   _mesa_unlock_debug_state(ctx);
+}
+
+
+/**
+ * Push a debug group: remember the group's message for the matching pop,
+ * push the group stack and log a PUSH_GROUP notification.
+ * Only the APPLICATION and THIRD_PARTY sources are accepted.
+ */
+void GLAPIENTRY
+_mesa_PushDebugGroup(GLenum source, GLuint id, GLsizei length,
+                     const GLchar *message)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   const char *callerstr = _mesa_is_desktop_gl(ctx) ?
+      "glPushDebugGroup" : "glPushDebugGroupKHR";
+   struct gl_debug_state *debug;
+   struct gl_debug_message *slot;
+   enum mesa_debug_source msg_source;
+
+   if (source != GL_DEBUG_SOURCE_APPLICATION &&
+       source != GL_DEBUG_SOURCE_THIRD_PARTY) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "bad value passed to %s"
+                  "(source=0x%x)", callerstr, source);
+      return;
+   }
+
+   if (!validate_length(ctx, callerstr, length, message))
+      return; /* GL_INVALID_VALUE */
+
+   debug = _mesa_lock_debug_state(ctx);
+   if (!debug)
+      return;
+
+   if (debug->CurrentGroup >= MAX_DEBUG_GROUP_STACK_DEPTH-1) {
+      _mesa_unlock_debug_state(ctx);
+      _mesa_error(ctx, GL_STACK_OVERFLOW, "%s", callerstr);
+      return;
+   }
+
+   msg_source = gl_enum_to_debug_source(source);
+
+   /* keep the details in the group slot; the matching pop reuses them */
+   slot = debug_get_group_message(debug);
+   debug_message_store(slot, msg_source,
+                       gl_enum_to_debug_type(GL_DEBUG_TYPE_PUSH_GROUP),
+                       id,
+                       gl_enum_to_debug_severity(GL_DEBUG_SEVERITY_NOTIFICATION),
+                       length, message);
+
+   debug_push_group(debug);
+
+   /* NOTE(review): per its name this also releases the debug-state lock */
+   log_msg_locked_and_unlock(ctx, msg_source,
+                             MESA_DEBUG_TYPE_PUSH_GROUP, id,
+                             MESA_DEBUG_SEVERITY_NOTIFICATION, length,
+                             message);
+}
+
+/* Pop the current debug group and log its stored message as POP_GROUP. */
+void GLAPIENTRY
+_mesa_PopDebugGroup(void)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   const char *callerstr;
+   struct gl_debug_state *debug;
+   struct gl_debug_message *gdmessage, msg;
+
+   if (_mesa_is_desktop_gl(ctx))
+      callerstr = "glPopDebugGroup";
+   else
+      callerstr = "glPopDebugGroupKHR";
+
+   debug = _mesa_lock_debug_state(ctx);
+   if (!debug)
+      return;
+
+   if (debug->CurrentGroup <= 0) {
+      _mesa_unlock_debug_state(ctx);
+      _mesa_error(ctx, GL_STACK_UNDERFLOW, "%s", callerstr);
+      return;
+   }
+
+   debug_pop_group(debug);
+
+   /* make a shallow copy; the slot is emptied so msg now holds the text */
+   gdmessage = debug_get_group_message(debug);
+   msg = *gdmessage;
+   gdmessage->message = NULL;
+   gdmessage->length = 0;
+
+   log_msg_locked_and_unlock(ctx,
+         msg.source,
+         gl_enum_to_debug_type(GL_DEBUG_TYPE_POP_GROUP),
+         msg.id,
+         gl_enum_to_debug_severity(GL_DEBUG_SEVERITY_NOTIFICATION),
+         msg.length, msg.message);
+   /* release the message text that was taken over from the group slot */
+   debug_message_clear(&msg);
+}
+
+/* Initialize ctx->DebugMutex (a mtx_plain mutex). */
+void
+_mesa_init_debug_output(struct gl_context *ctx)
+{
+   mtx_init(&ctx->DebugMutex, mtx_plain);
+}
+
+/* Destroy ctx->Debug (when non-NULL) and then ctx->DebugMutex. */
+void
+_mesa_free_errors_data(struct gl_context *ctx)
+{
+   if (ctx->Debug) {
+      debug_destroy(ctx->Debug);
+      /* set to NULL just in case it is used before context is completely gone. */
+      ctx->Debug = NULL;
+   }
+
+   mtx_destroy(&ctx->DebugMutex);
+}
+
+void GLAPIENTRY
+_mesa_StringMarkerGREMEDY(GLsizei len, const GLvoid *string)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   if (!ctx->Extensions.GREMEDY_string_marker) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "StringMarkerGREMEDY");
+      return;
+   }
+   /* a non-positive len means the marker string is null terminated */
+   if (len <= 0)
+      len = strlen(string);
+   ctx->Driver.EmitStringMarker(ctx, string, len);
+}
diff --git a/src/mesa/main/debug_output.h b/src/mesa/main/debug_output.h
new file mode 100644
index 00000000000..9d8be4f2273
--- /dev/null
+++ b/src/mesa/main/debug_output.h
@@ -0,0 +1,107 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 1999-2016  Brian Paul, et al   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef DEBUG_OUTPUT_H
+#define DEBUG_OUTPUT_H
+
+
+#include <stdio.h>
+#include <stdarg.h>
+#include "compiler.h"
+#include "glheader.h"
+#include "mtypes.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Debug-output helpers first, then the GLAPIENTRY entry points. */
+void
+_mesa_init_debug_output(struct gl_context *ctx);
+
+void
+_mesa_free_errors_data(struct gl_context *ctx);
+
+void
+_mesa_debug_get_id(GLuint *id);
+
+bool
+_mesa_set_debug_state_int(struct gl_context *ctx, GLenum pname, GLint val);
+
+GLint
+_mesa_get_debug_state_int(struct gl_context *ctx, GLenum pname);
+
+void *
+_mesa_get_debug_state_ptr(struct gl_context *ctx, GLenum pname);
+
+void
+_mesa_log_msg(struct gl_context *ctx, enum mesa_debug_source source,
+              enum mesa_debug_type type, GLuint id,
+              enum mesa_debug_severity severity, GLint len, const char *buf);
+
+bool
+_mesa_debug_is_message_enabled(const struct gl_debug_state *debug,
+                               enum mesa_debug_source source,
+                               enum mesa_debug_type type,
+                               GLuint id,
+                               enum mesa_debug_severity severity);
+
+void GLAPIENTRY
+_mesa_DebugMessageInsert(GLenum source, GLenum type, GLuint id,
+                         GLenum severity, GLint length,
+                         const GLchar* buf);
+
+GLuint GLAPIENTRY
+_mesa_GetDebugMessageLog(GLuint count, GLsizei logSize, GLenum* sources,
+                         GLenum* types, GLenum* ids, GLenum* severities,
+                         GLsizei* lengths, GLchar* messageLog);
+
+void GLAPIENTRY
+_mesa_DebugMessageControl(GLenum source, GLenum type, GLenum severity,
+                          GLsizei count, const GLuint *ids,
+                          GLboolean enabled);
+
+void GLAPIENTRY
+_mesa_DebugMessageCallback(GLDEBUGPROC callback,
+                           const void *userParam);
+
+void GLAPIENTRY
+_mesa_PushDebugGroup(GLenum source, GLuint id, GLsizei length,
+                     const GLchar *message);
+
+void GLAPIENTRY
+_mesa_PopDebugGroup(void);
+
+void GLAPIENTRY
+_mesa_StringMarkerGREMEDY(GLsizei len, const GLvoid *string);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* DEBUG_OUTPUT_H */
diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c
index cd8e3b6a2f2..fb31d2f2706 100644
--- a/src/mesa/main/dlist.c
+++ b/src/mesa/main/dlist.c
@@ -194,7 +194,7 @@ typedef enum
    OPCODE_BLEND_FUNC_SEPARATE_I,
 
    OPCODE_CALL_LIST,
-   OPCODE_CALL_LIST_OFFSET,
+   OPCODE_CALL_LISTS,
    OPCODE_CLEAR,
    OPCODE_CLEAR_ACCUM,
    OPCODE_CLEAR_COLOR,
@@ -706,6 +706,10 @@ _mesa_delete_list(struct gl_context *ctx, struct gl_display_list *dlist)
             free(get_pointer(&n[10]));
             n += InstSize[n[0].opcode];
             break;
+         case OPCODE_CALL_LISTS:
+            free(get_pointer(&n[3]));
+            n += InstSize[n[0].opcode];
+            break;
          case OPCODE_DRAW_PIXELS:
             free(get_pointer(&n[5]));
             n += InstSize[n[0].opcode];
@@ -1569,37 +1573,49 @@ static void GLAPIENTRY
 save_CallLists(GLsizei num, GLenum type, const GLvoid * lists)
 {
    GET_CURRENT_CONTEXT(ctx);
-   GLint i;
-   GLboolean typeErrorFlag;
+   unsigned type_size;
+   Node *n;
+   void *lists_copy;
 
    SAVE_FLUSH_VERTICES(ctx);
 
    switch (type) {
    case GL_BYTE:
    case GL_UNSIGNED_BYTE:
+      type_size = 1;
+      break;
    case GL_SHORT:
    case GL_UNSIGNED_SHORT:
+   case GL_2_BYTES:
+      type_size = 2;
+      break;
+   case GL_3_BYTES:
+      type_size = 3;
+      break;
    case GL_INT:
    case GL_UNSIGNED_INT:
    case GL_FLOAT:
-   case GL_2_BYTES:
-   case GL_3_BYTES:
    case GL_4_BYTES:
-      typeErrorFlag = GL_FALSE;
+      type_size = 4;
       break;
    default:
-      typeErrorFlag = GL_TRUE;
+      type_size = 0;
    }
 
-   for (i = 0; i < num; i++) {
-      GLint list = translate_id(i, type, lists);
-      Node *n = alloc_instruction(ctx, OPCODE_CALL_LIST_OFFSET, 2);
-      if (n) {
-         n[1].i = list;
-         n[2].b = typeErrorFlag;
-      }
+   if (num > 0 && type_size > 0) {
+      /* create a copy of the array of list IDs to save in the display list */
+      lists_copy = memdup(lists, num * type_size);
+   } else {
+      lists_copy = NULL;
    }
 
+   n = alloc_instruction(ctx, OPCODE_CALL_LISTS, 2 + POINTER_DWORDS);
+   if (n) {
+      n[1].i = num;
+      n[2].e = type;
+      save_pointer(&n[3], lists_copy);
+   };
+
    /* After this, we don't know what state we're in.  Invalidate all
     * cached information previously gathered:
     */
@@ -7772,15 +7788,9 @@ execute_list(struct gl_context *ctx, GLuint list)
                execute_list(ctx, n[1].ui);
             }
             break;
-         case OPCODE_CALL_LIST_OFFSET:
-            /* Generated by glCallLists() so we must add ListBase */
-            if (n[2].b) {
-               /* user specified a bad data type at compile time */
-               _mesa_error(ctx, GL_INVALID_ENUM, "glCallLists(type)");
-            }
-            else if (ctx->ListState.CallDepth < MAX_LIST_NESTING) {
-               GLuint list = (GLuint) (ctx->List.ListBase + n[1].i);
-               execute_list(ctx, list);
+         case OPCODE_CALL_LISTS:
+            if (ctx->ListState.CallDepth < MAX_LIST_NESTING) {
+               CALL_CallLists(ctx->Exec, (n[1].i, n[2].e, get_pointer(&n[3])));
             }
             break;
          case OPCODE_CLEAR:
@@ -9105,6 +9115,14 @@ _mesa_CallLists(GLsizei n, GLenum type, const GLvoid * lists)
       return;
    }
 
+   if (n < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glCallLists(n < 0)");
+      return;
+   } else if (n == 0 || lists == NULL) {
+      /* nothing to do */
+      return;
+   }
+
    /* Save the CompileFlag status, turn it off, execute display list,
     * and restore the CompileFlag.
     */
@@ -9728,9 +9746,8 @@ print_list(struct gl_context *ctx, GLuint list, const char *fname)
          case OPCODE_CALL_LIST:
             fprintf(f, "CallList %d\n", (int) n[1].ui);
             break;
-         case OPCODE_CALL_LIST_OFFSET:
-            fprintf(f, "CallList %d + offset %u = %u\n", (int) n[1].ui,
-                         ctx->List.ListBase, ctx->List.ListBase + n[1].ui);
+         case OPCODE_CALL_LISTS:
+            fprintf(f, "CallLists %d, %s\n", n[1].i, enum_string(n[2].e));
             break;
          case OPCODE_DISABLE:
             fprintf(f, "Disable %s\n", enum_string(n[1].e));
diff --git a/src/mesa/main/dlist.h b/src/mesa/main/dlist.h
index a1214674c62..7a23208ba5a 100644
--- a/src/mesa/main/dlist.h
+++ b/src/mesa/main/dlist.h
@@ -38,46 +38,61 @@
 
 GLboolean GLAPIENTRY
 _mesa_IsList(GLuint list);
+
 void GLAPIENTRY
 _mesa_DeleteLists(GLuint list, GLsizei range);
+
 GLuint GLAPIENTRY
 _mesa_GenLists(GLsizei range);
+
 void GLAPIENTRY
 _mesa_NewList(GLuint name, GLenum mode);
+
 void GLAPIENTRY
 _mesa_EndList(void);
+
 void GLAPIENTRY
-_mesa_CallList( GLuint list );
+_mesa_CallList(GLuint list);
+
 void GLAPIENTRY
-_mesa_CallLists( GLsizei n, GLenum type, const GLvoid *lists );
+_mesa_CallLists(GLsizei n, GLenum type, const GLvoid *lists);
+
 void GLAPIENTRY
 _mesa_ListBase(GLuint base);
 
-extern struct gl_display_list *
+struct gl_display_list *
 _mesa_lookup_list(struct gl_context *ctx, GLuint list);
 
-extern void _mesa_compile_error( struct gl_context *ctx, GLenum error, const char *s );
+void
+_mesa_compile_error(struct gl_context *ctx, GLenum error, const char *s);
 
-extern void *_mesa_dlist_alloc(struct gl_context *ctx, GLuint opcode, GLuint sz);
+void *
+_mesa_dlist_alloc(struct gl_context *ctx, GLuint opcode, GLuint sz);
 
-extern void *
+void *
 _mesa_dlist_alloc_aligned(struct gl_context *ctx, GLuint opcode, GLuint bytes);
 
-extern GLint _mesa_dlist_alloc_opcode( struct gl_context *ctx, GLuint sz,
-                                       void (*execute)( struct gl_context *, void * ),
-                                       void (*destroy)( struct gl_context *, void * ),
-                                       void (*print)( struct gl_context *, void *, FILE * ) );
+GLint
+_mesa_dlist_alloc_opcode(struct gl_context *ctx, GLuint sz,
+                         void (*execute)(struct gl_context *, void *),
+                         void (*destroy)(struct gl_context *, void *),
+                         void (*print)(struct gl_context *, void *, FILE *));
 
-extern void _mesa_delete_list(struct gl_context *ctx, struct gl_display_list *dlist);
+void
+_mesa_delete_list(struct gl_context *ctx, struct gl_display_list *dlist);
 
-extern void _mesa_initialize_save_table(const struct gl_context *);
+void
+_mesa_initialize_save_table(const struct gl_context *);
 
-extern void _mesa_install_dlist_vtxfmt(struct _glapi_table *disp,
-                                       const GLvertexformat *vfmt);
+void
+_mesa_install_dlist_vtxfmt(struct _glapi_table *disp,
+                           const GLvertexformat *vfmt);
 
-extern void _mesa_init_display_list( struct gl_context * ctx );
+void
+_mesa_init_display_list(struct gl_context * ctx);
 
-extern void _mesa_free_display_list_data(struct gl_context *ctx);
+void
+_mesa_free_display_list_data(struct gl_context *ctx);
 
 
 #endif /* DLIST_H */
diff --git a/src/mesa/main/enable.c b/src/mesa/main/enable.c
index f7941817845..3fd3c2747ea 100644
--- a/src/mesa/main/enable.c
+++ b/src/mesa/main/enable.c
@@ -31,6 +31,7 @@
 #include "glheader.h"
 #include "clip.h"
 #include "context.h"
+#include "debug_output.h"
 #include "enable.h"
 #include "errors.h"
 #include "light.h"
diff --git a/src/mesa/main/errors.c b/src/mesa/main/errors.c
index 674364c7b0c..9932b4a5a89 100644
--- a/src/mesa/main/errors.c
+++ b/src/mesa/main/errors.c
@@ -34,6 +34,7 @@
 #include "enums.h"
 #include "imports.h"
 #include "context.h"
+#include "debug_output.h"
 #include "dispatch.h"
 #include "hash.h"
 #include "mtypes.h"
@@ -41,1265 +42,6 @@
 #include "util/hash_table.h"
 #include "util/simple_list.h"
 
-static mtx_t DynamicIDMutex = _MTX_INITIALIZER_NP;
-static GLuint NextDynamicID = 1;
-
-/**
- * A namespace element.
- */
-struct gl_debug_element
-{
-   struct simple_node link;
-
-   GLuint ID;
-   /* at which severity levels (mesa_debug_severity) is the message enabled */
-   GLbitfield State;
-};
-
-struct gl_debug_namespace
-{
-   struct simple_node Elements;
-   GLbitfield DefaultState;
-};
-
-struct gl_debug_group {
-   struct gl_debug_namespace Namespaces[MESA_DEBUG_SOURCE_COUNT][MESA_DEBUG_TYPE_COUNT];
-};
-
-/**
- * An error, warning, or other piece of debug information for an application
- * to consume via GL_ARB_debug_output/GL_KHR_debug.
- */
-struct gl_debug_message
-{
-   enum mesa_debug_source source;
-   enum mesa_debug_type type;
-   GLuint id;
-   enum mesa_debug_severity severity;
-   /* length as given by the user - if message was explicitly null terminated,
-    * length can be negative */
-   GLsizei length;
-   GLcharARB *message;
-};
-
-/**
- * Debug message log.  It works like a ring buffer.
- */
-struct gl_debug_log {
-   struct gl_debug_message Messages[MAX_DEBUG_LOGGED_MESSAGES];
-   GLint NextMessage;
-   GLint NumMessages;
-};
-
-struct gl_debug_state
-{
-   GLDEBUGPROC Callback;
-   const void *CallbackData;
-   GLboolean SyncOutput;
-   GLboolean DebugOutput;
-
-   struct gl_debug_group *Groups[MAX_DEBUG_GROUP_STACK_DEPTH];
-   struct gl_debug_message GroupMessages[MAX_DEBUG_GROUP_STACK_DEPTH];
-   GLint CurrentGroup; // GroupStackDepth - 1
-
-   struct gl_debug_log Log;
-};
-
-static char out_of_memory[] = "Debugging error: out of memory";
-
-static const GLenum debug_source_enums[] = {
-   GL_DEBUG_SOURCE_API,
-   GL_DEBUG_SOURCE_WINDOW_SYSTEM,
-   GL_DEBUG_SOURCE_SHADER_COMPILER,
-   GL_DEBUG_SOURCE_THIRD_PARTY,
-   GL_DEBUG_SOURCE_APPLICATION,
-   GL_DEBUG_SOURCE_OTHER,
-};
-
-static const GLenum debug_type_enums[] = {
-   GL_DEBUG_TYPE_ERROR,
-   GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR,
-   GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR,
-   GL_DEBUG_TYPE_PORTABILITY,
-   GL_DEBUG_TYPE_PERFORMANCE,
-   GL_DEBUG_TYPE_OTHER,
-   GL_DEBUG_TYPE_MARKER,
-   GL_DEBUG_TYPE_PUSH_GROUP,
-   GL_DEBUG_TYPE_POP_GROUP,
-};
-
-static const GLenum debug_severity_enums[] = {
-   GL_DEBUG_SEVERITY_LOW,
-   GL_DEBUG_SEVERITY_MEDIUM,
-   GL_DEBUG_SEVERITY_HIGH,
-   GL_DEBUG_SEVERITY_NOTIFICATION,
-};
-
-
-static enum mesa_debug_source
-gl_enum_to_debug_source(GLenum e)
-{
-   unsigned i;
-
-   for (i = 0; i < ARRAY_SIZE(debug_source_enums); i++) {
-      if (debug_source_enums[i] == e)
-         break;
-   }
-   return i;
-}
-
-static enum mesa_debug_type
-gl_enum_to_debug_type(GLenum e)
-{
-   unsigned i;
-
-   for (i = 0; i < ARRAY_SIZE(debug_type_enums); i++) {
-      if (debug_type_enums[i] == e)
-         break;
-   }
-   return i;
-}
-
-static enum mesa_debug_severity
-gl_enum_to_debug_severity(GLenum e)
-{
-   unsigned i;
-
-   for (i = 0; i < ARRAY_SIZE(debug_severity_enums); i++) {
-      if (debug_severity_enums[i] == e)
-         break;
-   }
-   return i;
-}
-
-
-/**
- * Handles generating a GL_ARB_debug_output message ID generated by the GL or
- * GLSL compiler.
- *
- * The GL API has this "ID" mechanism, where the intention is to allow a
- * client to filter in/out messages based on source, type, and ID.  Of course,
- * building a giant enum list of all debug output messages that Mesa might
- * generate is ridiculous, so instead we have our caller pass us a pointer to
- * static storage where the ID should get stored.  This ID will be shared
- * across all contexts for that message (which seems like a desirable
- * property, even if it's not expected by the spec), but note that it won't be
- * the same between executions if messages aren't generated in the same order.
- */
-static void
-debug_get_id(GLuint *id)
-{
-   if (!(*id)) {
-      mtx_lock(&DynamicIDMutex);
-      if (!(*id))
-         *id = NextDynamicID++;
-      mtx_unlock(&DynamicIDMutex);
-   }
-}
-
-static void
-debug_message_clear(struct gl_debug_message *msg)
-{
-   if (msg->message != (char*)out_of_memory)
-      free(msg->message);
-   msg->message = NULL;
-   msg->length = 0;
-}
-
-static void
-debug_message_store(struct gl_debug_message *msg,
-                    enum mesa_debug_source source,
-                    enum mesa_debug_type type, GLuint id,
-                    enum mesa_debug_severity severity,
-                    GLsizei len, const char *buf)
-{
-   GLsizei length = len;
-
-   assert(!msg->message && !msg->length);
-
-   if (length < 0)
-      length = strlen(buf);
-
-   msg->message = malloc(length+1);
-   if (msg->message) {
-      (void) strncpy(msg->message, buf, (size_t)length);
-      msg->message[length] = '\0';
-
-      msg->length = len;
-      msg->source = source;
-      msg->type = type;
-      msg->id = id;
-      msg->severity = severity;
-   } else {
-      static GLuint oom_msg_id = 0;
-      debug_get_id(&oom_msg_id);
-
-      /* malloc failed! */
-      msg->message = out_of_memory;
-      msg->length = -1;
-      msg->source = MESA_DEBUG_SOURCE_OTHER;
-      msg->type = MESA_DEBUG_TYPE_ERROR;
-      msg->id = oom_msg_id;
-      msg->severity = MESA_DEBUG_SEVERITY_HIGH;
-   }
-}
-
-static void
-debug_namespace_init(struct gl_debug_namespace *ns)
-{
-   make_empty_list(&ns->Elements);
-
-   /* Enable all the messages with severity HIGH or MEDIUM by default */
-   ns->DefaultState = (1 << MESA_DEBUG_SEVERITY_MEDIUM ) |
-                      (1 << MESA_DEBUG_SEVERITY_HIGH) |
-                      (1 << MESA_DEBUG_SEVERITY_NOTIFICATION);
-}
-
-static void
-debug_namespace_clear(struct gl_debug_namespace *ns)
-{
-   struct simple_node *node, *tmp;
-
-   foreach_s(node, tmp, &ns->Elements)
-      free(node);
-}
-
-static bool
-debug_namespace_copy(struct gl_debug_namespace *dst,
-                     const struct gl_debug_namespace *src)
-{
-   struct simple_node *node;
-
-   dst->DefaultState = src->DefaultState;
-
-   make_empty_list(&dst->Elements);
-   foreach(node, &src->Elements) {
-      const struct gl_debug_element *elem =
-         (const struct gl_debug_element *) node;
-      struct gl_debug_element *copy;
-
-      copy = malloc(sizeof(*copy));
-      if (!copy) {
-         debug_namespace_clear(dst);
-         return false;
-      }
-
-      copy->ID = elem->ID;
-      copy->State = elem->State;
-      insert_at_tail(&dst->Elements, &copy->link);
-   }
-
-   return true;
-}
-
-/**
- * Set the state of \p id in the namespace.
- */
-static bool
-debug_namespace_set(struct gl_debug_namespace *ns,
-                    GLuint id, bool enabled)
-{
-   const uint32_t state = (enabled) ?
-      ((1 << MESA_DEBUG_SEVERITY_COUNT) - 1) : 0;
-   struct gl_debug_element *elem = NULL;
-   struct simple_node *node;
-
-   /* find the element */
-   foreach(node, &ns->Elements) {
-      struct gl_debug_element *tmp = (struct gl_debug_element *) node;
-      if (tmp->ID == id) {
-         elem = tmp;
-         break;
-      }
-   }
-
-   /* we do not need the element if it has the default state */
-   if (ns->DefaultState == state) {
-      if (elem) {
-         remove_from_list(&elem->link);
-         free(elem);
-      }
-      return true;
-   }
-
-   if (!elem) {
-      elem = malloc(sizeof(*elem));
-      if (!elem)
-         return false;
-
-      elem->ID = id;
-      insert_at_tail(&ns->Elements, &elem->link);
-   }
-
-   elem->State = state;
-
-   return true;
-}
-
-/**
- * Set the default state of the namespace for \p severity.  When \p severity
- * is MESA_DEBUG_SEVERITY_COUNT, the default values for all severities are
- * updated.
- */
-static void
-debug_namespace_set_all(struct gl_debug_namespace *ns,
-                        enum mesa_debug_severity severity,
-                        bool enabled)
-{
-   struct simple_node *node, *tmp;
-   uint32_t mask, val;
-
-   /* set all elements to the same state */
-   if (severity == MESA_DEBUG_SEVERITY_COUNT) {
-      ns->DefaultState = (enabled) ? ((1 << severity) - 1) : 0;
-      debug_namespace_clear(ns);
-      make_empty_list(&ns->Elements);
-      return;
-   }
-
-   mask = 1 << severity;
-   val = (enabled) ? mask : 0;
-
-   ns->DefaultState = (ns->DefaultState & ~mask) | val;
-
-   foreach_s(node, tmp, &ns->Elements) {
-      struct gl_debug_element *elem = (struct gl_debug_element *) node;
-
-      elem->State = (elem->State & ~mask) | val;
-      if (elem->State == ns->DefaultState) {
-         remove_from_list(node);
-         free(node);
-      }
-   }
-}
-
-/**
- * Get the state of \p id in the namespace.
- */
-static bool
-debug_namespace_get(const struct gl_debug_namespace *ns, GLuint id,
-                    enum mesa_debug_severity severity)
-{
-   struct simple_node *node;
-   uint32_t state;
-
-   state = ns->DefaultState;
-   foreach(node, &ns->Elements) {
-      struct gl_debug_element *elem = (struct gl_debug_element *) node;
-
-      if (elem->ID == id) {
-         state = elem->State;
-         break;
-      }
-   }
-
-   return (state & (1 << severity));
-}
-
-/**
- * Allocate and initialize context debug state.
- */
-static struct gl_debug_state *
-debug_create(void)
-{
-   struct gl_debug_state *debug;
-   int s, t;
-
-   debug = CALLOC_STRUCT(gl_debug_state);
-   if (!debug)
-      return NULL;
-
-   debug->Groups[0] = malloc(sizeof(*debug->Groups[0]));
-   if (!debug->Groups[0]) {
-      free(debug);
-      return NULL;
-   }
-
-   /* Initialize state for filtering known debug messages. */
-   for (s = 0; s < MESA_DEBUG_SOURCE_COUNT; s++) {
-      for (t = 0; t < MESA_DEBUG_TYPE_COUNT; t++)
-         debug_namespace_init(&debug->Groups[0]->Namespaces[s][t]);
-   }
-
-   return debug;
-}
-
-/**
- * Return true if the top debug group points to the group below it.
- */
-static bool
-debug_is_group_read_only(const struct gl_debug_state *debug)
-{
-   const GLint gstack = debug->CurrentGroup;
-   return (gstack > 0 && debug->Groups[gstack] == debug->Groups[gstack - 1]);
-}
-
-/**
- * Make the top debug group writable.
- */
-static bool
-debug_make_group_writable(struct gl_debug_state *debug)
-{
-   const GLint gstack = debug->CurrentGroup;
-   const struct gl_debug_group *src = debug->Groups[gstack];
-   struct gl_debug_group *dst;
-   int s, t;
-
-   if (!debug_is_group_read_only(debug))
-      return true;
-
-   dst = malloc(sizeof(*dst));
-   if (!dst)
-      return false;
-
-   for (s = 0; s < MESA_DEBUG_SOURCE_COUNT; s++) {
-      for (t = 0; t < MESA_DEBUG_TYPE_COUNT; t++) {
-         if (!debug_namespace_copy(&dst->Namespaces[s][t],
-                                   &src->Namespaces[s][t])) {
-            /* error path! */
-            for (t = t - 1; t >= 0; t--)
-               debug_namespace_clear(&dst->Namespaces[s][t]);
-            for (s = s - 1; s >= 0; s--) {
-               for (t = 0; t < MESA_DEBUG_TYPE_COUNT; t++)
-                  debug_namespace_clear(&dst->Namespaces[s][t]);
-            }
-            free(dst);
-            return false;
-         }
-      }
-   }
-
-   debug->Groups[gstack] = dst;
-
-   return true;
-}
-
-/**
- * Free the top debug group.
- */
-static void
-debug_clear_group(struct gl_debug_state *debug)
-{
-   const GLint gstack = debug->CurrentGroup;
-
-   if (!debug_is_group_read_only(debug)) {
-      struct gl_debug_group *grp = debug->Groups[gstack];
-      int s, t;
-
-      for (s = 0; s < MESA_DEBUG_SOURCE_COUNT; s++) {
-         for (t = 0; t < MESA_DEBUG_TYPE_COUNT; t++)
-            debug_namespace_clear(&grp->Namespaces[s][t]);
-      }
-
-      free(grp);
-   }
-
-   debug->Groups[gstack] = NULL;
-}
-
-/**
- * Loop through debug group stack tearing down states for
- * filtering debug messages.  Then free debug output state.
- */
-static void
-debug_destroy(struct gl_debug_state *debug)
-{
-   while (debug->CurrentGroup > 0) {
-      debug_clear_group(debug);
-      debug->CurrentGroup--;
-   }
-
-   debug_clear_group(debug);
-   free(debug);
-}
-
-/**
- * Sets the state of the given message source/type/ID tuple.
- */
-static void
-debug_set_message_enable(struct gl_debug_state *debug,
-                         enum mesa_debug_source source,
-                         enum mesa_debug_type type,
-                         GLuint id, GLboolean enabled)
-{
-   const GLint gstack = debug->CurrentGroup;
-   struct gl_debug_namespace *ns;
-
-   debug_make_group_writable(debug);
-   ns = &debug->Groups[gstack]->Namespaces[source][type];
-
-   debug_namespace_set(ns, id, enabled);
-}
-
-/*
- * Set the state of all message IDs found in the given intersection of
- * 'source', 'type', and 'severity'.  The _COUNT enum can be used for
- * GL_DONT_CARE (include all messages in the class).
- *
- * This requires both setting the state of all previously seen message
- * IDs in the hash table, and setting the default state for all
- * applicable combinations of source/type/severity, so that all the
- * yet-unknown message IDs that may be used in the future will be
- * impacted as if they were already known.
- */
-static void
-debug_set_message_enable_all(struct gl_debug_state *debug,
-                             enum mesa_debug_source source,
-                             enum mesa_debug_type type,
-                             enum mesa_debug_severity severity,
-                             GLboolean enabled)
-{
-   const GLint gstack = debug->CurrentGroup;
-   int s, t, smax, tmax;
-
-   if (source == MESA_DEBUG_SOURCE_COUNT) {
-      source = 0;
-      smax = MESA_DEBUG_SOURCE_COUNT;
-   } else {
-      smax = source+1;
-   }
-
-   if (type == MESA_DEBUG_TYPE_COUNT) {
-      type = 0;
-      tmax = MESA_DEBUG_TYPE_COUNT;
-   } else {
-      tmax = type+1;
-   }
-
-   debug_make_group_writable(debug);
-
-   for (s = source; s < smax; s++) {
-      for (t = type; t < tmax; t++) {
-         struct gl_debug_namespace *nspace =
-            &debug->Groups[gstack]->Namespaces[s][t];
-         debug_namespace_set_all(nspace, severity, enabled);
-      }
-   }
-}
-
-/**
- * Returns if the given message source/type/ID tuple is enabled.
- */
-static bool
-debug_is_message_enabled(const struct gl_debug_state *debug,
-                         enum mesa_debug_source source,
-                         enum mesa_debug_type type,
-                         GLuint id,
-                         enum mesa_debug_severity severity)
-{
-   const GLint gstack = debug->CurrentGroup;
-   struct gl_debug_group *grp = debug->Groups[gstack];
-   struct gl_debug_namespace *nspace = &grp->Namespaces[source][type];
-
-   if (!debug->DebugOutput)
-      return false;
-
-   return debug_namespace_get(nspace, id, severity);
-}
-
-/**
- * 'buf' is not necessarily a null-terminated string. When logging, copy
- * 'len' characters from it, store them in a new, null-terminated string,
- * and remember the number of bytes used by that string, *including*
- * the null terminator this time.
- */
-static void
-debug_log_message(struct gl_debug_state *debug,
-                  enum mesa_debug_source source,
-                  enum mesa_debug_type type, GLuint id,
-                  enum mesa_debug_severity severity,
-                  GLsizei len, const char *buf)
-{
-   struct gl_debug_log *log = &debug->Log;
-   GLint nextEmpty;
-   struct gl_debug_message *emptySlot;
-
-   assert(len < MAX_DEBUG_MESSAGE_LENGTH);
-
-   if (log->NumMessages == MAX_DEBUG_LOGGED_MESSAGES)
-      return;
-
-   nextEmpty = (log->NextMessage + log->NumMessages)
-      % MAX_DEBUG_LOGGED_MESSAGES;
-   emptySlot = &log->Messages[nextEmpty];
-
-   debug_message_store(emptySlot, source, type,
-                       id, severity, len, buf);
-
-   log->NumMessages++;
-}
-
-/**
- * Return the oldest debug message out of the log.
- */
-static const struct gl_debug_message *
-debug_fetch_message(const struct gl_debug_state *debug)
-{
-   const struct gl_debug_log *log = &debug->Log;
-
-   return (log->NumMessages) ? &log->Messages[log->NextMessage] : NULL;
-}
-
-/**
- * Delete the oldest debug messages out of the log.
- */
-static void
-debug_delete_messages(struct gl_debug_state *debug, int count)
-{
-   struct gl_debug_log *log = &debug->Log;
-
-   if (count > log->NumMessages)
-      count = log->NumMessages;
-
-   while (count--) {
-      struct gl_debug_message *msg = &log->Messages[log->NextMessage];
-
-      debug_message_clear(msg);
-
-      log->NumMessages--;
-      log->NextMessage++;
-      log->NextMessage %= MAX_DEBUG_LOGGED_MESSAGES;
-   }
-}
-
-static struct gl_debug_message *
-debug_get_group_message(struct gl_debug_state *debug)
-{
-   return &debug->GroupMessages[debug->CurrentGroup];
-}
-
-static void
-debug_push_group(struct gl_debug_state *debug)
-{
-   const GLint gstack = debug->CurrentGroup;
-
-   /* just point to the previous stack */
-   debug->Groups[gstack + 1] = debug->Groups[gstack];
-   debug->CurrentGroup++;
-}
-
-static void
-debug_pop_group(struct gl_debug_state *debug)
-{
-   debug_clear_group(debug);
-   debug->CurrentGroup--;
-}
-
-
-/**
- * Lock and return debug state for the context.  The debug state will be
- * allocated and initialized upon the first call.  When NULL is returned, the
- * debug state is not locked.
- */
-static struct gl_debug_state *
-_mesa_lock_debug_state(struct gl_context *ctx)
-{
-   mtx_lock(&ctx->DebugMutex);
-
-   if (!ctx->Debug) {
-      ctx->Debug = debug_create();
-      if (!ctx->Debug) {
-         GET_CURRENT_CONTEXT(cur);
-         mtx_unlock(&ctx->DebugMutex);
-
-         /*
-          * This function may be called from other threads.  When that is the
-          * case, we cannot record this OOM error.
-          */
-         if (ctx == cur)
-            _mesa_error(ctx, GL_OUT_OF_MEMORY, "allocating debug state");
-
-         return NULL;
-      }
-   }
-
-   return ctx->Debug;
-}
-
-static void
-_mesa_unlock_debug_state(struct gl_context *ctx)
-{
-   mtx_unlock(&ctx->DebugMutex);
-}
-
-/**
- * Set the integer debug state specified by \p pname.  This can be called from
- * _mesa_set_enable for example.
- */
-bool
-_mesa_set_debug_state_int(struct gl_context *ctx, GLenum pname, GLint val)
-{
-   struct gl_debug_state *debug = _mesa_lock_debug_state(ctx);
-
-   if (!debug)
-      return false;
-
-   switch (pname) {
-   case GL_DEBUG_OUTPUT:
-      debug->DebugOutput = (val != 0);
-      break;
-   case GL_DEBUG_OUTPUT_SYNCHRONOUS_ARB:
-      debug->SyncOutput = (val != 0);
-      break;
-   default:
-      assert(!"unknown debug output param");
-      break;
-   }
-
-   _mesa_unlock_debug_state(ctx);
-
-   return true;
-}
-
-/**
- * Query the integer debug state specified by \p pname.  This can be called
- * _mesa_GetIntegerv for example.
- */
-GLint
-_mesa_get_debug_state_int(struct gl_context *ctx, GLenum pname)
-{
-   struct gl_debug_state *debug;
-   GLint val;
-
-   mtx_lock(&ctx->DebugMutex);
-   debug = ctx->Debug;
-   if (!debug) {
-      mtx_unlock(&ctx->DebugMutex);
-      return 0;
-   }
-
-   switch (pname) {
-   case GL_DEBUG_OUTPUT:
-      val = debug->DebugOutput;
-      break;
-   case GL_DEBUG_OUTPUT_SYNCHRONOUS_ARB:
-      val = debug->SyncOutput;
-      break;
-   case GL_DEBUG_LOGGED_MESSAGES:
-      val = debug->Log.NumMessages;
-      break;
-   case GL_DEBUG_NEXT_LOGGED_MESSAGE_LENGTH:
-      val = (debug->Log.NumMessages) ?
-         debug->Log.Messages[debug->Log.NextMessage].length : 0;
-      break;
-   case GL_DEBUG_GROUP_STACK_DEPTH:
-      val = debug->CurrentGroup + 1;
-      break;
-   default:
-      assert(!"unknown debug output param");
-      val = 0;
-      break;
-   }
-
-   mtx_unlock(&ctx->DebugMutex);
-
-   return val;
-}
-
-/**
- * Query the pointer debug state specified by \p pname.  This can be called
- * _mesa_GetPointerv for example.
- */
-void *
-_mesa_get_debug_state_ptr(struct gl_context *ctx, GLenum pname)
-{
-   struct gl_debug_state *debug;
-   void *val;
-
-   mtx_lock(&ctx->DebugMutex);
-   debug = ctx->Debug;
-   if (!debug) {
-      mtx_unlock(&ctx->DebugMutex);
-      return NULL;
-   }
-
-   switch (pname) {
-   case GL_DEBUG_CALLBACK_FUNCTION_ARB:
-      val = (void *) debug->Callback;
-      break;
-   case GL_DEBUG_CALLBACK_USER_PARAM_ARB:
-      val = (void *) debug->CallbackData;
-      break;
-   default:
-      assert(!"unknown debug output param");
-      val = NULL;
-      break;
-   }
-
-   mtx_unlock(&ctx->DebugMutex);
-
-   return val;
-}
-
-/**
- * Insert a debug message.  The mutex is assumed to be locked, and will be
- * unlocked by this call.
- */
-static void
-log_msg_locked_and_unlock(struct gl_context *ctx,
-                          enum mesa_debug_source source,
-                          enum mesa_debug_type type, GLuint id,
-                          enum mesa_debug_severity severity,
-                          GLint len, const char *buf)
-{
-   struct gl_debug_state *debug = ctx->Debug;
-
-   if (!debug_is_message_enabled(debug, source, type, id, severity)) {
-      _mesa_unlock_debug_state(ctx);
-      return;
-   }
-
-   if (ctx->Debug->Callback) {
-      GLenum gl_source = debug_source_enums[source];
-      GLenum gl_type = debug_type_enums[type];
-      GLenum gl_severity = debug_severity_enums[severity];
-      GLDEBUGPROC callback = ctx->Debug->Callback;
-      const void *data = ctx->Debug->CallbackData;
-
-      /*
-       * When ctx->Debug->SyncOutput is GL_FALSE, the client is prepared for
-       * unsynchronous calls.  When it is GL_TRUE, we will not spawn threads.
-       * In either case, we can call the callback unlocked.
-       */
-      _mesa_unlock_debug_state(ctx);
-      callback(gl_source, gl_type, id, gl_severity, len, buf, data);
-   }
-   else {
-      debug_log_message(ctx->Debug, source, type, id, severity, len, buf);
-      _mesa_unlock_debug_state(ctx);
-   }
-}
-
-/**
- * Log a client or driver debug message.
- */
-static void
-log_msg(struct gl_context *ctx, enum mesa_debug_source source,
-        enum mesa_debug_type type, GLuint id,
-        enum mesa_debug_severity severity, GLint len, const char *buf)
-{
-   struct gl_debug_state *debug = _mesa_lock_debug_state(ctx);
-
-   if (!debug)
-      return;
-
-   log_msg_locked_and_unlock(ctx, source, type, id, severity, len, buf);
-}
-
-
-/**
- * Verify that source, type, and severity are valid enums.
- *
- * The 'caller' param is used for handling values available
- * only in glDebugMessageInsert or glDebugMessageControl
- */
-static GLboolean
-validate_params(struct gl_context *ctx, unsigned caller,
-                const char *callerstr, GLenum source, GLenum type,
-                GLenum severity)
-{
-#define INSERT 1
-#define CONTROL 2
-   switch(source) {
-   case GL_DEBUG_SOURCE_APPLICATION_ARB:
-   case GL_DEBUG_SOURCE_THIRD_PARTY_ARB:
-      break;
-   case GL_DEBUG_SOURCE_API_ARB:
-   case GL_DEBUG_SOURCE_SHADER_COMPILER_ARB:
-   case GL_DEBUG_SOURCE_WINDOW_SYSTEM_ARB:
-   case GL_DEBUG_SOURCE_OTHER_ARB:
-      if (caller != INSERT)
-         break;
-      else
-         goto error;
-   case GL_DONT_CARE:
-      if (caller == CONTROL)
-         break;
-      else
-         goto error;
-   default:
-      goto error;
-   }
-
-   switch(type) {
-   case GL_DEBUG_TYPE_ERROR_ARB:
-   case GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR_ARB:
-   case GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR_ARB:
-   case GL_DEBUG_TYPE_PERFORMANCE_ARB:
-   case GL_DEBUG_TYPE_PORTABILITY_ARB:
-   case GL_DEBUG_TYPE_OTHER_ARB:
-   case GL_DEBUG_TYPE_MARKER:
-   case GL_DEBUG_TYPE_PUSH_GROUP:
-   case GL_DEBUG_TYPE_POP_GROUP:
-      break;
-   case GL_DONT_CARE:
-      if (caller == CONTROL)
-         break;
-      else
-         goto error;
-   default:
-      goto error;
-   }
-
-   switch(severity) {
-   case GL_DEBUG_SEVERITY_HIGH_ARB:
-   case GL_DEBUG_SEVERITY_MEDIUM_ARB:
-   case GL_DEBUG_SEVERITY_LOW_ARB:
-   case GL_DEBUG_SEVERITY_NOTIFICATION:
-      break;
-   case GL_DONT_CARE:
-      if (caller == CONTROL)
-         break;
-      else
-         goto error;
-   default:
-      goto error;
-   }
-   return GL_TRUE;
-
-error:
-   _mesa_error(ctx, GL_INVALID_ENUM, "bad values passed to %s"
-               "(source=0x%x, type=0x%x, severity=0x%x)", callerstr,
-               source, type, severity);
-
-   return GL_FALSE;
-}
-
-
-static GLboolean
-validate_length(struct gl_context *ctx, const char *callerstr, GLsizei length,
-                const GLchar *buf)
-{
-
-   if (length < 0) {
-      GLsizei len = strlen(buf);
-
-      if (len >= MAX_DEBUG_MESSAGE_LENGTH) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                    "%s(null terminated string length=%d, is not less than "
-                    "GL_MAX_DEBUG_MESSAGE_LENGTH=%d)", callerstr, len,
-                    MAX_DEBUG_MESSAGE_LENGTH);
-         return GL_FALSE;
-      }
-   }
-
-   if (length >= MAX_DEBUG_MESSAGE_LENGTH) {
-      _mesa_error(ctx, GL_INVALID_VALUE,
-                 "%s(length=%d, which is not less than "
-                 "GL_MAX_DEBUG_MESSAGE_LENGTH=%d)", callerstr, length,
-                 MAX_DEBUG_MESSAGE_LENGTH);
-      return GL_FALSE;
-   }
-
-   return GL_TRUE;
-}
-
-
-void GLAPIENTRY
-_mesa_DebugMessageInsert(GLenum source, GLenum type, GLuint id,
-                         GLenum severity, GLint length,
-                         const GLchar *buf)
-{
-   GET_CURRENT_CONTEXT(ctx);
-   const char *callerstr;
-
-   if (_mesa_is_desktop_gl(ctx))
-      callerstr = "glDebugMessageInsert";
-   else
-      callerstr = "glDebugMessageInsertKHR";
-
-   if (!validate_params(ctx, INSERT, callerstr, source, type, severity))
-      return; /* GL_INVALID_ENUM */
-
-   if (!validate_length(ctx, callerstr, length, buf))
-      return; /* GL_INVALID_VALUE */
-
-   log_msg(ctx, gl_enum_to_debug_source(source),
-           gl_enum_to_debug_type(type), id,
-           gl_enum_to_debug_severity(severity),
-           length, buf);
-
-   if (type == GL_DEBUG_TYPE_MARKER && ctx->Driver.EmitStringMarker) {
-      /* if length not specified, string will be null terminated: */
-      if (length < 0)
-         length = strlen(buf);
-      ctx->Driver.EmitStringMarker(ctx, buf, length);
-   }
-}
-
-
-GLuint GLAPIENTRY
-_mesa_GetDebugMessageLog(GLuint count, GLsizei logSize, GLenum *sources,
-                         GLenum *types, GLenum *ids, GLenum *severities,
-                         GLsizei *lengths, GLchar *messageLog)
-{
-   GET_CURRENT_CONTEXT(ctx);
-   struct gl_debug_state *debug;
-   const char *callerstr;
-   GLuint ret;
-
-   if (_mesa_is_desktop_gl(ctx))
-      callerstr = "glGetDebugMessageLog";
-   else
-      callerstr = "glGetDebugMessageLogKHR";
-
-   if (!messageLog)
-      logSize = 0;
-
-   if (logSize < 0) {
-      _mesa_error(ctx, GL_INVALID_VALUE,
-                  "%s(logSize=%d : logSize must not be negative)",
-                  callerstr, logSize);
-      return 0;
-   }
-
-   debug = _mesa_lock_debug_state(ctx);
-   if (!debug)
-      return 0;
-
-   for (ret = 0; ret < count; ret++) {
-      const struct gl_debug_message *msg = debug_fetch_message(debug);
-      GLsizei len;
-
-      if (!msg)
-         break;
-
-      len = msg->length;
-      if (len < 0)
-         len = strlen(msg->message);
-
-      if (logSize < len+1 && messageLog != NULL)
-         break;
-
-      if (messageLog) {
-         assert(msg->message[len] == '\0');
-         (void) strncpy(messageLog, msg->message, (size_t)len+1);
-
-         messageLog += len+1;
-         logSize -= len+1;
-      }
-
-      if (lengths)
-         *lengths++ = len+1;
-      if (severities)
-         *severities++ = debug_severity_enums[msg->severity];
-      if (sources)
-         *sources++ = debug_source_enums[msg->source];
-      if (types)
-         *types++ = debug_type_enums[msg->type];
-      if (ids)
-         *ids++ = msg->id;
-
-      debug_delete_messages(debug, 1);
-   }
-
-   _mesa_unlock_debug_state(ctx);
-
-   return ret;
-}
-
-
-void GLAPIENTRY
-_mesa_DebugMessageControl(GLenum gl_source, GLenum gl_type,
-                          GLenum gl_severity, GLsizei count,
-                          const GLuint *ids, GLboolean enabled)
-{
-   GET_CURRENT_CONTEXT(ctx);
-   enum mesa_debug_source source = gl_enum_to_debug_source(gl_source);
-   enum mesa_debug_type type = gl_enum_to_debug_type(gl_type);
-   enum mesa_debug_severity severity = gl_enum_to_debug_severity(gl_severity);
-   const char *callerstr;
-   struct gl_debug_state *debug;
-
-   if (_mesa_is_desktop_gl(ctx))
-      callerstr = "glDebugMessageControl";
-   else
-      callerstr = "glDebugMessageControlKHR";
-
-   if (count < 0) {
-      _mesa_error(ctx, GL_INVALID_VALUE,
-                  "%s(count=%d : count must not be negative)", callerstr,
-                  count);
-      return;
-   }
-
-   if (!validate_params(ctx, CONTROL, callerstr, gl_source, gl_type,
-                        gl_severity))
-      return; /* GL_INVALID_ENUM */
-
-   if (count && (gl_severity != GL_DONT_CARE || gl_type == GL_DONT_CARE
-                 || gl_source == GL_DONT_CARE)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "%s(When passing an array of ids, severity must be"
-         " GL_DONT_CARE, and source and type must not be GL_DONT_CARE.",
-                  callerstr);
-      return;
-   }
-
-   debug = _mesa_lock_debug_state(ctx);
-   if (!debug)
-      return;
-
-   if (count) {
-      GLsizei i;
-      for (i = 0; i < count; i++)
-         debug_set_message_enable(debug, source, type, ids[i], enabled);
-   }
-   else {
-      debug_set_message_enable_all(debug, source, type, severity, enabled);
-   }
-
-   _mesa_unlock_debug_state(ctx);
-}
-
-
-void GLAPIENTRY
-_mesa_DebugMessageCallback(GLDEBUGPROC callback, const void *userParam)
-{
-   GET_CURRENT_CONTEXT(ctx);
-   struct gl_debug_state *debug = _mesa_lock_debug_state(ctx);
-   if (debug) {
-      debug->Callback = callback;
-      debug->CallbackData = userParam;
-      _mesa_unlock_debug_state(ctx);
-   }
-}
-
-
-void GLAPIENTRY
-_mesa_PushDebugGroup(GLenum source, GLuint id, GLsizei length,
-                     const GLchar *message)
-{
-   GET_CURRENT_CONTEXT(ctx);
-   const char *callerstr;
-   struct gl_debug_state *debug;
-   struct gl_debug_message *emptySlot;
-
-   if (_mesa_is_desktop_gl(ctx))
-      callerstr = "glPushDebugGroup";
-   else
-      callerstr = "glPushDebugGroupKHR";
-
-   switch(source) {
-   case GL_DEBUG_SOURCE_APPLICATION:
-   case GL_DEBUG_SOURCE_THIRD_PARTY:
-      break;
-   default:
-      _mesa_error(ctx, GL_INVALID_ENUM, "bad value passed to %s"
-                  "(source=0x%x)", callerstr, source);
-      return;
-   }
-
-   if (!validate_length(ctx, callerstr, length, message))
-      return; /* GL_INVALID_VALUE */
-
-   debug = _mesa_lock_debug_state(ctx);
-   if (!debug)
-      return;
-
-   if (debug->CurrentGroup >= MAX_DEBUG_GROUP_STACK_DEPTH-1) {
-      _mesa_unlock_debug_state(ctx);
-      _mesa_error(ctx, GL_STACK_OVERFLOW, "%s", callerstr);
-      return;
-   }
-
-   /* pop reuses the message details from push so we store this */
-   emptySlot = debug_get_group_message(debug);
-   debug_message_store(emptySlot,
-                       gl_enum_to_debug_source(source),
-                       gl_enum_to_debug_type(GL_DEBUG_TYPE_PUSH_GROUP),
-                       id,
-                       gl_enum_to_debug_severity(GL_DEBUG_SEVERITY_NOTIFICATION),
-                       length, message);
-
-   debug_push_group(debug);
-
-   log_msg_locked_and_unlock(ctx,
-         gl_enum_to_debug_source(source),
-         MESA_DEBUG_TYPE_PUSH_GROUP, id,
-         MESA_DEBUG_SEVERITY_NOTIFICATION, length,
-         message);
-}
-
-
-void GLAPIENTRY
-_mesa_PopDebugGroup(void)
-{
-   GET_CURRENT_CONTEXT(ctx);
-   const char *callerstr;
-   struct gl_debug_state *debug;
-   struct gl_debug_message *gdmessage, msg;
-
-   if (_mesa_is_desktop_gl(ctx))
-      callerstr = "glPopDebugGroup";
-   else
-      callerstr = "glPopDebugGroupKHR";
-
-   debug = _mesa_lock_debug_state(ctx);
-   if (!debug)
-      return;
-
-   if (debug->CurrentGroup <= 0) {
-      _mesa_unlock_debug_state(ctx);
-      _mesa_error(ctx, GL_STACK_UNDERFLOW, "%s", callerstr);
-      return;
-   }
-
-   debug_pop_group(debug);
-
-   /* make a shallow copy */
-   gdmessage = debug_get_group_message(debug);
-   msg = *gdmessage;
-   gdmessage->message = NULL;
-   gdmessage->length = 0;
-
-   log_msg_locked_and_unlock(ctx,
-         msg.source,
-         gl_enum_to_debug_type(GL_DEBUG_TYPE_POP_GROUP),
-         msg.id,
-         gl_enum_to_debug_severity(GL_DEBUG_SEVERITY_NOTIFICATION),
-         msg.length, msg.message);
-
-   debug_message_clear(&msg);
-}
-
-
-void
-_mesa_init_errors(struct gl_context *ctx)
-{
-   mtx_init(&ctx->DebugMutex, mtx_plain);
-}
-
-
-void
-_mesa_free_errors_data(struct gl_context *ctx)
-{
-   if (ctx->Debug) {
-      debug_destroy(ctx->Debug);
-      /* set to NULL just in case it is used before context is completely gone. */
-      ctx->Debug = NULL;
-   }
-
-   mtx_destroy(&ctx->DebugMutex);
-}
-
-void GLAPIENTRY
-_mesa_StringMarkerGREMEDY(GLsizei len, const GLvoid *string)
-{
-   GET_CURRENT_CONTEXT(ctx);
-   if (ctx->Extensions.GREMEDY_string_marker) {
-      /* if length not specified, string will be null terminated: */
-      if (len <= 0)
-         len = strlen(string);
-      ctx->Driver.EmitStringMarker(ctx, string, len);
-   } else {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "StringMarkerGREMEDY");
-   }
-}
-
-/**********************************************************************/
-/** \name Diagnostics */
-/*@{*/
 
 static FILE *LogFile = NULL;
 
@@ -1492,11 +234,11 @@ _mesa_gl_vdebug(struct gl_context *ctx,
    char s[MAX_DEBUG_MESSAGE_LENGTH];
    int len;
 
-   debug_get_id(id);
+   _mesa_debug_get_id(id);
 
    len = _mesa_vsnprintf(s, MAX_DEBUG_MESSAGE_LENGTH, fmtString, args);
 
-   log_msg(ctx, source, type, *id, severity, len, s);
+   _mesa_log_msg(ctx, source, type, *id, severity, len, s);
 }
 
 
@@ -1536,17 +278,17 @@ _mesa_error( struct gl_context *ctx, GLenum error, const char *fmtString, ... )
     */
    static GLuint error_msg_id = 0;
 
-   debug_get_id(&error_msg_id);
+   _mesa_debug_get_id(&error_msg_id);
 
    do_output = should_output(ctx, error, fmtString);
 
    mtx_lock(&ctx->DebugMutex);
    if (ctx->Debug) {
-      do_log = debug_is_message_enabled(ctx->Debug,
-                                        MESA_DEBUG_SOURCE_API,
-                                        MESA_DEBUG_TYPE_ERROR,
-                                        error_msg_id,
-                                        MESA_DEBUG_SEVERITY_HIGH);
+      do_log = _mesa_debug_is_message_enabled(ctx->Debug,
+                                              MESA_DEBUG_SOURCE_API,
+                                              MESA_DEBUG_TYPE_ERROR,
+                                              error_msg_id,
+                                              MESA_DEBUG_SEVERITY_HIGH);
    }
    else {
       do_log = GL_FALSE;
@@ -1585,8 +327,8 @@ _mesa_error( struct gl_context *ctx, GLenum error, const char *fmtString, ... )
 
       /* Log the error via ARB_debug_output if needed.*/
       if (do_log) {
-         log_msg(ctx, MESA_DEBUG_SOURCE_API, MESA_DEBUG_TYPE_ERROR,
-                 error_msg_id, MESA_DEBUG_SEVERITY_HIGH, len, s2);
+         _mesa_log_msg(ctx, MESA_DEBUG_SOURCE_API, MESA_DEBUG_TYPE_ERROR,
+                       error_msg_id, MESA_DEBUG_SEVERITY_HIGH, len, s2);
       }
    }
 
@@ -1652,7 +394,7 @@ _mesa_shader_debug(struct gl_context *ctx, GLenum type, GLuint *id,
    enum mesa_debug_severity severity = MESA_DEBUG_SEVERITY_HIGH;
    int len;
 
-   debug_get_id(id);
+   _mesa_debug_get_id(id);
 
    len = strlen(msg);
 
@@ -1660,7 +402,5 @@ _mesa_shader_debug(struct gl_context *ctx, GLenum type, GLuint *id,
    if (len >= MAX_DEBUG_MESSAGE_LENGTH)
       len = MAX_DEBUG_MESSAGE_LENGTH - 1;
 
-   log_msg(ctx, source, type, *id, severity, len, msg);
+   _mesa_log_msg(ctx, source, type, *id, severity, len, msg);
 }
-
-/*@}*/
diff --git a/src/mesa/main/errors.h b/src/mesa/main/errors.h
index 92df2ac868a..5ad5254ff1b 100644
--- a/src/mesa/main/errors.h
+++ b/src/mesa/main/errors.h
@@ -47,14 +47,6 @@
 extern "C" {
 #endif
 
-struct _glapi_table;
-
-extern void
-_mesa_init_errors( struct gl_context *ctx );
-
-extern void
-_mesa_free_errors_data( struct gl_context *ctx );
-
 extern void
 _mesa_warning( struct gl_context *gc, const char *fmtString, ... ) PRINTFLIKE(2, 3);
 
@@ -76,6 +68,10 @@ _mesa_log(const char *fmtString, ...) PRINTFLIKE(1, 2);
 extern FILE *
 _mesa_get_log_file(void);
 
+void
+_mesa_shader_debug(struct gl_context *ctx, GLenum type, GLuint *id,
+                   const char *msg);
+
 extern void
 _mesa_gl_vdebug(struct gl_context *ctx,
                 GLuint *id,
@@ -104,42 +100,6 @@ _mesa_gl_debug(struct gl_context *ctx,
    }                                                                      \
 } while (0)
 
-bool
-_mesa_set_debug_state_int(struct gl_context *ctx, GLenum pname, GLint val);
-
-GLint
-_mesa_get_debug_state_int(struct gl_context *ctx, GLenum pname);
-
-void *
-_mesa_get_debug_state_ptr(struct gl_context *ctx, GLenum pname);
-
-extern void
-_mesa_shader_debug(struct gl_context *ctx, GLenum type, GLuint *id,
-                   const char *msg);
-
-void GLAPIENTRY
-_mesa_DebugMessageInsert(GLenum source, GLenum type, GLuint id,
-                         GLenum severity, GLint length,
-                         const GLchar* buf);
-GLuint GLAPIENTRY
-_mesa_GetDebugMessageLog(GLuint count, GLsizei logSize, GLenum* sources,
-                         GLenum* types, GLenum* ids, GLenum* severities,
-                         GLsizei* lengths, GLchar* messageLog);
-void GLAPIENTRY
-_mesa_DebugMessageControl(GLenum source, GLenum type, GLenum severity,
-                          GLsizei count, const GLuint *ids,
-                          GLboolean enabled);
-void GLAPIENTRY
-_mesa_DebugMessageCallback(GLDEBUGPROC callback,
-                           const void *userParam);
-void GLAPIENTRY
-_mesa_PushDebugGroup(GLenum source, GLuint id, GLsizei length,
-                     const GLchar *message);
-void GLAPIENTRY
-_mesa_PopDebugGroup(void);
-
-void GLAPIENTRY
-_mesa_StringMarkerGREMEDY(GLsizei len, const GLvoid *string);
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index ded6f2c06dc..d1e3a99fdc0 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -273,6 +273,8 @@ EXT(MESA_texture_signed_rgba                , EXT_texture_snorm
 EXT(MESA_window_pos                         , dummy_true                             , GLL,  x ,  x ,  x , 2000)
 EXT(MESA_ycbcr_texture                      , MESA_ycbcr_texture                     , GLL, GLC,  x ,  x , 2002)
 
+EXT(NVX_gpu_memory_info                     , NVX_gpu_memory_info                    , GLL, GLC,  x ,  x , 2013)
+
 EXT(NV_blend_square                         , dummy_true                             , GLL,  x ,  x ,  x , 1999)
 EXT(NV_conditional_render                   , NV_conditional_render                  , GLL, GLC,  x ,  x , 2008)
 EXT(NV_depth_clamp                          , ARB_depth_clamp                        , GLL, GLC,  x ,  x , 2001)
@@ -293,7 +295,6 @@ EXT(NV_texture_barrier                      , NV_texture_barrier
 EXT(NV_texture_env_combine4                 , NV_texture_env_combine4                , GLL,  x ,  x ,  x , 1999)
 EXT(NV_texture_rectangle                    , NV_texture_rectangle                   , GLL,  x ,  x ,  x , 2000)
 EXT(NV_vdpau_interop                        , NV_vdpau_interop                       , GLL, GLC,  x ,  x , 2010)
-EXT(NVX_gpu_memory_info                     , NVX_gpu_memory_info                    , GLL, GLC,  x ,  x , 2013)
 
 EXT(OES_EGL_image                           , OES_EGL_image                          , GLL, GLC, ES1, ES2, 2006) /* FIXME: Mesa expects GL_OES_EGL_image to be available in OpenGL contexts. */
 EXT(OES_EGL_image_external                  , OES_EGL_image_external                 ,  x ,  x , ES1, ES2, 2010)
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 8453a922549..9005dc5897d 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -26,6 +26,7 @@
 #include "glheader.h"
 #include "context.h"
 #include "blend.h"
+#include "debug_output.h"
 #include "enable.h"
 #include "enums.h"
 #include "errors.h"
diff --git a/src/mesa/main/getstring.c b/src/mesa/main/getstring.c
index 87c5a3a194f..92f8a389cd9 100644
--- a/src/mesa/main/getstring.c
+++ b/src/mesa/main/getstring.c
@@ -26,6 +26,7 @@
 #include <stdbool.h>
 #include "glheader.h"
 #include "context.h"
+#include "debug_output.h"
 #include "get.h"
 #include "enums.h"
 #include "extensions.h"
diff --git a/src/mesa/main/queryobj.c b/src/mesa/main/queryobj.c
index b86692a5f7e..7a70b592c47 100644
--- a/src/mesa/main/queryobj.c
+++ b/src/mesa/main/queryobj.c
@@ -807,11 +807,6 @@ invalid_enum:
       return;
    }
 
-   /* TODO: Have the driver be required to handle this fixup. */
-   if (q->Target == GL_ANY_SAMPLES_PASSED ||
-       q->Target == GL_ANY_SAMPLES_PASSED_CONSERVATIVE)
-      value = !!value;
-
    switch (ptype) {
    case GL_INT: {
       GLint *param = (GLint *)offset;
diff --git a/src/mesa/main/viewport.c b/src/mesa/main/viewport.c
index 7d8914291c3..681e46bfcf7 100644
--- a/src/mesa/main/viewport.c
+++ b/src/mesa/main/viewport.c
@@ -456,11 +456,11 @@ _mesa_get_viewport_xform(struct gl_context *ctx, unsigned i,
    translate[0] = half_width + x;
    if (ctx->Transform.ClipOrigin == GL_UPPER_LEFT) {
       scale[1] = -half_height;
-      translate[1] = half_height - y;
    } else {
       scale[1] = half_height;
-      translate[1] = half_height + y;
    }
+   translate[1] = half_height + y;
+
    if (ctx->Transform.ClipDepthMode == GL_NEGATIVE_ONE_TO_ONE) {
       scale[2] = 0.5 * (f - n);
       translate[2] = 0.5 * (n + f);
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 0f17ed136da..71c5fc4a485 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -51,7 +51,6 @@
 #include "program/prog_print.h"
 #include "program/program.h"
 #include "program/prog_parameter.h"
-#include "program/sampler.h"
 
 
 static int swizzle_for_size(int size);
@@ -1390,7 +1389,7 @@ ir_to_mesa_visitor::visit(ir_dereference_variable *ir)
       switch (var->data.mode) {
       case ir_var_uniform:
 	 entry = new(mem_ctx) variable_storage(var, PROGRAM_UNIFORM,
-					       var->data.location);
+					       var->data.param_index);
 	 this->variables.push_tail(entry);
 	 break;
       case ir_var_shader_in:
@@ -1540,6 +1539,82 @@ get_assignment_lhs(ir_dereference *ir, ir_to_mesa_visitor *v)
    return dst_reg(v->result);
 }
 
+/* Calculate the sampler index and also calculate the base uniform location
+ * for struct members.
+ */
+static void
+calc_sampler_offsets(struct gl_shader_program *prog, ir_dereference *deref,
+                     unsigned *offset, unsigned *array_elements,
+                     unsigned *location)
+{
+   if (deref->ir_type == ir_type_dereference_variable)
+      return;
+
+   switch (deref->ir_type) {
+   case ir_type_dereference_array: {
+      ir_dereference_array *deref_arr = deref->as_dereference_array();
+      ir_constant *array_index =
+         deref_arr->array_index->constant_expression_value();
+
+      if (!array_index) {
+	 /* GLSL 1.10 and 1.20 allowed variable sampler array indices,
+	  * while GLSL 1.30 requires that the array indices be
+	  * constant integer expressions.  We don't expect any driver
+	  * to actually work with a really variable array index, so
+	  * all that would work would be an unrolled loop counter that ends
+	  * up being constant above.
+	  */
+	 ralloc_strcat(&prog->InfoLog,
+		       "warning: Variable sampler array index unsupported.\n"
+		       "This feature of the language was removed in GLSL 1.20 "
+		       "and is unlikely to be supported for 1.10 in Mesa.\n");
+      } else {
+         *offset += array_index->value.u[0] * *array_elements;
+      }
+
+      *array_elements *= deref_arr->array->type->length;
+
+      calc_sampler_offsets(prog, deref_arr->array->as_dereference(),
+                           offset, array_elements, location);
+      break;
+   }
+
+   case ir_type_dereference_record: {
+      ir_dereference_record *deref_record = deref->as_dereference_record();
+      unsigned field_index =
+         deref_record->record->type->field_index(deref_record->field);
+      *location +=
+         deref_record->record->type->record_location_offset(field_index);
+      calc_sampler_offsets(prog, deref_record->record->as_dereference(),
+                           offset, array_elements, location);
+      break;
+   }
+
+   default:
+      unreachable("Invalid deref type");
+      break;
+   }
+}
+
+static int
+get_sampler_uniform_value(class ir_dereference *sampler,
+                          struct gl_shader_program *shader_program,
+                          const struct gl_program *prog)
+{
+   GLuint shader = _mesa_program_enum_to_shader_stage(prog->Target);
+   ir_variable *var = sampler->variable_referenced();
+   unsigned location = var->data.location;
+   unsigned array_elements = 1;
+   unsigned offset = 0;
+
+   calc_sampler_offsets(shader_program, sampler, &offset, &array_elements,
+                        &location);
+
+   assert(shader_program->UniformStorage[location].opaque[shader].active);
+   return shader_program->UniformStorage[location].opaque[shader].index +
+          offset;
+}
+
 /**
  * Process the condition of a conditional assignment
  *
@@ -1989,9 +2064,8 @@ ir_to_mesa_visitor::visit(ir_texture *ir)
    if (ir->shadow_comparitor)
       inst->tex_shadow = GL_TRUE;
 
-   inst->sampler = _mesa_get_sampler_uniform_value(ir->sampler,
-						   this->shader_program,
-						   this->prog);
+   inst->sampler = get_sampler_uniform_value(ir->sampler, shader_program,
+                                             prog);
 
    switch (sampler_type->sampler_dimensionality) {
    case GLSL_SAMPLER_DIM_1D:
@@ -2269,8 +2343,7 @@ public:
    {
       this->idx = -1;
       this->program_resource_visitor::process(var);
-
-      var->data.location = this->idx;
+      var->data.param_index = this->idx;
    }
 
 private:
diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c
index f8659f65339..74cbbfb1cc6 100644
--- a/src/mesa/program/prog_to_nir.c
+++ b/src/mesa/program/prog_to_nir.c
@@ -928,7 +928,7 @@ ptn_add_output_stores(struct ptn_compile *c)
       nir_intrinsic_instr *store =
          nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_var);
       store->num_components = glsl_get_vector_elements(var->type);
-      store->const_index[0] = (1 << store->num_components) - 1;
+      nir_intrinsic_set_write_mask(store, (1 << store->num_components) - 1);
       store->variables[0] =
          nir_deref_var_create(store, c->output_vars[var->data.location]);
 
@@ -999,7 +999,7 @@ setup_registers_and_variables(struct ptn_compile *c)
             nir_intrinsic_instr *store =
                nir_intrinsic_instr_create(shader, nir_intrinsic_store_var);
             store->num_components = 4;
-            store->const_index[0] = WRITEMASK_XYZW;
+            nir_intrinsic_set_write_mask(store, WRITEMASK_XYZW);
             store->variables[0] = nir_deref_var_create(store, fullvar);
             store->src[0] = nir_src_for_ssa(f001);
             nir_builder_instr_insert(b, &store->instr);
diff --git a/src/mesa/program/sampler.cpp b/src/mesa/program/sampler.cpp
deleted file mode 100644
index f118552d64e..00000000000
--- a/src/mesa/program/sampler.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (C) 2005-2007  Brian Paul   All Rights Reserved.
- * Copyright (C) 2008  VMware, Inc.   All Rights Reserved.
- * Copyright © 2010 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include "main/mtypes.h"
-#include "compiler/glsl_types.h"
-#include "compiler/glsl/ir.h"
-#include "compiler/glsl/ir_uniform.h"
-#include "compiler/glsl/ir_visitor.h"
-#include "compiler/glsl/program.h"
-#include "program/hash_table.h"
-#include "program/prog_parameter.h"
-#include "program/program.h"
-
-
-class get_sampler_name : public ir_hierarchical_visitor
-{
-public:
-   get_sampler_name(ir_dereference *last,
-		    struct gl_shader_program *shader_program)
-   {
-      this->mem_ctx = ralloc_context(NULL);
-      this->shader_program = shader_program;
-      this->name = NULL;
-      this->offset = 0;
-      this->last = last;
-   }
-
-   ~get_sampler_name()
-   {
-      ralloc_free(this->mem_ctx);
-   }
-
-   virtual ir_visitor_status visit(ir_dereference_variable *ir)
-   {
-      this->name = ir->var->name;
-      return visit_continue;
-   }
-
-   virtual ir_visitor_status visit_leave(ir_dereference_record *ir)
-   {
-      this->name = ralloc_asprintf(mem_ctx, "%s.%s", name, ir->field);
-      return visit_continue;
-   }
-
-   virtual ir_visitor_status visit_leave(ir_dereference_array *ir)
-   {
-      ir_constant *index = ir->array_index->as_constant();
-      int i;
-
-      if (index) {
-	 i = index->value.i[0];
-      } else {
-	 /* GLSL 1.10 and 1.20 allowed variable sampler array indices,
-	  * while GLSL 1.30 requires that the array indices be
-	  * constant integer expressions.  We don't expect any driver
-	  * to actually work with a really variable array index, so
-	  * all that would work would be an unrolled loop counter that ends
-	  * up being constant above.
-	  */
-	 ralloc_strcat(&shader_program->InfoLog,
-		       "warning: Variable sampler array index unsupported.\n"
-		       "This feature of the language was removed in GLSL 1.20 "
-		       "and is unlikely to be supported for 1.10 in Mesa.\n");
-	 i = 0;
-      }
-      if (ir != last) {
-	 this->name = ralloc_asprintf(mem_ctx, "%s[%d]", name, i);
-      } else {
-	 offset = i;
-      }
-      return visit_continue;
-   }
-
-   struct gl_shader_program *shader_program;
-   const char *name;
-   void *mem_ctx;
-   int offset;
-   ir_dereference *last;
-};
-
-
-int
-_mesa_get_sampler_uniform_value(class ir_dereference *sampler,
-				struct gl_shader_program *shader_program,
-				const struct gl_program *prog)
-{
-   get_sampler_name getname(sampler, shader_program);
-
-   GLuint shader = _mesa_program_enum_to_shader_stage(prog->Target);
-
-   sampler->accept(&getname);
-
-   unsigned location;
-   if (!shader_program->UniformHash->get(location, getname.name)) {
-      linker_error(shader_program,
-		   "failed to find sampler named %s.\n", getname.name);
-      return 0;
-   }
-
-   if (!shader_program->UniformStorage[location].opaque[shader].active) {
-      assert(0 && "cannot return a sampler");
-      linker_error(shader_program,
-		   "cannot return a sampler named %s, because it is not "
-                   "used in this shader stage. This is a driver bug.\n",
-                   getname.name);
-      return 0;
-   }
-
-   return shader_program->UniformStorage[location].opaque[shader].index +
-          getname.offset;
-}
-
-
-class ir_rvalue *
-_mesa_get_sampler_array_nonconst_index(class ir_dereference *sampler)
-{
-   ir_dereference_array *deref_arr = sampler->as_dereference_array();
-   if (!deref_arr || deref_arr->array_index->as_constant())
-      return NULL;
-
-   return deref_arr->array_index;
-}
diff --git a/src/mesa/program/sampler.h b/src/mesa/program/sampler.h
deleted file mode 100644
index 61c7f5851e7..00000000000
--- a/src/mesa/program/sampler.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (C) 2005-2007  Brian Paul   All Rights Reserved.
- * Copyright (C) 2008  VMware, Inc.   All Rights Reserved.
- * Copyright © 2010 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef SAMPLER_H
-#define SAMPLER_H
-
-
-int
-_mesa_get_sampler_uniform_value(class ir_dereference *sampler,
-				struct gl_shader_program *shader_program,
-				const struct gl_program *prog);
-
-class ir_rvalue *
-_mesa_get_sampler_array_nonconst_index(class ir_dereference *sampler);
-
-
-#endif /* SAMPLER_H */
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index 0f9ea101889..23b7abfc1c5 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -163,7 +163,6 @@ static void
 update_gp( struct st_context *st )
 {
    struct st_geometry_program *stgp;
-   struct st_gp_variant_key key;
 
    if (!st->ctx->GeometryProgram._Current) {
       cso_set_geometry_shader_handle(st->cso_context, NULL);
@@ -173,10 +172,7 @@ update_gp( struct st_context *st )
    stgp = st_geometry_program(st->ctx->GeometryProgram._Current);
    assert(stgp->Base.Base.Target == GL_GEOMETRY_PROGRAM_NV);
 
-   memset(&key, 0, sizeof(key));
-   key.st = st->has_shareable_shaders ? NULL : st;
-
-   st->gp_variant = st_get_gp_variant(st, stgp, &key);
+   st->gp_variant = st_get_basic_variant(st, &stgp->tgsi, &stgp->variants);
 
    st_reference_geomprog(st, &st->gp, stgp);
 
@@ -199,7 +195,6 @@ static void
 update_tcp( struct st_context *st )
 {
    struct st_tessctrl_program *sttcp;
-   struct st_tcp_variant_key key;
 
    if (!st->ctx->TessCtrlProgram._Current) {
       cso_set_tessctrl_shader_handle(st->cso_context, NULL);
@@ -209,10 +204,7 @@ update_tcp( struct st_context *st )
    sttcp = st_tessctrl_program(st->ctx->TessCtrlProgram._Current);
    assert(sttcp->Base.Base.Target == GL_TESS_CONTROL_PROGRAM_NV);
 
-   memset(&key, 0, sizeof(key));
-   key.st = st->has_shareable_shaders ? NULL : st;
-
-   st->tcp_variant = st_get_tcp_variant(st, sttcp, &key);
+   st->tcp_variant = st_get_basic_variant(st, &sttcp->tgsi, &sttcp->variants);
 
    st_reference_tesscprog(st, &st->tcp, sttcp);
 
@@ -235,7 +227,6 @@ static void
 update_tep( struct st_context *st )
 {
    struct st_tesseval_program *sttep;
-   struct st_tep_variant_key key;
 
    if (!st->ctx->TessEvalProgram._Current) {
       cso_set_tesseval_shader_handle(st->cso_context, NULL);
@@ -245,10 +236,7 @@ update_tep( struct st_context *st )
    sttep = st_tesseval_program(st->ctx->TessEvalProgram._Current);
    assert(sttep->Base.Base.Target == GL_TESS_EVALUATION_PROGRAM_NV);
 
-   memset(&key, 0, sizeof(key));
-   key.st = st->has_shareable_shaders ? NULL : st;
-
-   st->tep_variant = st_get_tep_variant(st, sttep, &key);
+   st->tep_variant = st_get_basic_variant(st, &sttep->tgsi, &sttep->variants);
 
    st_reference_tesseprog(st, &st->tep, sttep);
 
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index d8c3dbdd793..34809ad7163 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -176,97 +176,19 @@ make_bitmap_texture(struct gl_context *ctx, GLsizei width, GLsizei height,
    return pt;
 }
 
-static void
-setup_bitmap_vertex_data(struct st_context *st, bool normalized,
-                         int x, int y, int width, int height,
-                         float z, const float color[4],
-			 struct pipe_resource **vbuf,
-			 unsigned *vbuf_offset)
-{
-   const GLfloat fb_width = (GLfloat)st->state.framebuffer.width;
-   const GLfloat fb_height = (GLfloat)st->state.framebuffer.height;
-   const GLfloat x0 = (GLfloat)x;
-   const GLfloat x1 = (GLfloat)(x + width);
-   const GLfloat y0 = (GLfloat)y;
-   const GLfloat y1 = (GLfloat)(y + height);
-   GLfloat sLeft = (GLfloat)0.0, sRight = (GLfloat)1.0;
-   GLfloat tTop = (GLfloat)0.0, tBot = (GLfloat)1.0 - tTop;
-   const GLfloat clip_x0 = (GLfloat)(x0 / fb_width * 2.0 - 1.0);
-   const GLfloat clip_y0 = (GLfloat)(y0 / fb_height * 2.0 - 1.0);
-   const GLfloat clip_x1 = (GLfloat)(x1 / fb_width * 2.0 - 1.0);
-   const GLfloat clip_y1 = (GLfloat)(y1 / fb_height * 2.0 - 1.0);
-   GLuint i;
-   float (*vertices)[3][4];  /**< vertex pos + color + texcoord */
-
-   if(!normalized)
-   {
-      sRight = (GLfloat) width;
-      tBot = (GLfloat) height;
-   }
-
-   u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]), 4,
-                  vbuf_offset, vbuf, (void **) &vertices);
-   if (!*vbuf) {
-      return;
-   }
-
-   /* Positions are in clip coords since we need to do clipping in case
-    * the bitmap quad goes beyond the window bounds.
-    */
-   vertices[0][0][0] = clip_x0;
-   vertices[0][0][1] = clip_y0;
-   vertices[0][2][0] = sLeft;
-   vertices[0][2][1] = tTop;
-
-   vertices[1][0][0] = clip_x1;
-   vertices[1][0][1] = clip_y0;
-   vertices[1][2][0] = sRight;
-   vertices[1][2][1] = tTop;
-   
-   vertices[2][0][0] = clip_x1;
-   vertices[2][0][1] = clip_y1;
-   vertices[2][2][0] = sRight;
-   vertices[2][2][1] = tBot;
-   
-   vertices[3][0][0] = clip_x0;
-   vertices[3][0][1] = clip_y1;
-   vertices[3][2][0] = sLeft;
-   vertices[3][2][1] = tBot;
-   
-   /* same for all verts: */
-   for (i = 0; i < 4; i++) {
-      vertices[i][0][2] = z;
-      vertices[i][0][3] = 1.0f;
-      vertices[i][1][0] = color[0];
-      vertices[i][1][1] = color[1];
-      vertices[i][1][2] = color[2];
-      vertices[i][1][3] = color[3];
-      vertices[i][2][2] = 0.0; /*R*/
-      vertices[i][2][3] = 1.0; /*Q*/
-   }
-
-   u_upload_unmap(st->uploader);
-}
-
-
 
 /**
- * Render a glBitmap by drawing a textured quad
+ * Setup pipeline state prior to rendering the bitmap textured quad.
  */
 static void
-draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
-                 GLsizei width, GLsizei height,
-                 struct pipe_sampler_view *sv,
-                 const GLfloat *color)
+setup_render_state(struct gl_context *ctx,
+                   struct pipe_sampler_view *sv,
+                   const GLfloat *color)
 {
    struct st_context *st = st_context(ctx);
-   struct pipe_context *pipe = st->pipe;
    struct cso_context *cso = st->cso_context;
    struct st_fp_variant *fpv;
    struct st_fp_variant_key key;
-   GLuint maxSize;
-   GLuint offset;
-   struct pipe_resource *vbuf = NULL;
 
    memset(&key, 0, sizeof(key));
    key.st = st->has_shareable_shaders ? NULL : st;
@@ -292,16 +214,6 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
       COPY_4V(ctx->Current.Attrib[VERT_ATTRIB_COLOR0], colorSave);
    }
 
-
-   /* limit checks */
-   /* XXX if the bitmap is larger than the max texture size, break
-    * it up into chunks.
-    */
-   maxSize = 1 << (pipe->screen->get_param(pipe->screen,
-                                    PIPE_CAP_MAX_TEXTURE_2D_LEVELS) - 1);
-   assert(width <= (GLsizei)maxSize);
-   assert(height <= (GLsizei)maxSize);
-
    cso_save_rasterizer(cso);
    cso_save_fragment_samplers(cso);
    cso_save_fragment_sampler_views(cso);
@@ -373,24 +285,18 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
 
    cso_set_vertex_elements(cso, 3, st->velems_util_draw);
    cso_set_stream_outputs(st->cso_context, 0, NULL, NULL);
+}
 
-   /* convert Z from [0,1] to [-1,-1] to match viewport Z scale/bias */
-   z = z * 2.0f - 1.0f;
 
-   /* draw textured quad */
-   setup_bitmap_vertex_data(st, sv->texture->target != PIPE_TEXTURE_RECT,
-			    x, y, width, height, z, color, &vbuf, &offset);
-
-   if (vbuf) {
-      util_draw_vertex_buffer(pipe, st->cso_context, vbuf,
-                              cso_get_aux_vertex_buffer_slot(st->cso_context),
-                              offset,
-                              PIPE_PRIM_TRIANGLE_FAN,
-                              4,  /* verts */
-                              3); /* attribs/vert */
-   }
+/**
+ * Restore pipeline state after rendering the bitmap textured quad.
+ */
+static void
+restore_render_state(struct gl_context *ctx)
+{
+   struct st_context *st = st_context(ctx);
+   struct cso_context *cso = st->cso_context;
 
-   /* restore state */
    cso_restore_rasterizer(cso);
    cso_restore_fragment_samplers(cso);
    cso_restore_fragment_sampler_views(cso);
@@ -403,6 +309,111 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    cso_restore_vertex_elements(cso);
    cso_restore_aux_vertex_buffer_slot(cso);
    cso_restore_stream_outputs(cso);
+}
+
+
+/**
+ * Render a glBitmap by drawing a textured quad
+ */
+static void
+draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
+                 GLsizei width, GLsizei height,
+                 struct pipe_sampler_view *sv,
+                 const GLfloat *color)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_context *pipe = st->pipe;
+   struct pipe_resource *vbuf = NULL;
+   const float fb_width = (float) st->state.framebuffer.width;
+   const float fb_height = (float) st->state.framebuffer.height;
+   const float x0 = (float) x;
+   const float x1 = (float) (x + width);
+   const float y0 = (float) y;
+   const float y1 = (float) (y + height);
+   float sLeft = 0.0f, sRight = 1.0f;
+   float tTop = 0.0f, tBot = 1.0f - tTop;
+   const float clip_x0 = x0 / fb_width * 2.0f - 1.0f;
+   const float clip_y0 = y0 / fb_height * 2.0f - 1.0f;
+   const float clip_x1 = x1 / fb_width * 2.0f - 1.0f;
+   const float clip_y1 = y1 / fb_height * 2.0f - 1.0f;
+   float (*vertices)[3][4];  /**< vertex pos + color + texcoord */
+   unsigned offset, i;
+
+   /* limit checks */
+   {
+      /* XXX if the bitmap is larger than the max texture size, break
+       * it up into chunks.
+       */
+      GLuint maxSize = 1 << (pipe->screen->get_param(pipe->screen,
+                                    PIPE_CAP_MAX_TEXTURE_2D_LEVELS) - 1);
+      assert(width <= (GLsizei) maxSize);
+      assert(height <= (GLsizei) maxSize);
+   }
+
+   setup_render_state(ctx, sv, color);
+
+   /* convert Z from [0,1] to [-1,1] to match viewport Z scale/bias */
+   z = z * 2.0f - 1.0f;
+
+   if (sv->texture->target == PIPE_TEXTURE_RECT) {
+      /* use non-normalized texcoords */
+      sRight = (float) width;
+      tBot = (float) height;
+   }
+
+   u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]), 4,
+                  &offset, &vbuf, (void **) &vertices);
+   if (!vbuf) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glBitmap");
+      restore_render_state(ctx);
+      return;
+   }
+
+   /* Positions are in clip coords since we need to do clipping in case
+    * the bitmap quad goes beyond the window bounds.
+    */
+   vertices[0][0][0] = clip_x0;
+   vertices[0][0][1] = clip_y0;
+   vertices[0][2][0] = sLeft;
+   vertices[0][2][1] = tTop;
+
+   vertices[1][0][0] = clip_x1;
+   vertices[1][0][1] = clip_y0;
+   vertices[1][2][0] = sRight;
+   vertices[1][2][1] = tTop;
+
+   vertices[2][0][0] = clip_x1;
+   vertices[2][0][1] = clip_y1;
+   vertices[2][2][0] = sRight;
+   vertices[2][2][1] = tBot;
+
+   vertices[3][0][0] = clip_x0;
+   vertices[3][0][1] = clip_y1;
+   vertices[3][2][0] = sLeft;
+   vertices[3][2][1] = tBot;
+
+   /* same for all verts: */
+   for (i = 0; i < 4; i++) {
+      vertices[i][0][2] = z;
+      vertices[i][0][3] = 1.0f;
+      vertices[i][1][0] = color[0];
+      vertices[i][1][1] = color[1];
+      vertices[i][1][2] = color[2];
+      vertices[i][1][3] = color[3];
+      vertices[i][2][2] = 0.0; /*R*/
+      vertices[i][2][3] = 1.0; /*Q*/
+   }
+
+   u_upload_unmap(st->uploader);
+
+   util_draw_vertex_buffer(pipe, st->cso_context, vbuf,
+                           cso_get_aux_vertex_buffer_slot(st->cso_context),
+                           offset,
+                           PIPE_PRIM_TRIANGLE_FAN,
+                           4,  /* verts */
+                           3); /* attribs/vert */
+
+   restore_render_state(ctx);
 
    pipe_resource_reference(&vbuf, NULL);
 
@@ -486,9 +497,9 @@ create_cache_trans(struct st_context *st)
 void
 st_flush_bitmap_cache(struct st_context *st)
 {
-   if (!st->bitmap.cache->empty) {
-      struct bitmap_cache *cache = st->bitmap.cache;
+   struct bitmap_cache *cache = st->bitmap.cache;
 
+   if (cache && !cache->empty) {
       struct pipe_context *pipe = st->pipe;
       struct pipe_sampler_view *sv;
 
@@ -607,6 +618,76 @@ accum_bitmap(struct gl_context *ctx,
 }
 
 
+/**
+ * One-time init for drawing bitmaps.
+ */
+static void
+init_bitmap_state(struct st_context *st)
+{
+   struct pipe_sampler_state *sampler = &st->bitmap.samplers[0];
+   struct pipe_context *pipe = st->pipe;
+   struct pipe_screen *screen = pipe->screen;
+
+   /* This function should only be called once */
+   assert(st->bitmap.cache == NULL);
+
+   /* alloc bitmap cache object */
+   st->bitmap.cache = ST_CALLOC_STRUCT(bitmap_cache);
+
+   /* init sampler state once */
+   memset(sampler, 0, sizeof(*sampler));
+   sampler->wrap_s = PIPE_TEX_WRAP_CLAMP;
+   sampler->wrap_t = PIPE_TEX_WRAP_CLAMP;
+   sampler->wrap_r = PIPE_TEX_WRAP_CLAMP;
+   sampler->min_img_filter = PIPE_TEX_FILTER_NEAREST;
+   sampler->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+   sampler->mag_img_filter = PIPE_TEX_FILTER_NEAREST;
+   st->bitmap.samplers[1] = *sampler;
+   st->bitmap.samplers[1].normalized_coords = 1;
+
+   /* init baseline rasterizer state once */
+   memset(&st->bitmap.rasterizer, 0, sizeof(st->bitmap.rasterizer));
+   st->bitmap.rasterizer.half_pixel_center = 1;
+   st->bitmap.rasterizer.bottom_edge_rule = 1;
+   st->bitmap.rasterizer.depth_clip = 1;
+
+   /* find a usable texture format */
+   if (screen->is_format_supported(screen, PIPE_FORMAT_I8_UNORM,
+                                   PIPE_TEXTURE_2D, 0,
+                                   PIPE_BIND_SAMPLER_VIEW)) {
+      st->bitmap.tex_format = PIPE_FORMAT_I8_UNORM;
+   }
+   else if (screen->is_format_supported(screen, PIPE_FORMAT_A8_UNORM,
+                                        PIPE_TEXTURE_2D, 0,
+                                        PIPE_BIND_SAMPLER_VIEW)) {
+      st->bitmap.tex_format = PIPE_FORMAT_A8_UNORM;
+   }
+   else if (screen->is_format_supported(screen, PIPE_FORMAT_L8_UNORM,
+                                        PIPE_TEXTURE_2D, 0,
+                                        PIPE_BIND_SAMPLER_VIEW)) {
+      st->bitmap.tex_format = PIPE_FORMAT_L8_UNORM;
+   }
+   else {
+      /* XXX support more formats */
+      assert(0);
+   }
+
+   /* Create the vertex shader */
+   {
+      const uint semantic_names[] = { TGSI_SEMANTIC_POSITION,
+                                      TGSI_SEMANTIC_COLOR,
+        st->needs_texcoord_semantic ? TGSI_SEMANTIC_TEXCOORD :
+                                      TGSI_SEMANTIC_GENERIC };
+      const uint semantic_indexes[] = { 0, 0, 0 };
+      st->bitmap.vs = util_make_vertex_passthrough_shader(st->pipe, 3,
+                                                          semantic_names,
+                                                          semantic_indexes,
+                                                          FALSE);
+   }
+
+   reset_cache(st);
+}
+
 
 /**
  * Called via ctx->Driver.Bitmap()
@@ -622,6 +703,10 @@ st_Bitmap(struct gl_context *ctx, GLint x, GLint y,
    assert(width > 0);
    assert(height > 0);
 
+   if (!st->bitmap.cache) {
+      init_bitmap_state(st);
+   }
+
    /* We only need to validate state of the st dirty flags are set or
     * any non-_NEW_PROGRAM_CONSTANTS mesa flags are set.  The VS we use
     * for bitmap drawing uses no constants and the FS constants are
@@ -631,19 +716,6 @@ st_Bitmap(struct gl_context *ctx, GLint x, GLint y,
       st_validate_state(st);
    }
 
-   if (!st->bitmap.vs) {
-      /* create pass-through vertex shader now */
-      const uint semantic_names[] = { TGSI_SEMANTIC_POSITION,
-                                      TGSI_SEMANTIC_COLOR,
-        st->needs_texcoord_semantic ? TGSI_SEMANTIC_TEXCOORD :
-                                      TGSI_SEMANTIC_GENERIC };
-      const uint semantic_indexes[] = { 0, 0, 0 };
-      st->bitmap.vs = util_make_vertex_passthrough_shader(st->pipe, 3,
-                                                          semantic_names,
-                                                          semantic_indexes,
-                                                          FALSE);
-   }
-
    if (UseBitmapCache && accum_bitmap(ctx, x, y, width, height, unpack, bitmap))
       return;
 
@@ -676,59 +748,6 @@ st_init_bitmap_functions(struct dd_function_table *functions)
 }
 
 
-/** Per-context init */
-void
-st_init_bitmap(struct st_context *st)
-{
-   struct pipe_sampler_state *sampler = &st->bitmap.samplers[0];
-   struct pipe_context *pipe = st->pipe;
-   struct pipe_screen *screen = pipe->screen;
-
-   /* init sampler state once */
-   memset(sampler, 0, sizeof(*sampler));
-   sampler->wrap_s = PIPE_TEX_WRAP_CLAMP;
-   sampler->wrap_t = PIPE_TEX_WRAP_CLAMP;
-   sampler->wrap_r = PIPE_TEX_WRAP_CLAMP;
-   sampler->min_img_filter = PIPE_TEX_FILTER_NEAREST;
-   sampler->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
-   sampler->mag_img_filter = PIPE_TEX_FILTER_NEAREST;
-   st->bitmap.samplers[1] = *sampler;
-   st->bitmap.samplers[1].normalized_coords = 1;
-
-   /* init baseline rasterizer state once */
-   memset(&st->bitmap.rasterizer, 0, sizeof(st->bitmap.rasterizer));
-   st->bitmap.rasterizer.half_pixel_center = 1;
-   st->bitmap.rasterizer.bottom_edge_rule = 1;
-   st->bitmap.rasterizer.depth_clip = 1;
-
-   /* find a usable texture format */
-   if (screen->is_format_supported(screen, PIPE_FORMAT_I8_UNORM,
-                                   PIPE_TEXTURE_2D, 0,
-                                   PIPE_BIND_SAMPLER_VIEW)) {
-      st->bitmap.tex_format = PIPE_FORMAT_I8_UNORM;
-   }
-   else if (screen->is_format_supported(screen, PIPE_FORMAT_A8_UNORM,
-                                        PIPE_TEXTURE_2D, 0,
-                                        PIPE_BIND_SAMPLER_VIEW)) {
-      st->bitmap.tex_format = PIPE_FORMAT_A8_UNORM;
-   }
-   else if (screen->is_format_supported(screen, PIPE_FORMAT_L8_UNORM,
-                                        PIPE_TEXTURE_2D, 0,
-                                        PIPE_BIND_SAMPLER_VIEW)) {
-      st->bitmap.tex_format = PIPE_FORMAT_L8_UNORM;
-   }
-   else {
-      /* XXX support more formats */
-      assert(0);
-   }
-
-   /* alloc bitmap cache object */
-   st->bitmap.cache = ST_CALLOC_STRUCT(bitmap_cache);
-
-   reset_cache(st);
-}
-
-
 /** Per-context tear-down */
 void
 st_destroy_bitmap(struct st_context *st)
diff --git a/src/mesa/state_tracker/st_cb_bitmap.h b/src/mesa/state_tracker/st_cb_bitmap.h
index dc7e5cb5c9e..4d1ae222b81 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.h
+++ b/src/mesa/state_tracker/st_cb_bitmap.h
@@ -42,9 +42,6 @@ extern void
 st_init_bitmap_functions(struct dd_function_table *functions);
 
 extern void
-st_init_bitmap(struct st_context *st);
-
-extern void
 st_destroy_bitmap(struct st_context *st);
 
 extern void
diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c
index 2c4eccf1e06..ca493d84715 100644
--- a/src/mesa/state_tracker/st_cb_program.c
+++ b/src/mesa/state_tracker/st_cb_program.c
@@ -153,7 +153,8 @@ st_delete_program(struct gl_context *ctx, struct gl_program *prog)
          struct st_geometry_program *stgp =
             (struct st_geometry_program *) prog;
 
-         st_release_gp_variants(st, stgp);
+         st_release_basic_variants(st, stgp->Base.Base.Target,
+                                   &stgp->variants, &stgp->tgsi);
          
          if (stgp->glsl_to_tgsi)
             free_glsl_to_tgsi_visitor(stgp->glsl_to_tgsi);
@@ -175,7 +176,8 @@ st_delete_program(struct gl_context *ctx, struct gl_program *prog)
          struct st_tessctrl_program *sttcp =
             (struct st_tessctrl_program *) prog;
 
-         st_release_tcp_variants(st, sttcp);
+         st_release_basic_variants(st, sttcp->Base.Base.Target,
+                                   &sttcp->variants, &sttcp->tgsi);
 
          if (sttcp->glsl_to_tgsi)
             free_glsl_to_tgsi_visitor(sttcp->glsl_to_tgsi);
@@ -186,7 +188,8 @@ st_delete_program(struct gl_context *ctx, struct gl_program *prog)
          struct st_tesseval_program *sttep =
             (struct st_tesseval_program *) prog;
 
-         st_release_tep_variants(st, sttep);
+         st_release_basic_variants(st, sttep->Base.Base.Target,
+                                   &sttep->variants, &sttep->tgsi);
 
          if (sttep->glsl_to_tgsi)
             free_glsl_to_tgsi_visitor(sttep->glsl_to_tgsi);
@@ -202,18 +205,6 @@ st_delete_program(struct gl_context *ctx, struct gl_program *prog)
 
 
 /**
- * Called via ctx->Driver.IsProgramNative()
- */
-static GLboolean
-st_is_program_native(struct gl_context *ctx,
-                     GLenum target, 
-                     struct gl_program *prog)
-{
-   return GL_TRUE;
-}
-
-
-/**
  * Called via ctx->Driver.ProgramStringNotify()
  * Called when the program's text/code is changed.  We have to free
  * all shader variants and corresponding gallium shaders when this happens.
@@ -239,7 +230,8 @@ st_program_string_notify( struct gl_context *ctx,
    else if (target == GL_GEOMETRY_PROGRAM_NV) {
       struct st_geometry_program *stgp = (struct st_geometry_program *) prog;
 
-      st_release_gp_variants(st, stgp);
+      st_release_basic_variants(st, stgp->Base.Base.Target,
+                                &stgp->variants, &stgp->tgsi);
       if (!st_translate_geometry_program(st, stgp))
          return false;
 
@@ -260,7 +252,8 @@ st_program_string_notify( struct gl_context *ctx,
       struct st_tessctrl_program *sttcp =
          (struct st_tessctrl_program *) prog;
 
-      st_release_tcp_variants(st, sttcp);
+      st_release_basic_variants(st, sttcp->Base.Base.Target,
+                                &sttcp->variants, &sttcp->tgsi);
       if (!st_translate_tessctrl_program(st, sttcp))
          return false;
 
@@ -271,7 +264,8 @@ st_program_string_notify( struct gl_context *ctx,
       struct st_tesseval_program *sttep =
          (struct st_tesseval_program *) prog;
 
-      st_release_tep_variants(st, sttep);
+      st_release_basic_variants(st, sttep->Base.Base.Target,
+                                &sttep->variants, &sttep->tgsi);
       if (!st_translate_tesseval_program(st, sttep))
          return false;
 
@@ -297,7 +291,6 @@ st_init_program_functions(struct dd_function_table *functions)
    functions->UseProgram = st_use_program;
    functions->NewProgram = st_new_program;
    functions->DeleteProgram = st_delete_program;
-   functions->IsProgramNative = st_is_program_native;
    functions->ProgramStringNotify = st_program_string_notify;
    
    functions->LinkShader = st_link_shader;
diff --git a/src/mesa/state_tracker/st_cb_queryobj.c b/src/mesa/state_tracker/st_cb_queryobj.c
index fc239bc778c..cdb9efc762b 100644
--- a/src/mesa/state_tracker/st_cb_queryobj.c
+++ b/src/mesa/state_tracker/st_cb_queryobj.c
@@ -96,7 +96,8 @@ st_BeginQuery(struct gl_context *ctx, struct gl_query_object *q)
    switch (q->Target) {
    case GL_ANY_SAMPLES_PASSED:
    case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
-      /* fall-through */
+      type = PIPE_QUERY_OCCLUSION_PREDICATE;
+      break;
    case GL_SAMPLES_PASSED_ARB:
       type = PIPE_QUERY_OCCLUSION_COUNTER;
       break;
@@ -240,7 +241,14 @@ get_query_result(struct pipe_context *pipe,
       stq->base.Result = data.pipeline_statistics.c_primitives;
       break;
    default:
-      stq->base.Result = data.u64;
+      switch (stq->type) {
+      case PIPE_QUERY_OCCLUSION_PREDICATE:
+         stq->base.Result = !!data.b;
+         break;
+      default:
+         stq->base.Result = data.u64;
+         break;
+      }
       break;
    }
 
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 446ebfb563f..9016846b148 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -231,7 +231,6 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
    st->cso_context = cso_create_context(pipe);
 
    st_init_atoms( st );
-   st_init_bitmap(st);
    st_init_clear(st);
    st_init_draw( st );
    st_init_pbo_upload(st);
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 57076ad0d18..352e795d06a 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -168,9 +168,9 @@ struct st_context
 
    struct st_vp_variant *vp_variant;
    struct st_fp_variant *fp_variant;
-   struct st_gp_variant *gp_variant;
-   struct st_tcp_variant *tcp_variant;
-   struct st_tep_variant *tep_variant;
+   struct st_basic_variant *gp_variant;
+   struct st_basic_variant *tcp_variant;
+   struct st_basic_variant *tep_variant;
 
    struct gl_texture_object *default_texture;
 
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index f25bd742f79..feabe6290eb 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -808,6 +808,7 @@ void st_init_extensions(struct pipe_screen *screen,
       }
 
       extensions->EXT_shader_integer_mix = GL_TRUE;
+      extensions->ARB_arrays_of_arrays = GL_TRUE;
    } else {
       /* Optional integer support for GLSL 1.2. */
       if (screen->get_shader_param(screen, PIPE_SHADER_VERTEX,
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index b8182de0be8..ce93aec4e71 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -40,7 +40,6 @@
 #include "main/uniforms.h"
 #include "main/shaderapi.h"
 #include "program/prog_instruction.h"
-#include "program/sampler.h"
 
 #include "pipe/p_context.h"
 #include "pipe/p_screen.h"
@@ -257,6 +256,7 @@ public:
    GLboolean cond_update;
    bool saturate;
    st_src_reg sampler; /**< sampler register */
+   int sampler_base;
    int sampler_array_size; /**< 1-based size of sampler array, 1 if not array */
    int tex_target; /**< One of TEXTURE_*_INDEX */
    glsl_base_type tex_type;
@@ -502,6 +502,19 @@ public:
 
    void emit_arl(ir_instruction *ir, st_dst_reg dst, st_src_reg src0);
 
+   void get_deref_offsets(ir_dereference *ir,
+                          unsigned *array_size,
+                          unsigned *base,
+                          unsigned *index,
+                          st_src_reg *reladdr);
+  void calc_deref_offsets(ir_dereference *head,
+                          ir_dereference *tail,
+                          unsigned *array_elements,
+                          unsigned *base,
+                          unsigned *index,
+                          st_src_reg *indirect,
+                          unsigned *location);
+
    bool try_emit_mad(ir_expression *ir,
               int mul_operand);
    bool try_emit_mad_for_and_not(ir_expression *ir,
@@ -2350,7 +2363,7 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
       switch (var->data.mode) {
       case ir_var_uniform:
          entry = new(mem_ctx) variable_storage(var, PROGRAM_UNIFORM,
-                                               var->data.location);
+                                               var->data.param_index);
          this->variables.push_tail(entry);
          break;
       case ir_var_shader_in:
@@ -3147,19 +3160,17 @@ glsl_to_tgsi_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
 
    /* Calculate the surface offset */
    st_src_reg offset;
-   ir_dereference_array *deref_array = deref->as_dereference_array();
-
-   if (deref_array) {
-      offset = get_temp(glsl_type::uint_type);
+   unsigned array_size = 0, base = 0, index = 0;
 
-      deref_array->array_index->accept(this);
+   get_deref_offsets(deref, &array_size, &base, &index, &offset);
 
+   if (offset.file != PROGRAM_UNDEFINED) {
       emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(offset),
-               this->result, st_src_reg_for_int(ATOMIC_COUNTER_SIZE));
+               offset, st_src_reg_for_int(ATOMIC_COUNTER_SIZE));
       emit_asm(ir, TGSI_OPCODE_ADD, st_dst_reg(offset),
-               offset, st_src_reg_for_int(location->data.offset));
+               offset, st_src_reg_for_int(location->data.offset + index * ATOMIC_COUNTER_SIZE));
    } else {
-      offset = st_src_reg_for_int(location->data.offset);
+      offset = st_src_reg_for_int(location->data.offset + index * ATOMIC_COUNTER_SIZE);
    }
 
    ir->return_deref->accept(this);
@@ -3437,17 +3448,112 @@ glsl_to_tgsi_visitor::visit(ir_call *ir)
 }
 
 void
+glsl_to_tgsi_visitor::calc_deref_offsets(ir_dereference *head,
+                                         ir_dereference *tail,
+                                         unsigned *array_elements,
+                                         unsigned *base,
+                                         unsigned *index,
+                                         st_src_reg *indirect,
+                                         unsigned *location)
+{
+   switch (tail->ir_type) {
+   case ir_type_dereference_record: {
+      ir_dereference_record *deref_record = tail->as_dereference_record();
+      const glsl_type *struct_type = deref_record->record->type;
+      int field_index = deref_record->record->type->field_index(deref_record->field);
+
+      calc_deref_offsets(head, deref_record->record->as_dereference(), array_elements, base, index, indirect, location);
+
+      assert(field_index >= 0);
+      *location += struct_type->record_location_offset(field_index);
+      break;
+   }
+
+   case ir_type_dereference_array: {
+      ir_dereference_array *deref_arr = tail->as_dereference_array();
+      ir_constant *array_index = deref_arr->array_index->constant_expression_value();
+
+      if (!array_index) {
+         st_src_reg temp_reg;
+         st_dst_reg temp_dst;
+
+         temp_reg = get_temp(glsl_type::uint_type);
+         temp_dst = st_dst_reg(temp_reg);
+         temp_dst.writemask = 1;
+
+         deref_arr->array_index->accept(this);
+         if (*array_elements != 1)
+            emit_asm(NULL, TGSI_OPCODE_MUL, temp_dst, this->result, st_src_reg_for_int(*array_elements));
+         else
+            emit_asm(NULL, TGSI_OPCODE_MOV, temp_dst, this->result);
+
+         if (indirect->file == PROGRAM_UNDEFINED)
+            *indirect = temp_reg;
+         else {
+            temp_dst = st_dst_reg(*indirect);
+            temp_dst.writemask = 1;
+            emit_asm(NULL, TGSI_OPCODE_ADD, temp_dst, *indirect, temp_reg);
+         }
+      } else
+         *index += array_index->value.u[0] * *array_elements;
+
+      *array_elements *= deref_arr->array->type->length;
+
+      calc_deref_offsets(head, deref_arr->array->as_dereference(), array_elements, base, index, indirect, location);
+      break;
+   }
+   default:
+      break;
+   }
+}
+
+void
+glsl_to_tgsi_visitor::get_deref_offsets(ir_dereference *ir,
+                                        unsigned *array_size,
+                                        unsigned *base,
+                                        unsigned *index,
+                                        st_src_reg *reladdr)
+{
+   GLuint shader = _mesa_program_enum_to_shader_stage(this->prog->Target);
+   unsigned location = 0;
+   ir_variable *var = ir->variable_referenced();
+
+   memset(reladdr, 0, sizeof(*reladdr));
+   reladdr->file = PROGRAM_UNDEFINED;
+
+   *base = 0;
+   *array_size = 1;
+
+   assert(var);
+   location = var->data.location;
+   calc_deref_offsets(ir, ir, array_size, base, index, reladdr, &location);
+
+   /*
+    * If no indirect (relative) addressing was needed, set the base to the
+    * constant index and reset the array size to 1.
+    */
+   if (reladdr->file == PROGRAM_UNDEFINED) {
+      *base = *index;
+      *array_size = 1;
+   }
+
+   if (location != 0xffffffff) {
+      *base += this->shader_program->UniformStorage[location].opaque[shader].index;
+      *index += this->shader_program->UniformStorage[location].opaque[shader].index;
+   }
+}
+
+void
 glsl_to_tgsi_visitor::visit(ir_texture *ir)
 {
    st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy;
    st_src_reg offset[MAX_GLSL_TEXTURE_OFFSET], sample_index, component;
-   st_src_reg levels_src;
+   st_src_reg levels_src, reladdr;
    st_dst_reg result_dst, coord_dst, cube_sc_dst;
    glsl_to_tgsi_instruction *inst = NULL;
    unsigned opcode = TGSI_OPCODE_NOP;
    const glsl_type *sampler_type = ir->sampler->type;
-   ir_rvalue *sampler_index =
-      _mesa_get_sampler_array_nonconst_index(ir->sampler);
+   unsigned sampler_array_size = 1, sampler_index = 0, sampler_base = 0;
    bool is_cube_array = false;
    unsigned i;
 
@@ -3669,10 +3775,10 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
       coord_dst.writemask = WRITEMASK_XYZW;
    }
 
-   if (sampler_index) {
-      sampler_index->accept(this);
-      emit_arl(ir, sampler_reladdr, this->result);
-   }
+   get_deref_offsets(ir->sampler, &sampler_array_size, &sampler_base,
+                     &sampler_index, &reladdr);
+   if (reladdr.file != PROGRAM_UNDEFINED)
+      emit_arl(ir, sampler_reladdr, reladdr);
 
    if (opcode == TGSI_OPCODE_TXD)
       inst = emit_asm(ir, opcode, result_dst, coord, dx, dy);
@@ -3705,16 +3811,13 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
    if (ir->shadow_comparitor)
       inst->tex_shadow = GL_TRUE;
 
-   inst->sampler.index = _mesa_get_sampler_uniform_value(ir->sampler,
-                                                         this->shader_program,
-                                                         this->prog);
-   if (sampler_index) {
+   inst->sampler.index = sampler_index;
+   inst->sampler_array_size = sampler_array_size;
+   inst->sampler_base = sampler_base;
+
+   if (reladdr.file != PROGRAM_UNDEFINED) {
       inst->sampler.reladdr = ralloc(mem_ctx, st_src_reg);
-      memcpy(inst->sampler.reladdr, &sampler_reladdr, sizeof(sampler_reladdr));
-      inst->sampler_array_size =
-         ir->sampler->as_dereference_array()->array->type->array_size();
-   } else {
-      inst->sampler_array_size = 1;
+      memcpy(inst->sampler.reladdr, &reladdr, sizeof(reladdr));
    }
 
    if (ir->offset) {
@@ -3915,7 +4018,7 @@ count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
    foreach_in_list(glsl_to_tgsi_instruction, inst, &v->instructions) {
       if (inst->info->is_tex) {
          for (int i = 0; i < inst->sampler_array_size; i++) {
-            unsigned idx = inst->sampler.index + i;
+            unsigned idx = inst->sampler_base + i;
             v->samplers_used |= 1 << idx;
 
             debug_assert(idx < (int)ARRAY_SIZE(v->sampler_types));
diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c
index 385e26b946e..84b65369d80 100644
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -28,6 +28,7 @@
 #include "main/mtypes.h"
 #include "main/extensions.h"
 #include "main/context.h"
+#include "main/debug_output.h"
 #include "main/texobj.h"
 #include "main/teximage.h"
 #include "main/texstate.h"
@@ -635,6 +636,7 @@ st_api_create_context(struct st_api *stapi, struct st_manager *smapi,
    struct pipe_context *pipe;
    struct gl_config mode;
    gl_api api;
+   unsigned ctx_flags = 0;
 
    if (!(stapi->profile_mask & (1 << attribs->profile)))
       return NULL;
@@ -658,7 +660,10 @@ st_api_create_context(struct st_api *stapi, struct st_manager *smapi,
       break;
    }
 
-   pipe = smapi->screen->context_create(smapi->screen, NULL, 0);
+   if (attribs->flags & ST_CONTEXT_FLAG_ROBUST_ACCESS)
+      ctx_flags |= PIPE_CONTEXT_ROBUST_BUFFER_ACCESS;
+
+   pipe = smapi->screen->context_create(smapi->screen, NULL, ctx_flags);
    if (!pipe) {
       *error = ST_CONTEXT_ERROR_NO_MEMORY;
       return NULL;
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index b3954547418..624586e6d67 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -140,112 +140,54 @@ st_release_fp_variants(struct st_context *st, struct st_fragment_program *stfp)
 
 
 /**
- * Delete a geometry program variant.  Note the caller must unlink
+ * Delete a basic program variant.  Note the caller must unlink
  * the variant from the linked list.
  */
 static void
-delete_gp_variant(struct st_context *st, struct st_gp_variant *gpv)
+delete_basic_variant(struct st_context *st, struct st_basic_variant *v,
+                     GLenum target)
 {
-   if (gpv->driver_shader) 
-      cso_delete_geometry_shader(st->cso_context, gpv->driver_shader);
-      
-   free(gpv);
-}
-
-
-/**
- * Free all variants of a geometry program.
- */
-void
-st_release_gp_variants(struct st_context *st, struct st_geometry_program *stgp)
-{
-   struct st_gp_variant *gpv;
-
-   for (gpv = stgp->variants; gpv; ) {
-      struct st_gp_variant *next = gpv->next;
-      delete_gp_variant(st, gpv);
-      gpv = next;
-   }
-
-   stgp->variants = NULL;
-
-   if (stgp->tgsi.tokens) {
-      ureg_free_tokens(stgp->tgsi.tokens);
-      stgp->tgsi.tokens = NULL;
-   }
-}
-
-
-/**
- * Delete a tessellation control program variant.  Note the caller must unlink
- * the variant from the linked list.
- */
-static void
-delete_tcp_variant(struct st_context *st, struct st_tcp_variant *tcpv)
-{
-   if (tcpv->driver_shader)
-      cso_delete_tessctrl_shader(st->cso_context, tcpv->driver_shader);
-
-   free(tcpv);
-}
-
-
-/**
- * Free all variants of a tessellation control program.
- */
-void
-st_release_tcp_variants(struct st_context *st, struct st_tessctrl_program *sttcp)
-{
-   struct st_tcp_variant *tcpv;
-
-   for (tcpv = sttcp->variants; tcpv; ) {
-      struct st_tcp_variant *next = tcpv->next;
-      delete_tcp_variant(st, tcpv);
-      tcpv = next;
-   }
-
-   sttcp->variants = NULL;
-
-   if (sttcp->tgsi.tokens) {
-      ureg_free_tokens(sttcp->tgsi.tokens);
-      sttcp->tgsi.tokens = NULL;
+   if (v->driver_shader) {
+      switch (target) {
+      case GL_TESS_CONTROL_PROGRAM_NV:
+         cso_delete_tessctrl_shader(st->cso_context, v->driver_shader);
+         break;
+      case GL_TESS_EVALUATION_PROGRAM_NV:
+         cso_delete_tesseval_shader(st->cso_context, v->driver_shader);
+         break;
+      case GL_GEOMETRY_PROGRAM_NV:
+         cso_delete_geometry_shader(st->cso_context, v->driver_shader);
+         break;
+      default:
+         assert(!"this shouldn't occur");
+      }
    }
-}
-
 
-/**
- * Delete a tessellation evaluation program variant.  Note the caller must
- * unlink the variant from the linked list.
- */
-static void
-delete_tep_variant(struct st_context *st, struct st_tep_variant *tepv)
-{
-   if (tepv->driver_shader)
-      cso_delete_tesseval_shader(st->cso_context, tepv->driver_shader);
-
-   free(tepv);
+   free(v);
 }
 
 
 /**
- * Free all variants of a tessellation evaluation program.
+ * Free all variants of a basic program and release its TGSI tokens.
  */
 void
-st_release_tep_variants(struct st_context *st, struct st_tesseval_program *sttep)
+st_release_basic_variants(struct st_context *st, GLenum target,
+                          struct st_basic_variant **variants,
+                          struct pipe_shader_state *tgsi)
 {
-   struct st_tep_variant *tepv;
+   struct st_basic_variant *v;
 
-   for (tepv = sttep->variants; tepv; ) {
-      struct st_tep_variant *next = tepv->next;
-      delete_tep_variant(st, tepv);
-      tepv = next;
+   for (v = *variants; v; ) {
+      struct st_basic_variant *next = v->next;
+      delete_basic_variant(st, v, target);
+      v = next;
    }
 
-   sttep->variants = NULL;
+   *variants = NULL;
 
-   if (sttep->tgsi.tokens) {
-      ureg_free_tokens(sttep->tgsi.tokens);
-      sttep->tgsi.tokens = NULL;
+   if (tgsi->tokens) {
+      ureg_free_tokens(tgsi->tokens);
+      tgsi->tokens = NULL;
    }
 }
 
@@ -1324,53 +1266,43 @@ st_translate_geometry_program(struct st_context *st,
 }
 
 
-static struct st_gp_variant *
-st_create_gp_variant(struct st_context *st,
-                     struct st_geometry_program *stgp,
-                     const struct st_gp_variant_key *key)
-{
-   struct pipe_context *pipe = st->pipe;
-   struct st_gp_variant *gpv;
-
-   gpv = CALLOC_STRUCT(st_gp_variant);
-   if (!gpv)
-      return NULL;
-
-   /* fill in new variant */
-   gpv->driver_shader = pipe->create_gs_state(pipe, &stgp->tgsi);
-   gpv->key = *key;
-   return gpv;
-}
-
-
 /**
- * Get/create geometry program variant.
+ * Get the basic program variant matching the key, creating it if absent.
  */
-struct st_gp_variant *
-st_get_gp_variant(struct st_context *st,
-                  struct st_geometry_program *stgp,
-                  const struct st_gp_variant_key *key)
+struct st_basic_variant *
+st_get_basic_variant(struct st_context *st,
+                     struct pipe_shader_state *tgsi,
+                     struct st_basic_variant **variants)
 {
-   struct st_gp_variant *gpv;
+   struct pipe_context *pipe = st->pipe;
+   struct st_basic_variant *v;
+   struct st_basic_variant_key key;
+
+   memset(&key, 0, sizeof(key));
+   key.st = st->has_shareable_shaders ? NULL : st;
 
    /* Search for existing variant */
-   for (gpv = stgp->variants; gpv; gpv = gpv->next) {
-      if (memcmp(&gpv->key, key, sizeof(*key)) == 0) {
+   for (v = *variants; v; v = v->next) {
+      if (memcmp(&v->key, &key, sizeof(key)) == 0) {
          break;
       }
    }
 
-   if (!gpv) {
+   if (!v) {
       /* create new */
-      gpv = st_create_gp_variant(st, stgp, key);
-      if (gpv) {
+      v = CALLOC_STRUCT(st_basic_variant);
+      if (v) {
+         /* fill in new variant */
+         v->driver_shader = pipe->create_gs_state(pipe, tgsi);
+         v->key = key;
+
          /* insert into list */
-         gpv->next = stgp->variants;
-         stgp->variants = gpv;
+         v->next = *variants;
+         *variants = v;
       }
    }
 
-   return gpv;
+   return v;
 }
 
 
@@ -1399,56 +1331,6 @@ st_translate_tessctrl_program(struct st_context *st,
 }
 
 
-static struct st_tcp_variant *
-st_create_tcp_variant(struct st_context *st,
-                      struct st_tessctrl_program *sttcp,
-                      const struct st_tcp_variant_key *key)
-{
-   struct pipe_context *pipe = st->pipe;
-   struct st_tcp_variant *tcpv;
-
-   tcpv = CALLOC_STRUCT(st_tcp_variant);
-   if (!tcpv)
-      return NULL;
-
-   /* fill in new variant */
-   tcpv->driver_shader = pipe->create_tcs_state(pipe, &sttcp->tgsi);
-   tcpv->key = *key;
-   return tcpv;
-}
-
-
-/**
- * Get/create tessellation control program variant.
- */
-struct st_tcp_variant *
-st_get_tcp_variant(struct st_context *st,
-                  struct st_tessctrl_program *sttcp,
-                  const struct st_tcp_variant_key *key)
-{
-   struct st_tcp_variant *tcpv;
-
-   /* Search for existing variant */
-   for (tcpv = sttcp->variants; tcpv; tcpv = tcpv->next) {
-      if (memcmp(&tcpv->key, key, sizeof(*key)) == 0) {
-         break;
-      }
-   }
-
-   if (!tcpv) {
-      /* create new */
-      tcpv = st_create_tcp_variant(st, sttcp, key);
-      if (tcpv) {
-         /* insert into list */
-         tcpv->next = sttcp->variants;
-         sttcp->variants = tcpv;
-      }
-   }
-
-   return tcpv;
-}
-
-
 /**
  * Translate a tessellation evaluation program to create a new variant.
  */
@@ -1496,70 +1378,20 @@ st_translate_tesseval_program(struct st_context *st,
 }
 
 
-static struct st_tep_variant *
-st_create_tep_variant(struct st_context *st,
-                      struct st_tesseval_program *sttep,
-                      const struct st_tep_variant_key *key)
-{
-   struct pipe_context *pipe = st->pipe;
-   struct st_tep_variant *tepv;
-
-   tepv = CALLOC_STRUCT(st_tep_variant);
-   if (!tepv)
-      return NULL;
-
-   /* fill in new variant */
-   tepv->driver_shader = pipe->create_tes_state(pipe, &sttep->tgsi);
-   tepv->key = *key;
-   return tepv;
-}
-
-
-/**
- * Get/create tessellation evaluation program variant.
- */
-struct st_tep_variant *
-st_get_tep_variant(struct st_context *st,
-                  struct st_tesseval_program *sttep,
-                  const struct st_tep_variant_key *key)
-{
-   struct st_tep_variant *tepv;
-
-   /* Search for existing variant */
-   for (tepv = sttep->variants; tepv; tepv = tepv->next) {
-      if (memcmp(&tepv->key, key, sizeof(*key)) == 0) {
-         break;
-      }
-   }
-
-   if (!tepv) {
-      /* create new */
-      tepv = st_create_tep_variant(st, sttep, key);
-      if (tepv) {
-         /* insert into list */
-         tepv->next = sttep->variants;
-         sttep->variants = tepv;
-      }
-   }
-
-   return tepv;
-}
-
-
 /**
  * Vert/Geom/Frag programs have per-context variants.  Free all the
  * variants attached to the given program which match the given context.
  */
 static void
-destroy_program_variants(struct st_context *st, struct gl_program *program)
+destroy_program_variants(struct st_context *st, struct gl_program *target)
 {
-   if (!program || program == &_mesa_DummyProgram)
+   if (!target || target == &_mesa_DummyProgram)
       return;
 
-   switch (program->Target) {
+   switch (target->Target) {
    case GL_VERTEX_PROGRAM_ARB:
       {
-         struct st_vertex_program *stvp = (struct st_vertex_program *) program;
+         struct st_vertex_program *stvp = (struct st_vertex_program *) target;
          struct st_vp_variant *vpv, **prevPtr = &stvp->variants;
 
          for (vpv = stvp->variants; vpv; ) {
@@ -1580,7 +1412,7 @@ destroy_program_variants(struct st_context *st, struct gl_program *program)
    case GL_FRAGMENT_PROGRAM_ARB:
       {
          struct st_fragment_program *stfp =
-            (struct st_fragment_program *) program;
+            (struct st_fragment_program *) target;
          struct st_fp_variant *fpv, **prevPtr = &stfp->variants;
 
          for (fpv = stfp->variants; fpv; ) {
@@ -1599,71 +1431,37 @@ destroy_program_variants(struct st_context *st, struct gl_program *program)
       }
       break;
    case GL_GEOMETRY_PROGRAM_NV:
-      {
-         struct st_geometry_program *stgp =
-            (struct st_geometry_program *) program;
-         struct st_gp_variant *gpv, **prevPtr = &stgp->variants;
-
-         for (gpv = stgp->variants; gpv; ) {
-            struct st_gp_variant *next = gpv->next;
-            if (gpv->key.st == st) {
-               /* unlink from list */
-               *prevPtr = next;
-               /* destroy this variant */
-               delete_gp_variant(st, gpv);
-            }
-            else {
-               prevPtr = &gpv->next;
-            }
-            gpv = next;
-         }
-      }
-      break;
    case GL_TESS_CONTROL_PROGRAM_NV:
-      {
-         struct st_tessctrl_program *sttcp =
-            (struct st_tessctrl_program *) program;
-         struct st_tcp_variant *tcpv, **prevPtr = &sttcp->variants;
-
-         for (tcpv = sttcp->variants; tcpv; ) {
-            struct st_tcp_variant *next = tcpv->next;
-            if (tcpv->key.st == st) {
-               /* unlink from list */
-               *prevPtr = next;
-               /* destroy this variant */
-               delete_tcp_variant(st, tcpv);
-            }
-            else {
-               prevPtr = &tcpv->next;
-            }
-            tcpv = next;
-         }
-      }
-      break;
    case GL_TESS_EVALUATION_PROGRAM_NV:
       {
-         struct st_tesseval_program *sttep =
-            (struct st_tesseval_program *) program;
-         struct st_tep_variant *tepv, **prevPtr = &sttep->variants;
-
-         for (tepv = sttep->variants; tepv; ) {
-            struct st_tep_variant *next = tepv->next;
-            if (tepv->key.st == st) {
+         struct st_geometry_program *gp = (struct st_geometry_program*)target;
+         struct st_tessctrl_program *tcp = (struct st_tessctrl_program*)target;
+         struct st_tesseval_program *tep = (struct st_tesseval_program*)target;
+         struct st_basic_variant **variants =
+            target->Target == GL_GEOMETRY_PROGRAM_NV ? &gp->variants :
+            target->Target == GL_TESS_CONTROL_PROGRAM_NV ? &tcp->variants :
+            target->Target == GL_TESS_EVALUATION_PROGRAM_NV ? &tep->variants :
+            NULL;
+         struct st_basic_variant *v, **prevPtr = variants;
+
+         for (v = *variants; v; ) {
+            struct st_basic_variant *next = v->next;
+            if (v->key.st == st) {
                /* unlink from list */
                *prevPtr = next;
                /* destroy this variant */
-               delete_tep_variant(st, tepv);
+               delete_basic_variant(st, v, target->Target);
             }
             else {
-               prevPtr = &tepv->next;
+               prevPtr = &v->next;
             }
-            tepv = next;
+            v = next;
          }
       }
       break;
    default:
       _mesa_problem(NULL, "Unexpected program target 0x%x in "
-                    "destroy_program_variants_cb()", program->Target);
+                    "destroy_program_variants_cb()", target->Target);
    }
 }
 
@@ -1789,31 +1587,19 @@ st_precompile_shader_variant(struct st_context *st,
 
    case GL_TESS_CONTROL_PROGRAM_NV: {
       struct st_tessctrl_program *p = (struct st_tessctrl_program *)prog;
-      struct st_tcp_variant_key key;
-
-      memset(&key, 0, sizeof(key));
-      key.st = st->has_shareable_shaders ? NULL : st;
-      st_get_tcp_variant(st, p, &key);
+      st_get_basic_variant(st, &p->tgsi, &p->variants);
       break;
    }
 
    case GL_TESS_EVALUATION_PROGRAM_NV: {
       struct st_tesseval_program *p = (struct st_tesseval_program *)prog;
-      struct st_tep_variant_key key;
-
-      memset(&key, 0, sizeof(key));
-      key.st = st->has_shareable_shaders ? NULL : st;
-      st_get_tep_variant(st, p, &key);
+      st_get_basic_variant(st, &p->tgsi, &p->variants);
       break;
    }
 
    case GL_GEOMETRY_PROGRAM_NV: {
       struct st_geometry_program *p = (struct st_geometry_program *)prog;
-      struct st_gp_variant_key key;
-
-      memset(&key, 0, sizeof(key));
-      key.st = st->has_shareable_shaders ? NULL : st;
-      st_get_gp_variant(st, p, &key);
+      st_get_basic_variant(st, &p->tgsi, &p->variants);
       break;
    }
 
diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h
index a74531581b4..7717d02cd3f 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -171,25 +171,24 @@ struct st_vertex_program
 
 
 
-/** Geometry program variant key */
-struct st_gp_variant_key
+/** Key shared by all shaders except VP, FP */
+struct st_basic_variant_key
 {
    struct st_context *st;          /**< variants are per-context */
-   /* no other fields yet */
 };
 
 
 /**
  * Geometry program variant.
  */
-struct st_gp_variant
+struct st_basic_variant
 {
    /* Parameters which generated this variant. */
-   struct st_gp_variant_key key;
+   struct st_basic_variant_key key;
 
    void *driver_shader;
 
-   struct st_gp_variant *next;
+   struct st_basic_variant *next;
 };
 
 
@@ -202,30 +201,7 @@ struct st_geometry_program
    struct pipe_shader_state tgsi;
    struct glsl_to_tgsi_visitor* glsl_to_tgsi;
 
-   struct st_gp_variant *variants;
-};
-
-
-
-/** Tessellation control program variant key */
-struct st_tcp_variant_key
-{
-   struct st_context *st;          /**< variants are per-context */
-   /* no other fields yet */
-};
-
-
-/**
- * Tessellation control program variant.
- */
-struct st_tcp_variant
-{
-   /* Parameters which generated this variant. */
-   struct st_tcp_variant_key key;
-
-   void *driver_shader;
-
-   struct st_tcp_variant *next;
+   struct st_basic_variant *variants;
 };
 
 
@@ -238,30 +214,7 @@ struct st_tessctrl_program
    struct pipe_shader_state tgsi;
    struct glsl_to_tgsi_visitor* glsl_to_tgsi;
 
-   struct st_tcp_variant *variants;
-};
-
-
-
-/** Tessellation evaluation program variant key */
-struct st_tep_variant_key
-{
-   struct st_context *st;          /**< variants are per-context */
-   /* no other fields yet */
-};
-
-
-/**
- * Tessellation evaluation program variant.
- */
-struct st_tep_variant
-{
-   /* Parameters which generated this variant. */
-   struct st_tep_variant_key key;
-
-   void *driver_shader;
-
-   struct st_tep_variant *next;
+   struct st_basic_variant *variants;
 };
 
 
@@ -274,7 +227,7 @@ struct st_tesseval_program
    struct pipe_shader_state tgsi;
    struct glsl_to_tgsi_visitor* glsl_to_tgsi;
 
-   struct st_tep_variant *variants;
+   struct st_basic_variant *variants;
 };
 
 
@@ -397,21 +350,10 @@ st_get_fp_variant(struct st_context *st,
                   struct st_fragment_program *stfp,
                   const struct st_fp_variant_key *key);
 
-
-extern struct st_gp_variant *
-st_get_gp_variant(struct st_context *st,
-                  struct st_geometry_program *stgp,
-                  const struct st_gp_variant_key *key);
-
-extern struct st_tcp_variant *
-st_get_tcp_variant(struct st_context *st,
-                   struct st_tessctrl_program *sttcp,
-                   const struct st_tcp_variant_key *key);
-
-extern struct st_tep_variant *
-st_get_tep_variant(struct st_context *st,
-                   struct st_tesseval_program *sttep,
-                   const struct st_tep_variant_key *key);
+extern struct st_basic_variant *
+st_get_basic_variant(struct st_context *st,
+                     struct pipe_shader_state *tgsi,
+                     struct st_basic_variant **variants);
 
 extern void
 st_release_vp_variants( struct st_context *st,
@@ -422,16 +364,9 @@ st_release_fp_variants( struct st_context *st,
                         struct st_fragment_program *stfp );
 
 extern void
-st_release_gp_variants(struct st_context *st,
-                       struct st_geometry_program *stgp);
-
-extern void
-st_release_tcp_variants(struct st_context *st,
-                        struct st_tessctrl_program *sttcp);
-
-extern void
-st_release_tep_variants(struct st_context *st,
-                        struct st_tesseval_program *sttep);
+st_release_basic_variants(struct st_context *st, GLenum target,
+                          struct st_basic_variant **variants,
+                          struct pipe_shader_state *tgsi);
 
 extern void
 st_destroy_program_variants(struct st_context *st);