316 files changed, 6860 insertions, 3511 deletions
diff --git a/configure.ac b/configure.ac
index 577ca222f41..b13a6c2f21d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -396,6 +396,61 @@ fi
 AM_CONDITIONAL([SSE41_SUPPORTED], [test x$SSE41_SUPPORTED = x1])
 AC_SUBST([SSE41_CFLAGS], $SSE41_CFLAGS)
 
+dnl Check for Endianness
+AC_C_BIGENDIAN(
+   little_endian=no,
+   little_endian=yes,
+   little_endian=no,
+   little_endian=no
+)
+
+dnl Check for POWER8 Architecture
+PWR8_CFLAGS="-mpower8-vector"
+have_pwr8_intrinsics=no
+AC_MSG_CHECKING(whether gcc supports -mpower8-vector)
+save_CFLAGS=$CFLAGS
+CFLAGS="$PWR8_CFLAGS $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8))
+#error "Need GCC >= 4.8 for sane POWER8 support"
+#endif
+#include <altivec.h>
+int main () {
+    vector unsigned char r;
+    vector unsigned int v = vec_splat_u32 (1);
+    r = __builtin_vec_vgbbd ((vector unsigned char) v);
+    return 0;
+}]])], have_pwr8_intrinsics=yes)
+CFLAGS=$save_CFLAGS
+
+AC_ARG_ENABLE(pwr8,
+   [AC_HELP_STRING([--disable-pwr8-inst],
+                   [disable POWER8-specific instructions])],
+   [enable_pwr8=$enableval], [enable_pwr8=auto])
+
+if test "x$enable_pwr8" = xno ; then
+   have_pwr8_intrinsics=disabled
+fi
+
+if test $have_pwr8_intrinsics = yes && test $little_endian = yes ; then
+   DEFINES="$DEFINES -D_ARCH_PWR8"
+   CXXFLAGS="$CXXFLAGS $PWR8_CFLAGS"
+   CFLAGS="$CFLAGS $PWR8_CFLAGS"
+else
+   PWR8_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_pwr8_intrinsics)
+if test "x$enable_pwr8" = xyes && test $have_pwr8_intrinsics = no ; then
+   AC_MSG_ERROR([POWER8 compiler support not detected])
+fi
+
+if test $have_pwr8_intrinsics = yes && test $little_endian = no ; then
+   AC_MSG_WARN([POWER8 optimization is enabled only on POWER8 Little-Endian])
+fi
+
+AC_SUBST([PWR8_CFLAGS], $PWR8_CFLAGS)
+
 dnl Can't have static and shared libraries, default to static if user
 dnl explicitly requested. If both disabled, set to static since shared
 dnl was explicitly requested.
@@ -421,8 +476,29 @@ AC_ARG_ENABLE([debug],
     [enable_debug="$enableval"],
     [enable_debug=no]
 )
+
+AC_ARG_ENABLE([profile],
+    [AS_HELP_STRING([--enable-profile],
+        [enable profiling of code @<:@default=disabled@:>@])],
+    [enable_profile="$enableval"],
+    [enable_profile=no]
+)
+
+if test "x$enable_profile" = xyes; then
+    DEFINES="$DEFINES -DPROFILE"
+    if test "x$GCC" = xyes; then
+        CFLAGS="$CFLAGS -fno-omit-frame-pointer"
+    fi
+    if test "x$GXX" = xyes; then
+        CXXFLAGS="$CXXFLAGS -fno-omit-frame-pointer"
+    fi
+fi
+
 if test "x$enable_debug" = xyes; then
     DEFINES="$DEFINES -DDEBUG"
+    if test "x$enable_profile" = xyes; then
+        AC_MSG_WARN([Debug and Profile are enabled at the same time])
+    fi
     if test "x$GCC" = xyes; then
         if ! echo "$CFLAGS" | grep -q -e '-g'; then
             CFLAGS="$CFLAGS -g"
diff --git a/docs/contents.html b/docs/contents.html
index 5e0f514c1e1..294804f6f5a 100644
--- a/docs/contents.html
+++ b/docs/contents.html
@@ -90,6 +90,7 @@
 <li><a href="http://www.opengl.org" target="_parent">OpenGL website</a>
 <li><a href="http://dri.freedesktop.org" target="_parent">DRI website</a>
 <li><a href="http://www.freedesktop.org" target="_parent">freedesktop.org</a>
+<li><a href="http://planet.freedesktop.org" target="_parent">Developer blogs</a>
 </ul>
 
 <b>Hosted by:</b>
diff --git a/docs/relnotes/11.2.0.html b/docs/relnotes/11.2.0.html
index 23bb31c6235..616c134a768 100644
--- a/docs/relnotes/11.2.0.html
+++ b/docs/relnotes/11.2.0.html
@@ -47,6 +47,8 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_base_instance on freedreno/a4xx</li>
 <li>GL_ARB_compute_shader on i965</li>
 <li>GL_ARB_copy_image on r600</li>
+<li>GL_ARB_indirect_parameters on nvc0</li>
+<li>GL_ARB_shader_draw_parameters on i965, nvc0</li>
 <li>GL_ARB_tessellation_shader on i965 and r600 (evergreen/cayman only)</li>
 <li>GL_ARB_texture_buffer_object_rgb32 on freedreno/a4xx</li>
 <li>GL_ARB_texture_buffer_range on freedreno/a4xx</li>
diff --git a/include/pci_ids/i965_pci_ids.h b/include/pci_ids/i965_pci_ids.h
index 5891ba67ea4..5139e279bcc 100644
--- a/include/pci_ids/i965_pci_ids.h
+++ b/include/pci_ids/i965_pci_ids.h
@@ -132,6 +132,28 @@ CHIPSET(0x1932, skl_gt4, "Intel(R) Skylake GT4")
 CHIPSET(0x193A, skl_gt4, "Intel(R) Skylake GT4")
 CHIPSET(0x193B, skl_gt4, "Intel(R) Skylake GT4")
 CHIPSET(0x193D, skl_gt4, "Intel(R) Skylake GT4")
+CHIPSET(0x5902, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x5906, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x590A, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x590B, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x590E, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x5913, kbl_gt1_5, "Intel(R) Kabylake GT1.5")
+CHIPSET(0x5915, kbl_gt1_5, "Intel(R) Kabylake GT1.5")
+CHIPSET(0x5917, kbl_gt1_5, "Intel(R) Kabylake GT1.5")
+CHIPSET(0x5912, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x5916, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x591A, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x591B, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x591D, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x591E, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x5921, kbl_gt2, "Intel(R) Kabylake GT2F")
+CHIPSET(0x5926, kbl_gt3, "Intel(R) Kabylake GT3")
+CHIPSET(0x592A, kbl_gt3, "Intel(R) Kabylake GT3")
+CHIPSET(0x592B, kbl_gt3, "Intel(R) Kabylake GT3")
+CHIPSET(0x5932, kbl_gt4, "Intel(R) Kabylake GT4")
+CHIPSET(0x593A, kbl_gt4, "Intel(R) Kabylake GT4")
+CHIPSET(0x593B, kbl_gt4, "Intel(R) Kabylake GT4")
+CHIPSET(0x593D, kbl_gt4, "Intel(R) Kabylake GT4")
 CHIPSET(0x22B0, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B1, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B2, chv,     "Intel(R) HD Graphics (Cherryview)")
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index 3ce550a3ae8..e85ae16c1df 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -938,7 +938,7 @@ draw_aaline_prepare_outputs(struct draw_context *draw,
    const struct pipe_rasterizer_state *rast = draw->rasterizer;
 
    /* update vertex attrib info */
-   aaline->pos_slot = draw_current_shader_position_output(draw);;
+   aaline->pos_slot = draw_current_shader_position_output(draw);
 
    if (!rast->line_smooth)
       return;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index 67d8ecaa35f..2d92d650ab6 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -611,6 +611,8 @@ do_clip_line(struct draw_stage *stage,
    struct prim_header newprim;
    int viewport_index;
 
+   newprim.flags = header->flags;
+
    if (stage->draw->rasterizer->flatshade_first) {
       prov_vertex = v0;
    }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
index 2517d610e71..c465c7526f5 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
@@ -86,27 +86,33 @@ inject_front_face_info(struct draw_stage *stage,
 }
 
    
-static void point( struct draw_stage *stage,
-		   struct vertex_header *v0 )
+static void point(struct draw_stage *stage,
+                  struct prim_header *header,
+                  struct vertex_header *v0)
 {
    struct prim_header tmp;
+   tmp.det = header->det;
+   tmp.flags = 0;
    tmp.v[0] = v0;
-   stage->next->point( stage->next, &tmp );
+   stage->next->point(stage->next, &tmp);
 }
 
-static void line( struct draw_stage *stage,
-		  struct vertex_header *v0,
-		  struct vertex_header *v1 )
+static void line(struct draw_stage *stage,
+                 struct prim_header *header,
+                 struct vertex_header *v0,
+                 struct vertex_header *v1)
 {
    struct prim_header tmp;
+   tmp.det = header->det;
+   tmp.flags = 0;
    tmp.v[0] = v0;
    tmp.v[1] = v1;
-   stage->next->line( stage->next, &tmp );
+   stage->next->line(stage->next, &tmp);
 }
 
 
-static void points( struct draw_stage *stage,
-		    struct prim_header *header )
+static void points(struct draw_stage *stage,
+                   struct prim_header *header)
 {
    struct vertex_header *v0 = header->v[0];
    struct vertex_header *v1 = header->v[1];
@@ -114,27 +120,41 @@ static void points( struct draw_stage *stage,
 
    inject_front_face_info(stage, header);
 
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_0) && v0->edgeflag) point( stage, v0 );
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_1) && v1->edgeflag) point( stage, v1 );
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_2) && v2->edgeflag) point( stage, v2 );
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_0) && v0->edgeflag)
+      point(stage, header, v0);
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_1) && v1->edgeflag)
+      point(stage, header, v1);
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_2) && v2->edgeflag)
+      point(stage, header, v2);
 }
 
 
-static void lines( struct draw_stage *stage,
-		   struct prim_header *header )
+static void lines(struct draw_stage *stage,
+                  struct prim_header *header)
 {
    struct vertex_header *v0 = header->v[0];
    struct vertex_header *v1 = header->v[1];
    struct vertex_header *v2 = header->v[2];
 
    if (header->flags & DRAW_PIPE_RESET_STIPPLE)
-      stage->next->reset_stipple_counter( stage->next );
+      /*
+       * XXX could revisit this. The only stage which cares is the line
+       * stipple stage. Could just emit correct reset flags here and not
+       * bother about all the calling through reset_stipple_counter
+       * stages. Though technically it is necessary if line stipple is
+       * handled by the driver, but this is not actually hooked up when
+       * using vbuf (vbuf stage reset_stipple_counter does nothing).
+       */
+      stage->next->reset_stipple_counter(stage->next);
 
    inject_front_face_info(stage, header);
 
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_2) && v2->edgeflag) line( stage, v2, v0 );
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_0) && v0->edgeflag) line( stage, v0, v1 );
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_1) && v1->edgeflag) line( stage, v1, v2 );
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_2) && v2->edgeflag)
+      line(stage, header, v2, v0);
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_0) && v0->edgeflag)
+      line(stage, header, v0, v1);
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_1) && v1->edgeflag)
+      line(stage, header, v1, v2);
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index f36706cee01..6df7149b531 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -74,9 +74,10 @@ struct vbuf_stage {
    unsigned max_indices;
    unsigned nr_indices;
 
-   /* Cache point size somewhere it's address won't change:
+   /* Cache point size somewhere its address won't change:
     */
    float point_size;
+   float zero4[4];
 
    struct translate_cache *cache;
 };
@@ -205,6 +206,7 @@ vbuf_start_prim( struct vbuf_stage *vbuf, uint prim )
    struct translate_key hw_key;
    unsigned dst_offset;
    unsigned i;
+   const struct vertex_info *vinfo;
 
    vbuf->render->set_primitive(vbuf->render, prim);
 
@@ -215,27 +217,33 @@ vbuf_start_prim( struct vbuf_stage *vbuf, uint prim )
     * state change.
     */
    vbuf->vinfo = vbuf->render->get_vertex_info(vbuf->render);
-   vbuf->vertex_size = vbuf->vinfo->size * sizeof(float);
+   vinfo = vbuf->vinfo;
+   vbuf->vertex_size = vinfo->size * sizeof(float);
 
    /* Translate from pipeline vertices to hw vertices.
     */
    dst_offset = 0;
 
-   for (i = 0; i < vbuf->vinfo->num_attribs; i++) {
+   for (i = 0; i < vinfo->num_attribs; i++) {
       unsigned emit_sz = 0;
       unsigned src_buffer = 0;
       enum pipe_format output_format;
-      unsigned src_offset = (vbuf->vinfo->attrib[i].src_index * 4 * sizeof(float) );
+      unsigned src_offset = (vinfo->attrib[i].src_index * 4 * sizeof(float) );
 
-      output_format = draw_translate_vinfo_format(vbuf->vinfo->attrib[i].emit);
-      emit_sz = draw_translate_vinfo_size(vbuf->vinfo->attrib[i].emit);
+      output_format = draw_translate_vinfo_format(vinfo->attrib[i].emit);
+      emit_sz = draw_translate_vinfo_size(vinfo->attrib[i].emit);
 
       /* doesn't handle EMIT_OMIT */
       assert(emit_sz != 0);
 
-      if (vbuf->vinfo->attrib[i].emit == EMIT_1F_PSIZE) {
-	 src_buffer = 1;
-	 src_offset = 0;
+      if (vinfo->attrib[i].emit == EMIT_1F_PSIZE) {
+         src_buffer = 1;
+         src_offset = 0;
+      }
+      else if (vinfo->attrib[i].src_index == DRAW_ATTR_NONEXIST) {
+         /* elements which don't exist will get assigned zeros */
+         src_buffer = 2;
+         src_offset = 0;
       }
 
       hw_key.element[i].type = TRANSLATE_ELEMENT_NORMAL;
@@ -249,7 +257,7 @@ vbuf_start_prim( struct vbuf_stage *vbuf, uint prim )
       dst_offset += emit_sz;
    }
 
-   hw_key.nr_elements = vbuf->vinfo->num_attribs;
+   hw_key.nr_elements = vinfo->num_attribs;
    hw_key.output_stride = vbuf->vertex_size;
 
    /* Don't bother with caching at this stage:
@@ -261,6 +269,7 @@ vbuf_start_prim( struct vbuf_stage *vbuf, uint prim )
       vbuf->translate = translate_cache_find(vbuf->cache, &hw_key);
 
       vbuf->translate->set_buffer(vbuf->translate, 1, &vbuf->point_size, 0, ~0);
+      vbuf->translate->set_buffer(vbuf->translate, 2, &vbuf->zero4[0], 0, ~0);
    }
 
    vbuf->point_size = vbuf->stage.draw->rasterizer->point_size;
@@ -428,7 +437,7 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
    struct vbuf_stage *vbuf = CALLOC_STRUCT(vbuf_stage);
    if (!vbuf)
       goto fail;
-   
+
    vbuf->stage.draw = draw;
    vbuf->stage.name = "vbuf";
    vbuf->stage.point = vbuf_first_point;
@@ -437,29 +446,30 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
    vbuf->stage.flush = vbuf_flush;
    vbuf->stage.reset_stipple_counter = vbuf_reset_stipple_counter;
    vbuf->stage.destroy = vbuf_destroy;
-   
+
    vbuf->render = render;
    vbuf->max_indices = MIN2(render->max_indices, UNDEFINED_VERTEX_ID-1);
 
-   vbuf->indices = (ushort *) align_malloc( vbuf->max_indices * 
-					    sizeof(vbuf->indices[0]), 
-					    16 );
+   vbuf->indices = (ushort *) align_malloc(vbuf->max_indices *
+                    sizeof(vbuf->indices[0]),
+                    16);
    if (!vbuf->indices)
       goto fail;
 
    vbuf->cache = translate_cache_create();
-   if (!vbuf->cache) 
+   if (!vbuf->cache)
       goto fail;
-      
-   
+
    vbuf->vertices = NULL;
    vbuf->vertex_ptr = vbuf->vertices;
-   
+
+   vbuf->zero4[0] = vbuf->zero4[1] = vbuf->zero4[2] = vbuf->zero4[3] = 0.0f;
+
    return &vbuf->stage;
 
- fail:
+fail:
    if (vbuf)
       vbuf_destroy(&vbuf->stage);
-   
+
    return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 0204b439dee..5a49acb64d3 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -524,7 +524,7 @@ draw_vbo(struct draw_context *draw,
 #endif
    {
       if (index_limit == 0) {
-      /* one of the buffers is too small to do any valid drawing */
+         /* one of the buffers is too small to do any valid drawing */
          debug_warning("draw: VBO too small to draw anything\n");
          util_fpstate_set(fpstate);
          return;
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index 0b9fab5721c..6fb630b5498 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -44,6 +44,9 @@ struct pt_emit {
    unsigned prim;
 
    const struct vertex_info *vinfo;
+
+   float zero4[4];
+
 };
 
 
@@ -92,6 +95,11 @@ draw_pt_emit_prepare(struct pt_emit *emit,
          src_buffer = 1;
          src_offset = 0;
       }
+      else if (vinfo->attrib[i].src_index == DRAW_ATTR_NONEXIST) {
+         /* elements which don't exist will get assigned zeros */
+         src_buffer = 2;
+         src_offset = 0;
+      }
 
       hw_key.element[i].type = TRANSLATE_ELEMENT_NORMAL;
       hw_key.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
@@ -111,6 +119,8 @@ draw_pt_emit_prepare(struct pt_emit *emit,
        translate_key_compare(&emit->translate->key, &hw_key) != 0) {
       translate_key_sanitize(&hw_key);
       emit->translate = translate_cache_find(emit->cache, &hw_key);
+
+      emit->translate->set_buffer(emit->translate, 2, &emit->zero4[0], 0, ~0);
    }
 
    if (!vinfo->size)
@@ -287,6 +297,8 @@ draw_pt_emit_create(struct draw_context *draw)
       return NULL;
    }
 
+   emit->zero4[0] = emit->zero4[1] = emit->zero4[2] = emit->zero4[3] = 0.0f;
+
    return emit;
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h
index ee11d2f9276..c7b1afe5dde 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.h
+++ b/src/gallium/auxiliary/draw/draw_vertex.h
@@ -44,6 +44,7 @@
 #include "util/u_debug.h"
 #include "util/u_memory.h"
 
+#define DRAW_ATTR_NONEXIST 255
 
 /**
  * Vertex attribute emit modes
@@ -61,18 +62,6 @@ enum attrib_emit {
 
 
 /**
- * Attribute interpolation mode
- */
-enum interp_mode {
-   INTERP_NONE,      /**< never interpolate vertex header info */
-   INTERP_POS,       /**< special case for frag position */
-   INTERP_CONSTANT,
-   INTERP_LINEAR,
-   INTERP_PERSPECTIVE
-};
-
-
-/**
  * Information about hardware/rasterization vertex layout.
  */
 struct vertex_info
@@ -85,8 +74,7 @@ struct vertex_info
     * memcmp() comparisons.
     */
    struct {
-      unsigned interp_mode:4;      /**< INTERP_x */
-      unsigned emit:4;             /**< EMIT_x */
+      unsigned emit:8;             /**< EMIT_x */
       unsigned src_index:8;          /**< map to post-xform attribs */
    } attrib[PIPE_MAX_SHADER_OUTPUTS];
 };
@@ -124,20 +112,18 @@ draw_vinfo_copy( struct vertex_info *dst,
 static inline uint
 draw_emit_vertex_attr(struct vertex_info *vinfo,
                       enum attrib_emit emit, 
-                      enum interp_mode interp, /* only used by softpipe??? */
                       int src_index)
 {
    const uint n = vinfo->num_attribs;
 
    /* If the src_index is negative, meaning it hasn't been found
-    * lets just redirect it to the first output slot */
+    * we'll assign it all zeros later - set to DRAW_ATTR_NONEXIST */
    if (src_index < 0) {
-      src_index = 0;
+      src_index = DRAW_ATTR_NONEXIST;
    }
 
    assert(n < Elements(vinfo->attrib));
    vinfo->attrib[n].emit = emit;
-   vinfo->attrib[n].interp_mode = interp;
    vinfo->attrib[n].src_index = src_index;
    vinfo->num_attribs++;
    return n;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 14244470c90..7854142f736 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -458,7 +458,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
    {
       /* Special case 4x4f --> 1x16ub */
       if (src_type.length == 4 &&
-          util_cpu_caps.has_sse2)
+            (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
       {
          num_dsts = (num_srcs + 3) / 4;
          dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;
@@ -545,7 +545,7 @@ lp_build_conv(struct gallivm_state *gallivm,
        ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
         (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&
 
-       util_cpu_caps.has_sse2)
+       (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
    {
       struct lp_build_context bld;
       struct lp_type int16_type, int32_type;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
index ad64ae058b6..4598db851ae 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_limits.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
@@ -136,6 +136,7 @@ gallivm_get_shader_param(enum pipe_shader_cap param)
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
index b1aef715e20..f5718389f33 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -720,7 +720,7 @@ lp_build_transpose_aos_n(struct gallivm_state *gallivm,
 
       default:
          assert(0);
-   };
+   }
 }
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 09c1b379172..8c39ab0afe9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -1197,7 +1197,7 @@ get_soa_array_offsets(struct lp_build_context *uint_bld,
 
    if (need_perelement_offset) {
       LLVMValueRef pixel_offsets;
-      int i;
+      unsigned i;
      /* build pixel offset vector: {0, 1, 2, 3, ...} */
       pixel_offsets = uint_bld->undef;
       for (i = 0; i < uint_bld->type.length; i++) {
@@ -1809,7 +1809,7 @@ emit_store_double_chan(struct lp_build_tgsi_context *bld_base,
    struct gallivm_state *gallivm = bld_base->base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_build_context *float_bld = &bld_base->base;
-   int i;
+   unsigned i;
    LLVMValueRef temp, temp2;
    LLVMValueRef shuffles[8];
    LLVMValueRef shuffles2[8];
@@ -2713,7 +2713,7 @@ static boolean
 near_end_of_shader(struct lp_build_tgsi_soa_context *bld,
                    int pc)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < 5; i++) {
       unsigned opcode;
diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index c5c33327702..75afebe4919 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -431,7 +431,7 @@ hud_alloc_vertices(struct hud_context *hud, struct vertex_queue *v,
    v->max_num_vertices = num_vertices;
    v->vbuf.stride = stride;
    u_upload_alloc(hud->uploader, 0, v->vbuf.stride * v->max_num_vertices,
-                  &v->vbuf.buffer_offset, &v->vbuf.buffer,
+                  16, &v->vbuf.buffer_offset, &v->vbuf.buffer,
                   (void**)&v->vertices);
 }
 
@@ -1176,8 +1176,8 @@ hud_create(struct pipe_context *pipe, struct cso_context *cso)
 
    hud->pipe = pipe;
    hud->cso = cso;
-   hud->uploader = u_upload_create(pipe, 256 * 1024, 16,
-                                   PIPE_BIND_VERTEX_BUFFER);
+   hud->uploader = u_upload_create(pipe, 256 * 1024,
+                                   PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_STREAM);
 
    /* font */
    if (!util_font_create(pipe, UTIL_FONT_FIXED_8X13, &hud->font)) {
diff --git a/src/gallium/auxiliary/indices/u_primconvert.c b/src/gallium/auxiliary/indices/u_primconvert.c
index 70d3e8530b8..5effd883f67 100644
--- a/src/gallium/auxiliary/indices/u_primconvert.c
+++ b/src/gallium/auxiliary/indices/u_primconvert.c
@@ -153,10 +153,11 @@ util_primconvert_draw_vbo(struct primconvert_context *pc,
    }
 
    if (!pc->upload) {
-      pc->upload = u_upload_create(pc->pipe, 4096, 4, PIPE_BIND_INDEX_BUFFER);
+      pc->upload = u_upload_create(pc->pipe, 4096, PIPE_BIND_INDEX_BUFFER,
+                                   PIPE_USAGE_STREAM);
    }
 
-   u_upload_alloc(pc->upload, 0, new_ib.index_size * new_info.count,
+   u_upload_alloc(pc->upload, 0, new_ib.index_size * new_info.count, 4,
                   &new_ib.offset, &new_ib.buffer, &dst);
 
    if (info->indexed) {
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 94d992b0031..7c577592f70 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -1950,7 +1950,7 @@ tgsi_processor_to_shader_stage(unsigned processor)
    case TGSI_PROCESSOR_COMPUTE:   return MESA_SHADER_COMPUTE;
    default:
       unreachable("invalid TGSI processor");
-   };
+   }
 }
 
 struct nir_shader *
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index fdb7febf7ea..ea207461d27 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -259,36 +259,39 @@ tgsi_build_declaration_semantic(
    return ds;
 }
 
-static struct tgsi_declaration_resource
-tgsi_default_declaration_resource(void)
+static struct tgsi_declaration_image
+tgsi_default_declaration_image(void)
 {
-   struct tgsi_declaration_resource dr;
+   struct tgsi_declaration_image di;
 
-   dr.Resource = TGSI_TEXTURE_BUFFER;
-   dr.Raw = 0;
-   dr.Writable = 0;
-   dr.Padding = 0;
+   di.Resource = TGSI_TEXTURE_BUFFER;
+   di.Raw = 0;
+   di.Writable = 0;
+   di.Format = 0;
+   di.Padding = 0;
 
-   return dr;
+   return di;
 }
 
-static struct tgsi_declaration_resource
-tgsi_build_declaration_resource(unsigned texture,
-                                unsigned raw,
-                                unsigned writable,
-                                struct tgsi_declaration *declaration,
-                                struct tgsi_header *header)
+static struct tgsi_declaration_image
+tgsi_build_declaration_image(unsigned texture,
+                             unsigned format,
+                             unsigned raw,
+                             unsigned writable,
+                             struct tgsi_declaration *declaration,
+                             struct tgsi_header *header)
 {
-   struct tgsi_declaration_resource dr;
+   struct tgsi_declaration_image di;
 
-   dr = tgsi_default_declaration_resource();
-   dr.Resource = texture;
-   dr.Raw = raw;
-   dr.Writable = writable;
+   di = tgsi_default_declaration_image();
+   di.Resource = texture;
+   di.Format = format;
+   di.Raw = raw;
+   di.Writable = writable;
 
    declaration_grow(declaration, header);
 
-   return dr;
+   return di;
 }
 
 static struct tgsi_declaration_sampler_view
@@ -364,7 +367,7 @@ tgsi_default_full_declaration( void )
    full_declaration.Range = tgsi_default_declaration_range();
    full_declaration.Semantic = tgsi_default_declaration_semantic();
    full_declaration.Interp = tgsi_default_declaration_interp();
-   full_declaration.Resource = tgsi_default_declaration_resource();
+   full_declaration.Image = tgsi_default_declaration_image();
    full_declaration.SamplerView = tgsi_default_declaration_sampler_view();
    full_declaration.Array = tgsi_default_declaration_array();
 
@@ -454,20 +457,21 @@ tgsi_build_full_declaration(
          header );
    }
 
-   if (full_decl->Declaration.File == TGSI_FILE_RESOURCE) {
-      struct tgsi_declaration_resource *dr;
+   if (full_decl->Declaration.File == TGSI_FILE_IMAGE) {
+      struct tgsi_declaration_image *di;
 
       if (maxsize <= size) {
          return  0;
       }
-      dr = (struct tgsi_declaration_resource *)&tokens[size];
+      di = (struct tgsi_declaration_image *)&tokens[size];
       size++;
 
-      *dr = tgsi_build_declaration_resource(full_decl->Resource.Resource,
-                                            full_decl->Resource.Raw,
-                                            full_decl->Resource.Writable,
-                                            declaration,
-                                            header);
+      *di = tgsi_build_declaration_image(full_decl->Image.Resource,
+                                         full_decl->Image.Format,
+                                         full_decl->Image.Raw,
+                                         full_decl->Image.Writable,
+                                         declaration,
+                                         header);
    }
 
    if (full_decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
@@ -616,7 +620,8 @@ tgsi_default_instruction( void )
    instruction.NumSrcRegs = 1;
    instruction.Label = 0;
    instruction.Texture = 0;
-   instruction.Padding  = 0;
+   instruction.Memory = 0;
+   instruction.Padding = 0;
 
    return instruction;
 }
@@ -762,6 +767,34 @@ tgsi_build_instruction_texture(
    return instruction_texture;
 }
 
+static struct tgsi_instruction_memory
+tgsi_default_instruction_memory( void )
+{
+   struct tgsi_instruction_memory instruction_memory;
+
+   instruction_memory.Qualifier = 0;
+   instruction_memory.Padding = 0;
+
+   return instruction_memory;
+}
+
+static struct tgsi_instruction_memory
+tgsi_build_instruction_memory(
+   unsigned qualifier,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_instruction_memory instruction_memory;
+
+   instruction_memory.Qualifier = qualifier;
+   instruction_memory.Padding = 0;
+   instruction->Memory = 1;
+
+   instruction_grow( instruction, header );
+
+   return instruction_memory;
+}
 
 static struct tgsi_texture_offset
 tgsi_default_texture_offset( void )
@@ -1008,6 +1041,7 @@ tgsi_default_full_instruction( void )
    full_instruction.Predicate = tgsi_default_instruction_predicate();
    full_instruction.Label = tgsi_default_instruction_label();
    full_instruction.Texture = tgsi_default_instruction_texture();
+   full_instruction.Memory = tgsi_default_instruction_memory();
    for( i = 0;  i < TGSI_FULL_MAX_TEX_OFFSETS; i++ ) {
       full_instruction.TexOffsets[i] = tgsi_default_texture_offset();
    }
@@ -1119,6 +1153,24 @@ tgsi_build_full_instruction(
          prev_token = (struct tgsi_token *) texture_offset;
       }
    }
+
+   if (full_inst->Instruction.Memory) {
+      struct tgsi_instruction_memory *instruction_memory;
+
+      if( maxsize <= size )
+         return 0;
+      instruction_memory =
+         (struct  tgsi_instruction_memory *) &tokens[size];
+      size++;
+
+      *instruction_memory = tgsi_build_instruction_memory(
+         full_inst->Memory.Qualifier,
+         prev_token,
+         instruction,
+         header );
+      prev_token = (struct tgsi_token  *) instruction_memory;
+   }
+
    for( i = 0;  i <   full_inst->Instruction.NumDstRegs; i++ ) {
       const struct tgsi_full_dst_register *reg = &full_inst->Dst[i];
       struct tgsi_dst_register *dst_register;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index e29ffb39894..2ad29b9d49a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -348,15 +348,22 @@ iter_declaration(
       }
    }
 
-   if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
+   if (decl->Declaration.File == TGSI_FILE_IMAGE) {
       TXT(", ");
-      ENM(decl->Resource.Resource, tgsi_texture_names);
-      if (decl->Resource.Writable)
+      ENM(decl->Image.Resource, tgsi_texture_names);
+      TXT(", ");
+      UID(decl->Image.Format);
+      if (decl->Image.Writable)
          TXT(", WR");
-      if (decl->Resource.Raw)
+      if (decl->Image.Raw)
          TXT(", RAW");
    }
 
+   if (decl->Declaration.File == TGSI_FILE_BUFFER) {
+      if (decl->Declaration.Atomic)
+         TXT(", ATOMIC");
+   }
+
    if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
       TXT(", ");
       ENM(decl->SamplerView.Resource, tgsi_texture_names);
@@ -617,6 +624,16 @@ iter_instruction(
       }
    }
 
+   if (inst->Instruction.Memory) {
+      uint32_t qualifier = inst->Memory.Qualifier;
+      while (qualifier) {
+         int bit = ffs(qualifier) - 1;
+         qualifier &= ~(1U << bit);
+         TXT(", ");
+         ENM(bit, tgsi_memory_names);
+      }
+   }
+
    switch (inst->Instruction.Opcode) {
    case TGSI_OPCODE_IF:
    case TGSI_OPCODE_UIF:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index f86adcec506..26fec8e2142 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -473,6 +473,7 @@ tgsi_exec_get_shader_param(enum pipe_shader_cap param)
       return 1;
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index 3b40c3de97d..b270dd73b67 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -37,231 +37,231 @@
 
 static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
 {
-   { 1, 1, 0, 0, 0, 0, COMP, "ARL", TGSI_OPCODE_ARL },
-   { 1, 1, 0, 0, 0, 0, COMP, "MOV", TGSI_OPCODE_MOV },
-   { 1, 1, 0, 0, 0, 0, CHAN, "LIT", TGSI_OPCODE_LIT },
-   { 1, 1, 0, 0, 0, 0, REPL, "RCP", TGSI_OPCODE_RCP },
-   { 1, 1, 0, 0, 0, 0, REPL, "RSQ", TGSI_OPCODE_RSQ },
-   { 1, 1, 0, 0, 0, 0, CHAN, "EXP", TGSI_OPCODE_EXP },
-   { 1, 1, 0, 0, 0, 0, CHAN, "LOG", TGSI_OPCODE_LOG },
-   { 1, 2, 0, 0, 0, 0, COMP, "MUL", TGSI_OPCODE_MUL },
-   { 1, 2, 0, 0, 0, 0, COMP, "ADD", TGSI_OPCODE_ADD },
-   { 1, 2, 0, 0, 0, 0, REPL, "DP3", TGSI_OPCODE_DP3 },
-   { 1, 2, 0, 0, 0, 0, REPL, "DP4", TGSI_OPCODE_DP4 },
-   { 1, 2, 0, 0, 0, 0, CHAN, "DST", TGSI_OPCODE_DST },
-   { 1, 2, 0, 0, 0, 0, COMP, "MIN", TGSI_OPCODE_MIN },
-   { 1, 2, 0, 0, 0, 0, COMP, "MAX", TGSI_OPCODE_MAX },
-   { 1, 2, 0, 0, 0, 0, COMP, "SLT", TGSI_OPCODE_SLT },
-   { 1, 2, 0, 0, 0, 0, COMP, "SGE", TGSI_OPCODE_SGE },
-   { 1, 3, 0, 0, 0, 0, COMP, "MAD", TGSI_OPCODE_MAD },
-   { 1, 2, 0, 0, 0, 0, COMP, "SUB", TGSI_OPCODE_SUB },
-   { 1, 3, 0, 0, 0, 0, COMP, "LRP", TGSI_OPCODE_LRP },
-   { 1, 3, 0, 0, 0, 0, COMP, "FMA", TGSI_OPCODE_FMA },
-   { 1, 1, 0, 0, 0, 0, REPL, "SQRT", TGSI_OPCODE_SQRT },
-   { 1, 3, 0, 0, 0, 0, REPL, "DP2A", TGSI_OPCODE_DP2A },
-   { 0, 0, 0, 0, 0, 0, NONE, "", 22 },      /* removed */
-   { 0, 0, 0, 0, 0, 0, NONE, "", 23 },      /* removed */
-   { 1, 1, 0, 0, 0, 0, COMP, "FRC", TGSI_OPCODE_FRC },
-   { 1, 3, 0, 0, 0, 0, COMP, "CLAMP", TGSI_OPCODE_CLAMP },
-   { 1, 1, 0, 0, 0, 0, COMP, "FLR", TGSI_OPCODE_FLR },
-   { 1, 1, 0, 0, 0, 0, COMP, "ROUND", TGSI_OPCODE_ROUND },
-   { 1, 1, 0, 0, 0, 0, REPL, "EX2", TGSI_OPCODE_EX2 },
-   { 1, 1, 0, 0, 0, 0, REPL, "LG2", TGSI_OPCODE_LG2 },
-   { 1, 2, 0, 0, 0, 0, REPL, "POW", TGSI_OPCODE_POW },
-   { 1, 2, 0, 0, 0, 0, COMP, "XPD", TGSI_OPCODE_XPD },
-   { 0, 0, 0, 0, 0, 0, NONE, "", 32 },      /* removed */
-   { 1, 1, 0, 0, 0, 0, COMP, "ABS", TGSI_OPCODE_ABS },
-   { 0, 0, 0, 0, 0, 0, NONE, "", 34 },      /* removed */
-   { 1, 2, 0, 0, 0, 0, REPL, "DPH", TGSI_OPCODE_DPH },
-   { 1, 1, 0, 0, 0, 0, REPL, "COS", TGSI_OPCODE_COS },
-   { 1, 1, 0, 0, 0, 0, COMP, "DDX", TGSI_OPCODE_DDX },
-   { 1, 1, 0, 0, 0, 0, COMP, "DDY", TGSI_OPCODE_DDY },
-   { 0, 0, 0, 0, 0, 0, NONE, "KILL", TGSI_OPCODE_KILL },
-   { 1, 1, 0, 0, 0, 0, COMP, "PK2H", TGSI_OPCODE_PK2H },
-   { 1, 1, 0, 0, 0, 0, COMP, "PK2US", TGSI_OPCODE_PK2US },
-   { 1, 1, 0, 0, 0, 0, COMP, "PK4B", TGSI_OPCODE_PK4B },
-   { 1, 1, 0, 0, 0, 0, COMP, "PK4UB", TGSI_OPCODE_PK4UB },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 44 },      /* removed */
-   { 1, 2, 0, 0, 0, 0, COMP, "SEQ", TGSI_OPCODE_SEQ },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 46 },      /* removed */
-   { 1, 2, 0, 0, 0, 0, COMP, "SGT", TGSI_OPCODE_SGT },
-   { 1, 1, 0, 0, 0, 0, REPL, "SIN", TGSI_OPCODE_SIN },
-   { 1, 2, 0, 0, 0, 0, COMP, "SLE", TGSI_OPCODE_SLE },
-   { 1, 2, 0, 0, 0, 0, COMP, "SNE", TGSI_OPCODE_SNE },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 51 },      /* removed */
-   { 1, 2, 1, 0, 0, 0, OTHR, "TEX", TGSI_OPCODE_TEX },
-   { 1, 4, 1, 0, 0, 0, OTHR, "TXD", TGSI_OPCODE_TXD },
-   { 1, 2, 1, 0, 0, 0, OTHR, "TXP", TGSI_OPCODE_TXP },
-   { 1, 1, 0, 0, 0, 0, COMP, "UP2H", TGSI_OPCODE_UP2H },
-   { 1, 1, 0, 0, 0, 0, COMP, "UP2US", TGSI_OPCODE_UP2US },
-   { 1, 1, 0, 0, 0, 0, COMP, "UP4B", TGSI_OPCODE_UP4B },
-   { 1, 1, 0, 0, 0, 0, COMP, "UP4UB", TGSI_OPCODE_UP4UB },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 59 },      /* removed */
-   { 0, 1, 0, 0, 0, 1, NONE, "", 60 },      /* removed */
-   { 1, 1, 0, 0, 0, 0, COMP, "ARR", TGSI_OPCODE_ARR },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 62 },      /* removed */
-   { 0, 0, 0, 1, 0, 0, NONE, "CAL", TGSI_OPCODE_CAL },
-   { 0, 0, 0, 0, 0, 0, NONE, "RET", TGSI_OPCODE_RET },
-   { 1, 1, 0, 0, 0, 0, COMP, "SSG", TGSI_OPCODE_SSG },
-   { 1, 3, 0, 0, 0, 0, COMP, "CMP", TGSI_OPCODE_CMP },
-   { 1, 1, 0, 0, 0, 0, CHAN, "SCS", TGSI_OPCODE_SCS },
-   { 1, 2, 1, 0, 0, 0, OTHR, "TXB", TGSI_OPCODE_TXB },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 69 },      /* removed */
-   { 1, 2, 0, 0, 0, 0, COMP, "DIV", TGSI_OPCODE_DIV },
-   { 1, 2, 0, 0, 0, 0, REPL, "DP2", TGSI_OPCODE_DP2 },
-   { 1, 2, 1, 0, 0, 0, OTHR, "TXL", TGSI_OPCODE_TXL },
-   { 0, 0, 0, 0, 0, 0, NONE, "BRK", TGSI_OPCODE_BRK },
-   { 0, 1, 0, 1, 0, 1, NONE, "IF", TGSI_OPCODE_IF },
-   { 0, 1, 0, 1, 0, 1, NONE, "UIF", TGSI_OPCODE_UIF },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 76 },      /* removed */
-   { 0, 0, 0, 1, 1, 1, NONE, "ELSE", TGSI_OPCODE_ELSE },
-   { 0, 0, 0, 0, 1, 0, NONE, "ENDIF", TGSI_OPCODE_ENDIF },
-   { 1, 1, 0, 0, 0, 0, COMP, "DDX_FINE", TGSI_OPCODE_DDX_FINE },
-   { 1, 1, 0, 0, 0, 0, COMP, "DDY_FINE", TGSI_OPCODE_DDY_FINE },
-   { 0, 1, 0, 0, 0, 0, NONE, "PUSHA", TGSI_OPCODE_PUSHA },
-   { 1, 0, 0, 0, 0, 0, NONE, "POPA", TGSI_OPCODE_POPA },
-   { 1, 1, 0, 0, 0, 0, COMP, "CEIL", TGSI_OPCODE_CEIL },
-   { 1, 1, 0, 0, 0, 0, COMP, "I2F", TGSI_OPCODE_I2F },
-   { 1, 1, 0, 0, 0, 0, COMP, "NOT", TGSI_OPCODE_NOT },
-   { 1, 1, 0, 0, 0, 0, COMP, "TRUNC", TGSI_OPCODE_TRUNC },
-   { 1, 2, 0, 0, 0, 0, COMP, "SHL", TGSI_OPCODE_SHL },
-   { 0, 0, 0, 0, 0, 0, NONE, "", 88 },      /* removed */
-   { 1, 2, 0, 0, 0, 0, COMP, "AND", TGSI_OPCODE_AND },
-   { 1, 2, 0, 0, 0, 0, COMP, "OR", TGSI_OPCODE_OR },
-   { 1, 2, 0, 0, 0, 0, COMP, "MOD", TGSI_OPCODE_MOD },
-   { 1, 2, 0, 0, 0, 0, COMP, "XOR", TGSI_OPCODE_XOR },
-   { 1, 3, 0, 0, 0, 0, COMP, "SAD", TGSI_OPCODE_SAD },
-   { 1, 2, 1, 0, 0, 0, OTHR, "TXF", TGSI_OPCODE_TXF },
-   { 1, 2, 1, 0, 0, 0, OTHR, "TXQ", TGSI_OPCODE_TXQ },
-   { 0, 0, 0, 0, 0, 0, NONE, "CONT", TGSI_OPCODE_CONT },
-   { 0, 1, 0, 0, 0, 0, NONE, "EMIT", TGSI_OPCODE_EMIT },
-   { 0, 1, 0, 0, 0, 0, NONE, "ENDPRIM", TGSI_OPCODE_ENDPRIM },
-   { 0, 0, 0, 1, 0, 1, NONE, "BGNLOOP", TGSI_OPCODE_BGNLOOP },
-   { 0, 0, 0, 0, 0, 1, NONE, "BGNSUB", TGSI_OPCODE_BGNSUB },
-   { 0, 0, 0, 1, 1, 0, NONE, "ENDLOOP", TGSI_OPCODE_ENDLOOP },
-   { 0, 0, 0, 0, 1, 0, NONE, "ENDSUB", TGSI_OPCODE_ENDSUB },
-   { 1, 1, 1, 0, 0, 0, OTHR, "TXQ_LZ", TGSI_OPCODE_TXQ_LZ },
-   { 1, 1, 1, 0, 0, 0, OTHR, "TXQS", TGSI_OPCODE_TXQS },
-   { 0, 0, 0, 0, 0, 0, NONE, "", 105 },     /* removed */
-   { 0, 0, 0, 0, 0, 0, NONE, "", 106 },     /* removed */
-   { 0, 0, 0, 0, 0, 0, NONE, "NOP", TGSI_OPCODE_NOP },
-   { 1, 2, 0, 0, 0, 0, COMP, "FSEQ", TGSI_OPCODE_FSEQ },
-   { 1, 2, 0, 0, 0, 0, COMP, "FSGE", TGSI_OPCODE_FSGE },
-   { 1, 2, 0, 0, 0, 0, COMP, "FSLT", TGSI_OPCODE_FSLT },
-   { 1, 2, 0, 0, 0, 0, COMP, "FSNE", TGSI_OPCODE_FSNE },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 112 },      /* removed */
-   { 0, 1, 0, 0, 0, 0, NONE, "CALLNZ", TGSI_OPCODE_CALLNZ },
-   { 0, 1, 0, 0, 0, 0, NONE, "", 114 },     /* removed */
-   { 0, 1, 0, 0, 0, 0, NONE, "BREAKC", TGSI_OPCODE_BREAKC },
-   { 0, 1, 0, 0, 0, 0, NONE, "KILL_IF", TGSI_OPCODE_KILL_IF },
-   { 0, 0, 0, 0, 0, 0, NONE, "END", TGSI_OPCODE_END },
-   { 1, 3, 0, 0, 0, 0, COMP, "DFMA", TGSI_OPCODE_DFMA },
-   { 1, 1, 0, 0, 0, 0, COMP, "F2I", TGSI_OPCODE_F2I },
-   { 1, 2, 0, 0, 0, 0, COMP, "IDIV", TGSI_OPCODE_IDIV },
-   { 1, 2, 0, 0, 0, 0, COMP, "IMAX", TGSI_OPCODE_IMAX },
-   { 1, 2, 0, 0, 0, 0, COMP, "IMIN", TGSI_OPCODE_IMIN },
-   { 1, 1, 0, 0, 0, 0, COMP, "INEG", TGSI_OPCODE_INEG },
-   { 1, 2, 0, 0, 0, 0, COMP, "ISGE", TGSI_OPCODE_ISGE },
-   { 1, 2, 0, 0, 0, 0, COMP, "ISHR", TGSI_OPCODE_ISHR },
-   { 1, 2, 0, 0, 0, 0, COMP, "ISLT", TGSI_OPCODE_ISLT },
-   { 1, 1, 0, 0, 0, 0, COMP, "F2U", TGSI_OPCODE_F2U },
-   { 1, 1, 0, 0, 0, 0, COMP, "U2F", TGSI_OPCODE_U2F },
-   { 1, 2, 0, 0, 0, 0, COMP, "UADD", TGSI_OPCODE_UADD },
-   { 1, 2, 0, 0, 0, 0, COMP, "UDIV", TGSI_OPCODE_UDIV },
-   { 1, 3, 0, 0, 0, 0, COMP, "UMAD", TGSI_OPCODE_UMAD },
-   { 1, 2, 0, 0, 0, 0, COMP, "UMAX", TGSI_OPCODE_UMAX },
-   { 1, 2, 0, 0, 0, 0, COMP, "UMIN", TGSI_OPCODE_UMIN },
-   { 1, 2, 0, 0, 0, 0, COMP, "UMOD", TGSI_OPCODE_UMOD },
-   { 1, 2, 0, 0, 0, 0, COMP, "UMUL", TGSI_OPCODE_UMUL },
-   { 1, 2, 0, 0, 0, 0, COMP, "USEQ", TGSI_OPCODE_USEQ },
-   { 1, 2, 0, 0, 0, 0, COMP, "USGE", TGSI_OPCODE_USGE },
-   { 1, 2, 0, 0, 0, 0, COMP, "USHR", TGSI_OPCODE_USHR },
-   { 1, 2, 0, 0, 0, 0, COMP, "USLT", TGSI_OPCODE_USLT },
-   { 1, 2, 0, 0, 0, 0, COMP, "USNE", TGSI_OPCODE_USNE },
-   { 0, 1, 0, 0, 0, 0, NONE, "SWITCH", TGSI_OPCODE_SWITCH },
-   { 0, 1, 0, 0, 0, 0, NONE, "CASE", TGSI_OPCODE_CASE },
-   { 0, 0, 0, 0, 0, 0, NONE, "DEFAULT", TGSI_OPCODE_DEFAULT },
-   { 0, 0, 0, 0, 0, 0, NONE, "ENDSWITCH", TGSI_OPCODE_ENDSWITCH },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "ARL", TGSI_OPCODE_ARL },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "MOV", TGSI_OPCODE_MOV },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "LIT", TGSI_OPCODE_LIT },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "RCP", TGSI_OPCODE_RCP },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "RSQ", TGSI_OPCODE_RSQ },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "EXP", TGSI_OPCODE_EXP },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "LOG", TGSI_OPCODE_LOG },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "MUL", TGSI_OPCODE_MUL },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "ADD", TGSI_OPCODE_ADD },
+   { 1, 2, 0, 0, 0, 0, 0, REPL, "DP3", TGSI_OPCODE_DP3 },
+   { 1, 2, 0, 0, 0, 0, 0, REPL, "DP4", TGSI_OPCODE_DP4 },
+   { 1, 2, 0, 0, 0, 0, 0, CHAN, "DST", TGSI_OPCODE_DST },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "MIN", TGSI_OPCODE_MIN },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "MAX", TGSI_OPCODE_MAX },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SLT", TGSI_OPCODE_SLT },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SGE", TGSI_OPCODE_SGE },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "MAD", TGSI_OPCODE_MAD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SUB", TGSI_OPCODE_SUB },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "LRP", TGSI_OPCODE_LRP },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "FMA", TGSI_OPCODE_FMA },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "SQRT", TGSI_OPCODE_SQRT },
+   { 1, 3, 0, 0, 0, 0, 0, REPL, "DP2A", TGSI_OPCODE_DP2A },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 22 },      /* removed */
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 23 },      /* removed */
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "FRC", TGSI_OPCODE_FRC },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "CLAMP", TGSI_OPCODE_CLAMP },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "FLR", TGSI_OPCODE_FLR },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "ROUND", TGSI_OPCODE_ROUND },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "EX2", TGSI_OPCODE_EX2 },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "LG2", TGSI_OPCODE_LG2 },
+   { 1, 2, 0, 0, 0, 0, 0, REPL, "POW", TGSI_OPCODE_POW },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "XPD", TGSI_OPCODE_XPD },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 32 },      /* removed */
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "ABS", TGSI_OPCODE_ABS },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 34 },      /* removed */
+   { 1, 2, 0, 0, 0, 0, 0, REPL, "DPH", TGSI_OPCODE_DPH },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "COS", TGSI_OPCODE_COS },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DDX", TGSI_OPCODE_DDX },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DDY", TGSI_OPCODE_DDY },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "KILL", TGSI_OPCODE_KILL },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "PK2H", TGSI_OPCODE_PK2H },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "PK2US", TGSI_OPCODE_PK2US },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "PK4B", TGSI_OPCODE_PK4B },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "PK4UB", TGSI_OPCODE_PK4UB },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 44 },      /* removed */
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SEQ", TGSI_OPCODE_SEQ },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 46 },      /* removed */
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SGT", TGSI_OPCODE_SGT },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "SIN", TGSI_OPCODE_SIN },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SLE", TGSI_OPCODE_SLE },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SNE", TGSI_OPCODE_SNE },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 51 },      /* removed */
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TEX", TGSI_OPCODE_TEX },
+   { 1, 4, 1, 0, 0, 0, 0, OTHR, "TXD", TGSI_OPCODE_TXD },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TXP", TGSI_OPCODE_TXP },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "UP2H", TGSI_OPCODE_UP2H },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "UP2US", TGSI_OPCODE_UP2US },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "UP4B", TGSI_OPCODE_UP4B },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "UP4UB", TGSI_OPCODE_UP4UB },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 59 },      /* removed */
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 60 },      /* removed */
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "ARR", TGSI_OPCODE_ARR },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 62 },      /* removed */
+   { 0, 0, 0, 0, 1, 0, 0, NONE, "CAL", TGSI_OPCODE_CAL },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "RET", TGSI_OPCODE_RET },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "SSG", TGSI_OPCODE_SSG },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "CMP", TGSI_OPCODE_CMP },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "SCS", TGSI_OPCODE_SCS },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TXB", TGSI_OPCODE_TXB },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 69 },      /* removed */
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DIV", TGSI_OPCODE_DIV },
+   { 1, 2, 0, 0, 0, 0, 0, REPL, "DP2", TGSI_OPCODE_DP2 },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TXL", TGSI_OPCODE_TXL },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "BRK", TGSI_OPCODE_BRK },
+   { 0, 1, 0, 0, 1, 0, 1, NONE, "IF", TGSI_OPCODE_IF },
+   { 0, 1, 0, 0, 1, 0, 1, NONE, "UIF", TGSI_OPCODE_UIF },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 76 },      /* removed */
+   { 0, 0, 0, 0, 1, 1, 1, NONE, "ELSE", TGSI_OPCODE_ELSE },
+   { 0, 0, 0, 0, 0, 1, 0, NONE, "ENDIF", TGSI_OPCODE_ENDIF },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DDX_FINE", TGSI_OPCODE_DDX_FINE },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DDY_FINE", TGSI_OPCODE_DDY_FINE },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "PUSHA", TGSI_OPCODE_PUSHA },
+   { 1, 0, 0, 0, 0, 0, 0, NONE, "POPA", TGSI_OPCODE_POPA },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "CEIL", TGSI_OPCODE_CEIL },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "I2F", TGSI_OPCODE_I2F },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "NOT", TGSI_OPCODE_NOT },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "TRUNC", TGSI_OPCODE_TRUNC },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SHL", TGSI_OPCODE_SHL },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 88 },      /* removed */
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "AND", TGSI_OPCODE_AND },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "OR", TGSI_OPCODE_OR },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "MOD", TGSI_OPCODE_MOD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "XOR", TGSI_OPCODE_XOR },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "SAD", TGSI_OPCODE_SAD },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TXF", TGSI_OPCODE_TXF },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TXQ", TGSI_OPCODE_TXQ },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "CONT", TGSI_OPCODE_CONT },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "EMIT", TGSI_OPCODE_EMIT },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "ENDPRIM", TGSI_OPCODE_ENDPRIM },
+   { 0, 0, 0, 0, 1, 0, 1, NONE, "BGNLOOP", TGSI_OPCODE_BGNLOOP },
+   { 0, 0, 0, 0, 0, 0, 1, NONE, "BGNSUB", TGSI_OPCODE_BGNSUB },
+   { 0, 0, 0, 0, 1, 1, 0, NONE, "ENDLOOP", TGSI_OPCODE_ENDLOOP },
+   { 0, 0, 0, 0, 0, 1, 0, NONE, "ENDSUB", TGSI_OPCODE_ENDSUB },
+   { 1, 1, 1, 0, 0, 0, 0, OTHR, "TXQ_LZ", TGSI_OPCODE_TXQ_LZ },
+   { 1, 1, 1, 0, 0, 0, 0, OTHR, "TXQS", TGSI_OPCODE_TXQS },
+   { 1, 1, 0, 0, 0, 0, 0, NONE, "RESQ", TGSI_OPCODE_RESQ },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 106 },     /* removed */
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "NOP", TGSI_OPCODE_NOP },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "FSEQ", TGSI_OPCODE_FSEQ },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "FSGE", TGSI_OPCODE_FSGE },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "FSLT", TGSI_OPCODE_FSLT },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "FSNE", TGSI_OPCODE_FSNE },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 112 },      /* removed */
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "CALLNZ", TGSI_OPCODE_CALLNZ },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "", 114 },     /* removed */
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "BREAKC", TGSI_OPCODE_BREAKC },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "KILL_IF", TGSI_OPCODE_KILL_IF },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "END", TGSI_OPCODE_END },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "DFMA", TGSI_OPCODE_DFMA },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "F2I", TGSI_OPCODE_F2I },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "IDIV", TGSI_OPCODE_IDIV },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "IMAX", TGSI_OPCODE_IMAX },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "IMIN", TGSI_OPCODE_IMIN },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "INEG", TGSI_OPCODE_INEG },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "ISGE", TGSI_OPCODE_ISGE },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "ISHR", TGSI_OPCODE_ISHR },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "ISLT", TGSI_OPCODE_ISLT },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "F2U", TGSI_OPCODE_F2U },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "U2F", TGSI_OPCODE_U2F },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UADD", TGSI_OPCODE_UADD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UDIV", TGSI_OPCODE_UDIV },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "UMAD", TGSI_OPCODE_UMAD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UMAX", TGSI_OPCODE_UMAX },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UMIN", TGSI_OPCODE_UMIN },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UMOD", TGSI_OPCODE_UMOD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UMUL", TGSI_OPCODE_UMUL },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "USEQ", TGSI_OPCODE_USEQ },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "USGE", TGSI_OPCODE_USGE },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "USHR", TGSI_OPCODE_USHR },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "USLT", TGSI_OPCODE_USLT },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "USNE", TGSI_OPCODE_USNE },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "SWITCH", TGSI_OPCODE_SWITCH },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "CASE", TGSI_OPCODE_CASE },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "DEFAULT", TGSI_OPCODE_DEFAULT },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "ENDSWITCH", TGSI_OPCODE_ENDSWITCH },
 
-   { 1, 3, 0, 0, 0, 0, OTHR, "SAMPLE",      TGSI_OPCODE_SAMPLE },
-   { 1, 2, 0, 0, 0, 0, OTHR, "SAMPLE_I",    TGSI_OPCODE_SAMPLE_I },
-   { 1, 3, 0, 0, 0, 0, OTHR, "SAMPLE_I_MS", TGSI_OPCODE_SAMPLE_I_MS },
-   { 1, 4, 0, 0, 0, 0, OTHR, "SAMPLE_B",    TGSI_OPCODE_SAMPLE_B },
-   { 1, 4, 0, 0, 0, 0, OTHR, "SAMPLE_C",    TGSI_OPCODE_SAMPLE_C },
-   { 1, 4, 0, 0, 0, 0, OTHR, "SAMPLE_C_LZ", TGSI_OPCODE_SAMPLE_C_LZ },
-   { 1, 5, 0, 0, 0, 0, OTHR, "SAMPLE_D",    TGSI_OPCODE_SAMPLE_D },
-   { 1, 4, 0, 0, 0, 0, OTHR, "SAMPLE_L",    TGSI_OPCODE_SAMPLE_L },
-   { 1, 3, 0, 0, 0, 0, OTHR, "GATHER4",     TGSI_OPCODE_GATHER4 },
-   { 1, 2, 0, 0, 0, 0, OTHR, "SVIEWINFO",   TGSI_OPCODE_SVIEWINFO },
-   { 1, 2, 0, 0, 0, 0, OTHR, "SAMPLE_POS",  TGSI_OPCODE_SAMPLE_POS },
-   { 1, 2, 0, 0, 0, 0, OTHR, "SAMPLE_INFO", TGSI_OPCODE_SAMPLE_INFO },
-   { 1, 1, 0, 0, 0, 0, COMP, "UARL", TGSI_OPCODE_UARL },
-   { 1, 3, 0, 0, 0, 0, COMP, "UCMP", TGSI_OPCODE_UCMP },
-   { 1, 1, 0, 0, 0, 0, COMP, "IABS", TGSI_OPCODE_IABS },
-   { 1, 1, 0, 0, 0, 0, COMP, "ISSG", TGSI_OPCODE_ISSG },
-   { 1, 2, 0, 0, 0, 0, OTHR, "LOAD", TGSI_OPCODE_LOAD },
-   { 1, 2, 0, 0, 0, 0, OTHR, "STORE", TGSI_OPCODE_STORE },
-   { 1, 0, 0, 0, 0, 0, OTHR, "MFENCE", TGSI_OPCODE_MFENCE },
-   { 1, 0, 0, 0, 0, 0, OTHR, "LFENCE", TGSI_OPCODE_LFENCE },
-   { 1, 0, 0, 0, 0, 0, OTHR, "SFENCE", TGSI_OPCODE_SFENCE },
-   { 0, 0, 0, 0, 0, 0, OTHR, "BARRIER", TGSI_OPCODE_BARRIER },
+   { 1, 3, 0, 0, 0, 0, 0, OTHR, "SAMPLE",      TGSI_OPCODE_SAMPLE },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "SAMPLE_I",    TGSI_OPCODE_SAMPLE_I },
+   { 1, 3, 0, 0, 0, 0, 0, OTHR, "SAMPLE_I_MS", TGSI_OPCODE_SAMPLE_I_MS },
+   { 1, 4, 0, 0, 0, 0, 0, OTHR, "SAMPLE_B",    TGSI_OPCODE_SAMPLE_B },
+   { 1, 4, 0, 0, 0, 0, 0, OTHR, "SAMPLE_C",    TGSI_OPCODE_SAMPLE_C },
+   { 1, 4, 0, 0, 0, 0, 0, OTHR, "SAMPLE_C_LZ", TGSI_OPCODE_SAMPLE_C_LZ },
+   { 1, 5, 0, 0, 0, 0, 0, OTHR, "SAMPLE_D",    TGSI_OPCODE_SAMPLE_D },
+   { 1, 4, 0, 0, 0, 0, 0, OTHR, "SAMPLE_L",    TGSI_OPCODE_SAMPLE_L },
+   { 1, 3, 0, 0, 0, 0, 0, OTHR, "GATHER4",     TGSI_OPCODE_GATHER4 },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "SVIEWINFO",   TGSI_OPCODE_SVIEWINFO },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "SAMPLE_POS",  TGSI_OPCODE_SAMPLE_POS },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "SAMPLE_INFO", TGSI_OPCODE_SAMPLE_INFO },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "UARL", TGSI_OPCODE_UARL },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "UCMP", TGSI_OPCODE_UCMP },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "IABS", TGSI_OPCODE_IABS },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "ISSG", TGSI_OPCODE_ISSG },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "LOAD", TGSI_OPCODE_LOAD },
+   { 1, 2, 0, 1, 0, 0, 0, OTHR, "STORE", TGSI_OPCODE_STORE },
+   { 1, 0, 0, 0, 0, 0, 0, OTHR, "MFENCE", TGSI_OPCODE_MFENCE },
+   { 1, 0, 0, 0, 0, 0, 0, OTHR, "LFENCE", TGSI_OPCODE_LFENCE },
+   { 1, 0, 0, 0, 0, 0, 0, OTHR, "SFENCE", TGSI_OPCODE_SFENCE },
+   { 0, 0, 0, 0, 0, 0, 0, OTHR, "BARRIER", TGSI_OPCODE_BARRIER },
 
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMUADD", TGSI_OPCODE_ATOMUADD },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMXCHG", TGSI_OPCODE_ATOMXCHG },
-   { 1, 4, 0, 0, 0, 0, OTHR, "ATOMCAS", TGSI_OPCODE_ATOMCAS },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMAND", TGSI_OPCODE_ATOMAND },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMOR", TGSI_OPCODE_ATOMOR },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMXOR", TGSI_OPCODE_ATOMXOR },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMUMIN", TGSI_OPCODE_ATOMUMIN },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMUMAX", TGSI_OPCODE_ATOMUMAX },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMIMIN", TGSI_OPCODE_ATOMIMIN },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMIMAX", TGSI_OPCODE_ATOMIMAX },
-   { 1, 3, 1, 0, 0, 0, OTHR, "TEX2", TGSI_OPCODE_TEX2 },
-   { 1, 3, 1, 0, 0, 0, OTHR, "TXB2", TGSI_OPCODE_TXB2 },
-   { 1, 3, 1, 0, 0, 0, OTHR, "TXL2", TGSI_OPCODE_TXL2 },
-   { 1, 2, 0, 0, 0, 0, COMP, "IMUL_HI", TGSI_OPCODE_IMUL_HI },
-   { 1, 2, 0, 0, 0, 0, COMP, "UMUL_HI", TGSI_OPCODE_UMUL_HI },
-   { 1, 3, 1, 0, 0, 0, OTHR, "TG4", TGSI_OPCODE_TG4 },
-   { 1, 2, 1, 0, 0, 0, OTHR, "LODQ", TGSI_OPCODE_LODQ },
-   { 1, 3, 0, 0, 0, 0, COMP, "IBFE", TGSI_OPCODE_IBFE },
-   { 1, 3, 0, 0, 0, 0, COMP, "UBFE", TGSI_OPCODE_UBFE },
-   { 1, 4, 0, 0, 0, 0, COMP, "BFI", TGSI_OPCODE_BFI },
-   { 1, 1, 0, 0, 0, 0, COMP, "BREV", TGSI_OPCODE_BREV },
-   { 1, 1, 0, 0, 0, 0, COMP, "POPC", TGSI_OPCODE_POPC },
-   { 1, 1, 0, 0, 0, 0, COMP, "LSB", TGSI_OPCODE_LSB },
-   { 1, 1, 0, 0, 0, 0, COMP, "IMSB", TGSI_OPCODE_IMSB },
-   { 1, 1, 0, 0, 0, 0, COMP, "UMSB", TGSI_OPCODE_UMSB },
-   { 1, 1, 0, 0, 0, 0, OTHR, "INTERP_CENTROID", TGSI_OPCODE_INTERP_CENTROID },
-   { 1, 2, 0, 0, 0, 0, OTHR, "INTERP_SAMPLE", TGSI_OPCODE_INTERP_SAMPLE },
-   { 1, 2, 0, 0, 0, 0, OTHR, "INTERP_OFFSET", TGSI_OPCODE_INTERP_OFFSET },
-   { 1, 1, 0, 0, 0, 0, COMP, "F2D", TGSI_OPCODE_F2D },
-   { 1, 1, 0, 0, 0, 0, COMP, "D2F", TGSI_OPCODE_D2F },
-   { 1, 1, 0, 0, 0, 0, COMP, "DABS", TGSI_OPCODE_DABS },
-   { 1, 1, 0, 0, 0, 0, COMP, "DNEG", TGSI_OPCODE_DNEG },
-   { 1, 2, 0, 0, 0, 0, COMP, "DADD", TGSI_OPCODE_DADD },
-   { 1, 2, 0, 0, 0, 0, COMP, "DMUL", TGSI_OPCODE_DMUL },
-   { 1, 2, 0, 0, 0, 0, COMP, "DMAX", TGSI_OPCODE_DMAX },
-   { 1, 2, 0, 0, 0, 0, COMP, "DMIN", TGSI_OPCODE_DMIN },
-   { 1, 2, 0, 0, 0, 0, COMP, "DSLT", TGSI_OPCODE_DSLT },
-   { 1, 2, 0, 0, 0, 0, COMP, "DSGE", TGSI_OPCODE_DSGE },
-   { 1, 2, 0, 0, 0, 0, COMP, "DSEQ", TGSI_OPCODE_DSEQ },
-   { 1, 2, 0, 0, 0, 0, COMP, "DSNE", TGSI_OPCODE_DSNE },
-   { 1, 1, 0, 0, 0, 0, COMP, "DRCP", TGSI_OPCODE_DRCP },
-   { 1, 1, 0, 0 ,0, 0, COMP, "DSQRT", TGSI_OPCODE_DSQRT },
-   { 1, 3, 0, 0 ,0, 0, COMP, "DMAD", TGSI_OPCODE_DMAD },
-   { 1, 1, 0, 0, 0, 0, COMP, "DFRAC", TGSI_OPCODE_DFRAC},
-   { 1, 2, 0, 0, 0, 0, COMP, "DLDEXP", TGSI_OPCODE_DLDEXP},
-   { 2, 1, 0, 0, 0, 0, COMP, "DFRACEXP", TGSI_OPCODE_DFRACEXP},
-   { 1, 1, 0, 0, 0, 0, COMP, "D2I", TGSI_OPCODE_D2I },
-   { 1, 1, 0, 0, 0, 0, COMP, "I2D", TGSI_OPCODE_I2D },
-   { 1, 1, 0, 0, 0, 0, COMP, "D2U", TGSI_OPCODE_D2U },
-   { 1, 1, 0, 0, 0, 0, COMP, "U2D", TGSI_OPCODE_U2D },
-   { 1, 1, 0, 0 ,0, 0, COMP, "DRSQ", TGSI_OPCODE_DRSQ },
-   { 1, 1, 0, 0, 0, 0, COMP, "DTRUNC", TGSI_OPCODE_DTRUNC },
-   { 1, 1, 0, 0, 0, 0, COMP, "DCEIL", TGSI_OPCODE_DCEIL },
-   { 1, 1, 0, 0, 0, 0, COMP, "DFLR", TGSI_OPCODE_DFLR },
-   { 1, 1, 0, 0, 0, 0, COMP, "DROUND", TGSI_OPCODE_DROUND },
-   { 1, 1, 0, 0, 0, 0, COMP, "DSSG", TGSI_OPCODE_DSSG },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMUADD", TGSI_OPCODE_ATOMUADD },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMXCHG", TGSI_OPCODE_ATOMXCHG },
+   { 1, 4, 0, 1, 0, 0, 0, OTHR, "ATOMCAS", TGSI_OPCODE_ATOMCAS },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMAND", TGSI_OPCODE_ATOMAND },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMOR", TGSI_OPCODE_ATOMOR },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMXOR", TGSI_OPCODE_ATOMXOR },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMUMIN", TGSI_OPCODE_ATOMUMIN },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMUMAX", TGSI_OPCODE_ATOMUMAX },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMIMIN", TGSI_OPCODE_ATOMIMIN },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMIMAX", TGSI_OPCODE_ATOMIMAX },
+   { 1, 3, 1, 0, 0, 0, 0, OTHR, "TEX2", TGSI_OPCODE_TEX2 },
+   { 1, 3, 1, 0, 0, 0, 0, OTHR, "TXB2", TGSI_OPCODE_TXB2 },
+   { 1, 3, 1, 0, 0, 0, 0, OTHR, "TXL2", TGSI_OPCODE_TXL2 },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "IMUL_HI", TGSI_OPCODE_IMUL_HI },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UMUL_HI", TGSI_OPCODE_UMUL_HI },
+   { 1, 3, 1, 0, 0, 0, 0, OTHR, "TG4", TGSI_OPCODE_TG4 },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "LODQ", TGSI_OPCODE_LODQ },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "IBFE", TGSI_OPCODE_IBFE },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "UBFE", TGSI_OPCODE_UBFE },
+   { 1, 4, 0, 0, 0, 0, 0, COMP, "BFI", TGSI_OPCODE_BFI },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "BREV", TGSI_OPCODE_BREV },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "POPC", TGSI_OPCODE_POPC },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "LSB", TGSI_OPCODE_LSB },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "IMSB", TGSI_OPCODE_IMSB },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "UMSB", TGSI_OPCODE_UMSB },
+   { 1, 1, 0, 0, 0, 0, 0, OTHR, "INTERP_CENTROID", TGSI_OPCODE_INTERP_CENTROID },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "INTERP_SAMPLE", TGSI_OPCODE_INTERP_SAMPLE },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "INTERP_OFFSET", TGSI_OPCODE_INTERP_OFFSET },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "F2D", TGSI_OPCODE_F2D },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "D2F", TGSI_OPCODE_D2F },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DABS", TGSI_OPCODE_DABS },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DNEG", TGSI_OPCODE_DNEG },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DADD", TGSI_OPCODE_DADD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DMUL", TGSI_OPCODE_DMUL },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DMAX", TGSI_OPCODE_DMAX },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DMIN", TGSI_OPCODE_DMIN },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DSLT", TGSI_OPCODE_DSLT },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DSGE", TGSI_OPCODE_DSGE },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DSEQ", TGSI_OPCODE_DSEQ },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DSNE", TGSI_OPCODE_DSNE },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DRCP", TGSI_OPCODE_DRCP },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DSQRT", TGSI_OPCODE_DSQRT },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "DMAD", TGSI_OPCODE_DMAD },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DFRAC", TGSI_OPCODE_DFRAC},
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DLDEXP", TGSI_OPCODE_DLDEXP},
+   { 2, 1, 0, 0, 0, 0, 0, COMP, "DFRACEXP", TGSI_OPCODE_DFRACEXP},
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "D2I", TGSI_OPCODE_D2I },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "I2D", TGSI_OPCODE_I2D },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "D2U", TGSI_OPCODE_D2U },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "U2D", TGSI_OPCODE_U2D },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DRSQ", TGSI_OPCODE_DRSQ },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DTRUNC", TGSI_OPCODE_DTRUNC },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DCEIL", TGSI_OPCODE_DCEIL },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DFLR", TGSI_OPCODE_DFLR },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DROUND", TGSI_OPCODE_DROUND },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DSSG", TGSI_OPCODE_DSSG },
 };
 
 const struct tgsi_opcode_info *
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.h b/src/gallium/auxiliary/tgsi/tgsi_info.h
index aa7edd1e114..46f03cd393f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.h
@@ -74,6 +74,7 @@ struct tgsi_opcode_info
    unsigned num_dst:3;
    unsigned num_src:3;
    unsigned is_tex:1;
+   unsigned is_store:1;
    unsigned is_branch:1;
    int pre_dedent:2;
    int post_indent:2;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index 0729b5d2426..ae95ebd82a4 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -121,8 +121,8 @@ tgsi_parse_token(
          next_token( ctx, &decl->Semantic );
       }
 
-      if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
-         next_token(ctx, &decl->Resource);
+      if (decl->Declaration.File == TGSI_FILE_IMAGE) {
+         next_token(ctx, &decl->Image);
       }
 
       if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
@@ -195,6 +195,10 @@ tgsi_parse_token(
          }
       }
 
+      if (inst->Instruction.Memory) {
+         next_token(ctx, &inst->Memory);
+      }
+
       assert( inst->Instruction.NumDstRegs <= TGSI_FULL_MAX_DST_REGISTERS );
 
       for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h
index 35e1c7cfd62..4689fb797d0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h
@@ -64,7 +64,7 @@ struct tgsi_full_declaration
    struct tgsi_declaration_dimension Dim;
    struct tgsi_declaration_interp Interp;
    struct tgsi_declaration_semantic Semantic;
-   struct tgsi_declaration_resource Resource;
+   struct tgsi_declaration_image Image;
    struct tgsi_declaration_sampler_view SamplerView;
    struct tgsi_declaration_array Array;
 };
@@ -91,6 +91,7 @@ struct tgsi_full_instruction
    struct tgsi_instruction_predicate   Predicate;
    struct tgsi_instruction_label       Label;
    struct tgsi_instruction_texture     Texture;
+   struct tgsi_instruction_memory      Memory;
    struct tgsi_full_dst_register       Dst[TGSI_FULL_MAX_DST_REGISTERS];
    struct tgsi_full_src_register       Src[TGSI_FULL_MAX_SRC_REGISTERS];
    struct tgsi_texture_offset          TexOffsets[TGSI_FULL_MAX_TEX_OFFSETS];
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index e04f4076e9c..7a02e27e01e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -187,13 +187,28 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                   }
 
                   if (procType == TGSI_PROCESSOR_FRAGMENT &&
-                      info->reads_position &&
-                      src->Register.Index == 0 &&
-                      (src->Register.SwizzleX == TGSI_SWIZZLE_Z ||
-                       src->Register.SwizzleY == TGSI_SWIZZLE_Z ||
-                       src->Register.SwizzleZ == TGSI_SWIZZLE_Z ||
-                       src->Register.SwizzleW == TGSI_SWIZZLE_Z)) {
-                     info->reads_z = TRUE;
+                      !src->Register.Indirect) {
+                     unsigned name =
+                        info->input_semantic_name[src->Register.Index];
+                     unsigned index =
+                        info->input_semantic_index[src->Register.Index];
+
+                     if (name == TGSI_SEMANTIC_POSITION &&
+                         (src->Register.SwizzleX == TGSI_SWIZZLE_Z ||
+                          src->Register.SwizzleY == TGSI_SWIZZLE_Z ||
+                          src->Register.SwizzleZ == TGSI_SWIZZLE_Z ||
+                          src->Register.SwizzleW == TGSI_SWIZZLE_Z))
+                        info->reads_z = TRUE;
+
+                     if (name == TGSI_SEMANTIC_COLOR) {
+                        unsigned mask =
+                              (1 << src->Register.SwizzleX) |
+                              (1 << src->Register.SwizzleY) |
+                              (1 << src->Register.SwizzleZ) |
+                              (1 << src->Register.SwizzleW);
+
+                        info->colors_read |= mask << (index * 4);
+                     }
                   }
                }
 
@@ -358,7 +373,10 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                      info->uses_primid = TRUE;
                   } else if (semName == TGSI_SEMANTIC_INVOCATIONID) {
                      info->uses_invocationid = TRUE;
-                  }
+                  } else if (semName == TGSI_SEMANTIC_POSITION)
+                     info->reads_position = TRUE;
+                  else if (semName == TGSI_SEMANTIC_FACE)
+                     info->uses_frontface = TRUE;
                }
                else if (file == TGSI_FILE_OUTPUT) {
                   info->output_semantic_name[reg] = (ubyte) semName;
@@ -392,6 +410,8 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                      }
                      else if (semName == TGSI_SEMANTIC_STENCIL) {
                         info->writes_stencil = TRUE;
+                     } else if (semName == TGSI_SEMANTIC_SAMPLEMASK) {
+                        info->writes_samplemask = TRUE;
                      }
                   }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 7e9a5597db2..b0b423ab528 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -77,11 +77,13 @@ struct tgsi_shader_info
 
    uint opcode_count[TGSI_OPCODE_LAST];  /**< opcode histogram */
 
+   ubyte colors_read; /**< which color components are read by the FS */
    ubyte colors_written;
    boolean reads_position; /**< does fragment shader read position? */
    boolean reads_z; /**< does fragment shader read depth? */
    boolean writes_z;  /**< does fragment shader write Z value? */
    boolean writes_stencil; /**< does fragment shader write stencil value? */
+   boolean writes_samplemask; /**< does fragment shader write sample mask? */
    boolean writes_edgeflag; /**< vertex shader outputs edgeflag */
    boolean uses_kill;  /**< KILL or KILL_IF instruction used? */
    boolean uses_persp_center;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index fc29a2398aa..f2d70d49839 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -54,8 +54,9 @@ static const char *tgsi_file_names[] =
    "IMM",
    "PRED",
    "SV",
-   "RES",
-   "SVIEW"
+   "IMAGE",
+   "SVIEW",
+   "BUFFER",
 };
 
 const char *tgsi_semantic_names[TGSI_SEMANTIC_COUNT] =
@@ -96,6 +97,8 @@ const char *tgsi_semantic_names[TGSI_SEMANTIC_COUNT] =
    "TESSINNER",
    "VERTICESIN",
    "HELPER_INVOCATION",
+   "BASEINSTANCE",
+   "DRAWID",
 };
 
 const char *tgsi_texture_names[TGSI_TEXTURE_COUNT] =
@@ -205,6 +208,13 @@ const char *tgsi_immediate_type_names[4] =
    "FLT64"
 };
 
+const char *tgsi_memory_names[3] =
+{
+   "COHERENT",
+   "RESTRICT",
+   "VOLATILE",
+};
+
 
 static inline void
 tgsi_strings_check(void)
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.h b/src/gallium/auxiliary/tgsi/tgsi_strings.h
index 71e74372f22..031d32278cc 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.h
@@ -60,6 +60,8 @@ extern const char *tgsi_fs_coord_pixel_center_names[2];
 
 extern const char *tgsi_immediate_type_names[4];
 
+extern const char *tgsi_memory_names[3];
+
 
 const char *
 tgsi_file_name(unsigned file);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index 4a82c9b3552..97b1869a66f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -1039,6 +1039,12 @@ parse_instruction(
       inst.Texture.Texture = TGSI_TEXTURE_UNKNOWN;
    }
 
+   if ((i >= TGSI_OPCODE_LOAD && i <= TGSI_OPCODE_ATOMIMAX) ||
+       i == TGSI_OPCODE_RESQ) {
+      inst.Instruction.Memory = 1;
+      inst.Memory.Qualifier = 0;
+   }
+
    /* Parse instruction operands.
     */
    for (i = 0; i < info->num_dst + info->num_src + info->is_tex; i++) {
@@ -1091,6 +1097,27 @@ parse_instruction(
    inst.Texture.NumOffsets = i;
 
    cur = ctx->cur;
+   eat_opt_white(&cur);
+   for (i = 0; inst.Instruction.Memory && *cur == ','; i++) {
+      uint j;
+      cur++;
+      eat_opt_white(&cur);
+      ctx->cur = cur;
+      for (j = 0; j < 3; j++) {
+         if (str_match_nocase_whole(&ctx->cur, tgsi_memory_names[j])) {
+            inst.Memory.Qualifier |= 1U << j;
+            break;
+         }
+      }
+      if (j == 3) {
+         report_error(ctx, "Expected memory qualifier");
+         return FALSE;
+      }
+      cur = ctx->cur;
+      eat_opt_white(&cur);
+   }
+
+   cur = ctx->cur;
    eat_opt_white( &cur );
    if (info->is_branch && *cur == ':') {
       uint target;
@@ -1251,10 +1278,10 @@ static boolean parse_declaration( struct translate_ctx *ctx )
 
       cur++;
       eat_opt_white( &cur );
-      if (file == TGSI_FILE_RESOURCE) {
+      if (file == TGSI_FILE_IMAGE) {
          for (i = 0; i < TGSI_TEXTURE_COUNT; i++) {
             if (str_match_nocase_whole(&cur, tgsi_texture_names[i])) {
-               decl.Resource.Resource = i;
+               decl.Image.Resource = i;
                break;
             }
          }
@@ -1263,16 +1290,18 @@ static boolean parse_declaration( struct translate_ctx *ctx )
             return FALSE;
          }
 
+         /* XXX format */
+
          cur2 = cur;
          eat_opt_white(&cur2);
          while (*cur2 == ',') {
             cur2++;
             eat_opt_white(&cur2);
             if (str_match_nocase_whole(&cur2, "RAW")) {
-               decl.Resource.Raw = 1;
+               decl.Image.Raw = 1;
 
             } else if (str_match_nocase_whole(&cur2, "WR")) {
-               decl.Resource.Writable = 1;
+               decl.Image.Writable = 1;
 
             } else {
                break;
@@ -1348,6 +1377,11 @@ static boolean parse_declaration( struct translate_ctx *ctx )
                decl.SamplerView.ReturnTypeX;
          }
          ctx->cur = cur;
+      } else if (file == TGSI_FILE_BUFFER) {
+         if (str_match_nocase_whole(&cur, "ATOMIC")) {
+            decl.Declaration.Atomic = 1;
+            ctx->cur = cur;
+         }
       } else {
          if (str_match_nocase_whole(&cur, "LOCAL")) {
             decl.Declaration.Local = 1;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 4aaf8dfe6d8..d6811501d16 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -50,6 +50,7 @@ union tgsi_any_token {
    struct tgsi_declaration_range decl_range;
    struct tgsi_declaration_dimension decl_dim;
    struct tgsi_declaration_interp decl_interp;
+   struct tgsi_declaration_image decl_image;
    struct tgsi_declaration_semantic decl_semantic;
    struct tgsi_declaration_sampler_view decl_sampler_view;
    struct tgsi_declaration_array array;
@@ -59,6 +60,7 @@ union tgsi_any_token {
    struct tgsi_instruction_predicate insn_predicate;
    struct tgsi_instruction_label insn_label;
    struct tgsi_instruction_texture insn_texture;
+   struct tgsi_instruction_memory insn_memory;
    struct tgsi_texture_offset insn_texture_offset;
    struct tgsi_src_register src;
    struct tgsi_ind_register ind;
@@ -115,7 +117,6 @@ struct ureg_program
    unsigned vs_inputs[PIPE_MAX_ATTRIBS/32];
 
    struct {
-      unsigned index;
       unsigned semantic_name;
       unsigned semantic_index;
    } system_value[UREG_MAX_SYSTEM_VALUE];
@@ -155,6 +156,21 @@ struct ureg_program
    } sampler_view[PIPE_MAX_SHADER_SAMPLER_VIEWS];
    unsigned nr_sampler_views;
 
+   struct {
+      unsigned index;
+      unsigned target;
+      unsigned format;
+      boolean wr;
+      boolean raw;
+   } image[PIPE_MAX_SHADER_IMAGES];
+   unsigned nr_images;
+
+   struct {
+      unsigned index;
+      bool atomic;
+   } buffer[PIPE_MAX_SHADER_BUFFERS];
+   unsigned nr_buffers;
+
    struct util_bitmask *free_temps;
    struct util_bitmask *local_temps;
    struct util_bitmask *decl_temps;
@@ -320,20 +336,29 @@ ureg_DECL_input(struct ureg_program *ureg,
 
 struct ureg_src
 ureg_DECL_system_value(struct ureg_program *ureg,
-                       unsigned index,
                        unsigned semantic_name,
                        unsigned semantic_index)
 {
+   unsigned i;
+
+   for (i = 0; i < ureg->nr_system_values; i++) {
+      if (ureg->system_value[i].semantic_name == semantic_name &&
+          ureg->system_value[i].semantic_index == semantic_index) {
+         goto out;
+      }
+   }
+
    if (ureg->nr_system_values < UREG_MAX_SYSTEM_VALUE) {
-      ureg->system_value[ureg->nr_system_values].index = index;
       ureg->system_value[ureg->nr_system_values].semantic_name = semantic_name;
       ureg->system_value[ureg->nr_system_values].semantic_index = semantic_index;
+      i = ureg->nr_system_values;
       ureg->nr_system_values++;
    } else {
       set_bad(ureg);
    }
 
-   return ureg_src_register(TGSI_FILE_SYSTEM_VALUE, index);
+out:
+   return ureg_src_register(TGSI_FILE_SYSTEM_VALUE, i);
 }
 
 
@@ -648,6 +673,60 @@ ureg_DECL_sampler_view(struct ureg_program *ureg,
    return reg;
 }
 
+/* Allocate a new image.
+ */
+struct ureg_src
+ureg_DECL_image(struct ureg_program *ureg,
+                unsigned index,
+                unsigned target,
+                unsigned format,
+                boolean wr,
+                boolean raw)
+{
+   struct ureg_src reg = ureg_src_register(TGSI_FILE_IMAGE, index);
+   unsigned i;
+
+   for (i = 0; i < ureg->nr_images; i++)
+      if (ureg->image[i].index == index)
+         return reg;
+
+   if (i < PIPE_MAX_SHADER_IMAGES) {
+      ureg->image[i].index = index;
+      ureg->image[i].target = target;
+      ureg->image[i].wr = wr;
+      ureg->image[i].raw = raw;
+      ureg->image[i].format = format;
+      ureg->nr_images++;
+      return reg;
+   }
+
+   assert(0);
+   return reg;
+}
+
+/* Allocate a new buffer.
+ */
+struct ureg_src ureg_DECL_buffer(struct ureg_program *ureg, unsigned nr,
+                                 bool atomic)
+{
+   struct ureg_src reg = ureg_src_register(TGSI_FILE_BUFFER, nr);
+   unsigned i;
+
+   for (i = 0; i < ureg->nr_buffers; i++)
+      if (ureg->buffer[i].index == nr)
+         return reg;
+
+   if (i < PIPE_MAX_SHADER_BUFFERS) {
+      ureg->buffer[i].index = nr;
+      ureg->buffer[i].atomic = atomic;
+      ureg->nr_buffers++;
+      return reg;
+   }
+
+   assert(0);
+   return reg;
+}
+
 static int
 match_or_expand_immediate64( const unsigned *v,
                              int type,
@@ -1148,6 +1227,21 @@ ureg_emit_texture_offset(struct ureg_program *ureg,
    
 }
 
+void
+ureg_emit_memory(struct ureg_program *ureg,
+                 unsigned extended_token,
+                 unsigned qualifier)
+{
+   union tgsi_any_token *out, *insn;
+
+   out = get_tokens( ureg, DOMAIN_INSN, 1 );
+   insn = retrieve_token( ureg, DOMAIN_INSN, extended_token );
+
+   insn->insn.Memory = 1;
+
+   out[0].value = 0;
+   out[0].insn_memory.Qualifier = qualifier;
+}
 
 void
 ureg_fixup_insn_size(struct ureg_program *ureg,
@@ -1300,6 +1394,42 @@ ureg_label_insn(struct ureg_program *ureg,
 }
 
 
+void
+ureg_memory_insn(struct ureg_program *ureg,
+                 unsigned opcode,
+                 const struct ureg_dst *dst,
+                 unsigned nr_dst,
+                 const struct ureg_src *src,
+                 unsigned nr_src,
+                 unsigned qualifier)
+{
+   struct ureg_emit_insn_result insn;
+   unsigned i;
+
+   insn = ureg_emit_insn(ureg,
+                         opcode,
+                         FALSE,
+                         FALSE,
+                         FALSE,
+                         TGSI_SWIZZLE_X,
+                         TGSI_SWIZZLE_Y,
+                         TGSI_SWIZZLE_Z,
+                         TGSI_SWIZZLE_W,
+                         nr_dst,
+                         nr_src);
+
+   ureg_emit_memory(ureg, insn.extended_token, qualifier);
+
+   for (i = 0; i < nr_dst; i++)
+      ureg_emit_dst(ureg, dst[i]);
+
+   for (i = 0; i < nr_src; i++)
+      ureg_emit_src(ureg, src[i]);
+
+   ureg_fixup_insn_size(ureg, insn.insn_token);
+}
+
+
 static void
 emit_decl_semantic(struct ureg_program *ureg,
                    unsigned file,
@@ -1478,6 +1608,52 @@ emit_decl_sampler_view(struct ureg_program *ureg,
 }
 
 static void
+emit_decl_image(struct ureg_program *ureg,
+                unsigned index,
+                unsigned target,
+                unsigned format,
+                boolean wr,
+                boolean raw)
+{
+   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 3);
+
+   out[0].value = 0;
+   out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
+   out[0].decl.NrTokens = 3;
+   out[0].decl.File = TGSI_FILE_IMAGE;
+   out[0].decl.UsageMask = 0xf;
+
+   out[1].value = 0;
+   out[1].decl_range.First = index;
+   out[1].decl_range.Last = index;
+
+   out[2].value = 0;
+   out[2].decl_image.Resource = target;
+   out[2].decl_image.Writable = wr;
+   out[2].decl_image.Raw      = raw;
+   out[2].decl_image.Format   = format;
+}
+
+static void
+emit_decl_buffer(struct ureg_program *ureg,
+                 unsigned index,
+                 bool atomic)
+{
+   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 2);
+
+   out[0].value = 0;
+   out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
+   out[0].decl.NrTokens = 2;
+   out[0].decl.File = TGSI_FILE_BUFFER;
+   out[0].decl.UsageMask = 0xf;
+   out[0].decl.Atomic = atomic;
+
+   out[1].value = 0;
+   out[1].decl_range.First = index;
+   out[1].decl_range.Last = index;
+}
+
+static void
 emit_immediate( struct ureg_program *ureg,
                 const unsigned *v,
                 unsigned type )
@@ -1587,8 +1763,8 @@ static void emit_decls( struct ureg_program *ureg )
    for (i = 0; i < ureg->nr_system_values; i++) {
       emit_decl_semantic(ureg,
                          TGSI_FILE_SYSTEM_VALUE,
-                         ureg->system_value[i].index,
-                         ureg->system_value[i].index,
+                         i,
+                         i,
                          ureg->system_value[i].semantic_name,
                          ureg->system_value[i].semantic_index,
                          TGSI_WRITEMASK_XYZW, 0);
@@ -1636,6 +1812,19 @@ static void emit_decls( struct ureg_program *ureg )
                              ureg->sampler_view[i].return_type_w);
    }
 
+   for (i = 0; i < ureg->nr_images; i++) {
+      emit_decl_image(ureg,
+                      ureg->image[i].index,
+                      ureg->image[i].target,
+                      ureg->image[i].format,
+                      ureg->image[i].wr,
+                      ureg->image[i].raw);
+   }
+
+   for (i = 0; i < ureg->nr_buffers; i++) {
+      emit_decl_buffer(ureg, ureg->buffer[i].index, ureg->buffer[i].atomic);
+   }
+
    if (ureg->const_decls.nr_constant_ranges) {
       for (i = 0; i < ureg->const_decls.nr_constant_ranges; i++) {
          emit_decl_range(ureg,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 0aae550d60a..86e58a91343 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -221,7 +221,6 @@ ureg_DECL_input(struct ureg_program *,
 
 struct ureg_src
 ureg_DECL_system_value(struct ureg_program *,
-                       unsigned index,
                        unsigned semantic_name,
                        unsigned semantic_index);
 
@@ -327,6 +326,16 @@ ureg_DECL_sampler_view(struct ureg_program *,
                        unsigned return_type_z,
                        unsigned return_type_w );
 
+struct ureg_src
+ureg_DECL_image(struct ureg_program *ureg,
+                unsigned index,
+                unsigned target,
+                unsigned format,
+                boolean wr,
+                boolean raw);
+
+struct ureg_src
+ureg_DECL_buffer(struct ureg_program *ureg, unsigned nr, bool atomic);
 
 static inline struct ureg_src
 ureg_imm4f( struct ureg_program *ureg,
@@ -522,6 +531,14 @@ ureg_label_insn(struct ureg_program *ureg,
                 unsigned nr_src,
                 unsigned *label);
 
+void
+ureg_memory_insn(struct ureg_program *ureg,
+                 unsigned opcode,
+                 const struct ureg_dst *dst,
+                 unsigned nr_dst,
+                 const struct ureg_src *src,
+                 unsigned nr_src,
+                 unsigned qualifier);
 
 /***********************************************************************
  * Internal instruction helpers, don't call these directly:
@@ -559,6 +576,11 @@ void
 ureg_emit_texture_offset(struct ureg_program *ureg,
                          const struct tgsi_texture_offset *offset);
 
+void
+ureg_emit_memory(struct ureg_program *ureg,
+                 unsigned insn_token,
+                 unsigned qualifier);
+
 void 
 ureg_emit_dst( struct ureg_program *ureg,
                struct ureg_dst dst );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c
index 653e650dc4c..5fff3f0787f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
@@ -29,6 +29,7 @@
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_parse.h"
 #include "tgsi_util.h"
+#include "tgsi_exec.h"
 
 union pointer_hack
 {
@@ -53,17 +54,17 @@ tgsi_util_get_src_register_swizzle(
    const struct tgsi_src_register *reg,
    unsigned component )
 {
-   switch( component ) {
-   case 0:
+   switch (component) {
+   case TGSI_CHAN_X:
       return reg->SwizzleX;
-   case 1:
+   case TGSI_CHAN_Y:
       return reg->SwizzleY;
-   case 2:
+   case TGSI_CHAN_Z:
       return reg->SwizzleZ;
-   case 3:
+   case TGSI_CHAN_W:
       return reg->SwizzleW;
    default:
-      assert( 0 );
+      assert(0);
    }
    return 0;
 }
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index 05b4567130e..43fbd8e6452 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -320,7 +320,8 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    for (i = 0; i < 4; i++)
       ctx->vertices[i][0][3] = 1; /*v.w*/
 
-   ctx->upload = u_upload_create(pipe, 65536, 4, PIPE_BIND_VERTEX_BUFFER);
+   ctx->upload = u_upload_create(pipe, 65536, PIPE_BIND_VERTEX_BUFFER,
+                                 PIPE_USAGE_STREAM);
 
    return &ctx->base;
 }
@@ -1191,7 +1192,7 @@ static void blitter_draw(struct blitter_context_priv *ctx,
 
    vb.stride = 8 * sizeof(float);
 
-   u_upload_data(ctx->upload, 0, sizeof(ctx->vertices), ctx->vertices,
+   u_upload_data(ctx->upload, 0, sizeof(ctx->vertices), 4, ctx->vertices,
                  &vb.buffer_offset, &vb.buffer);
    if (!vb.buffer)
       return;
@@ -2111,7 +2112,7 @@ void util_blitter_clear_buffer(struct blitter_context *blitter,
       return;
    }
 
-   u_upload_data(ctx->upload, 0, num_channels*4, clear_value,
+   u_upload_data(ctx->upload, 0, num_channels*4, 4, clear_value,
                  &vb.buffer_offset, &vb.buffer);
    if (!vb.buffer)
       goto out;
diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index cb162d89a58..2b605594a2e 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -727,6 +727,65 @@ error1:
    ;
 }
 
+void
+debug_dump_ubyte_rgba_bmp(const char *filename,
+                          unsigned width, unsigned height,
+                          const ubyte *rgba, unsigned stride)
+{
+   FILE *stream;
+   struct bmp_file_header bmfh;
+   struct bmp_info_header bmih;
+   unsigned x, y;
+
+   assert(rgba);
+   if(!rgba)
+      goto error1;
+
+   bmfh.bfType = 0x4d42;
+   bmfh.bfSize = 14 + 40 + height*width*4;
+   bmfh.bfReserved1 = 0;
+   bmfh.bfReserved2 = 0;
+   bmfh.bfOffBits = 14 + 40;
+
+   bmih.biSize = 40;
+   bmih.biWidth = width;
+   bmih.biHeight = height;
+   bmih.biPlanes = 1;
+   bmih.biBitCount = 32;
+   bmih.biCompression = 0;
+   bmih.biSizeImage = height*width*4;
+   bmih.biXPelsPerMeter = 0;
+   bmih.biYPelsPerMeter = 0;
+   bmih.biClrUsed = 0;
+   bmih.biClrImportant = 0;
+
+   stream = fopen(filename, "wb");
+   assert(stream);
+   if(!stream)
+      goto error1;
+
+   fwrite(&bmfh, 14, 1, stream);
+   fwrite(&bmih, 40, 1, stream);
+
+   y = height;
+   while(y--) {
+      const ubyte *ptr = rgba + (stride * y * 4);
+      for(x = 0; x < width; ++x)
+      {
+         struct bmp_rgb_quad pixel;
+         pixel.rgbRed   = ptr[x*4 + 0];
+         pixel.rgbGreen = ptr[x*4 + 1];
+         pixel.rgbBlue  = ptr[x*4 + 2];
+         pixel.rgbAlpha = ptr[x*4 + 3];
+         fwrite(&pixel, 1, 4, stream);
+      }
+   }
+
+   fclose(stream);
+error1:
+   ;
+}
+
 
 /**
  * Print PIPE_TRANSFER_x flags with a message.
diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h
index 34668f844e9..671bd37a085 100644
--- a/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@ -490,12 +490,16 @@ void debug_dump_transfer_bmp(struct pipe_context *pipe,
 void debug_dump_float_rgba_bmp(const char *filename,
                                unsigned width, unsigned height,
                                float *rgba, unsigned stride);
+void debug_dump_ubyte_rgba_bmp(const char *filename,
+                               unsigned width, unsigned height,
+                               const ubyte *rgba, unsigned stride);
 #else
 #define debug_dump_image(prefix, format, cpp, width, height, stride, data) ((void)0)
 #define debug_dump_surface(pipe, prefix, surface) ((void)0)
 #define debug_dump_surface_bmp(pipe, filename, surface) ((void)0)
 #define debug_dump_transfer_bmp(filename, transfer, ptr) ((void)0)
 #define debug_dump_float_rgba_bmp(filename, width, height, rgba, stride) ((void)0)
+#define debug_dump_ubyte_rgba_bmp(filename, width, height, rgba, stride) ((void)0)
 #endif
 
 
diff --git a/src/gallium/auxiliary/util/u_pstipple.c b/src/gallium/auxiliary/util/u_pstipple.c
index 0bb46ff8dd1..08dec13846d 100644
--- a/src/gallium/auxiliary/util/u_pstipple.c
+++ b/src/gallium/auxiliary/util/u_pstipple.c
@@ -177,6 +177,7 @@ struct pstip_transform_context {
    struct tgsi_shader_info info;
    uint tempsUsed;  /**< bitmask */
    int wincoordInput;
+   unsigned wincoordFile;
    int maxInput;
    uint samplersUsed;  /**< bitfield of samplers used */
    int freeSampler;  /** an available sampler for the pstipple */
@@ -206,7 +207,7 @@ pstip_transform_decl(struct tgsi_transform_context *ctx,
          pctx->samplersUsed |= 1 << i;
       }
    }
-   else if (decl->Declaration.File == TGSI_FILE_INPUT) {
+   else if (decl->Declaration.File == pctx->wincoordFile) {
       pctx->maxInput = MAX2(pctx->maxInput, (int) decl->Range.Last);
       if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION)
          pctx->wincoordInput = (int) decl->Range.First;
@@ -275,10 +276,22 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
       wincoordInput = pctx->wincoordInput;
 
    if (pctx->wincoordInput < 0) {
+      struct tgsi_full_declaration decl;
+
+      decl = tgsi_default_full_declaration();
       /* declare new position input reg */
-      tgsi_transform_input_decl(ctx, wincoordInput,
-                                TGSI_SEMANTIC_POSITION, 1,
-                                TGSI_INTERPOLATE_LINEAR);
+      decl.Declaration.File = pctx->wincoordFile;
+      decl.Declaration.Semantic = 1;
+      decl.Semantic.Name = TGSI_SEMANTIC_POSITION;
+      decl.Range.First =
+      decl.Range.Last = wincoordInput;
+
+      if (pctx->wincoordFile == TGSI_FILE_INPUT) {
+         decl.Declaration.Interpolate = 1;
+         decl.Interp.Interpolate = TGSI_INTERPOLATE_LINEAR;
+      }
+
+      ctx->emit_declaration(ctx, &decl);
    }
 
    sampIdx = pctx->hasFixedUnit ? pctx->fixedUnit : pctx->freeSampler;
@@ -327,7 +340,7 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
    tgsi_transform_op2_inst(ctx, TGSI_OPCODE_MUL,
                            TGSI_FILE_TEMPORARY, texTemp,
                            TGSI_WRITEMASK_XYZW,
-                           TGSI_FILE_INPUT, wincoordInput,
+                           pctx->wincoordFile, wincoordInput,
                            TGSI_FILE_IMMEDIATE, pctx->numImmed);
 
    /* TEX texTemp, texTemp, sampler; */
@@ -351,11 +364,15 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
  *                        will be used to sample the stipple texture;
  *                        if NULL, the fixed unit is used
  * \param fixedUnit       fixed texture unit used for the stipple texture
+ * \param wincoordFile    TGSI_FILE_INPUT or TGSI_FILE_SYSTEM_VALUE,
+ *                        depending on which one is supported by the driver
+ *                        for TGSI_SEMANTIC_POSITION in the fragment shader
  */
 struct tgsi_token *
 util_pstipple_create_fragment_shader(const struct tgsi_token *tokens,
                                      unsigned *samplerUnitOut,
-                                     unsigned fixedUnit)
+                                     unsigned fixedUnit,
+                                     unsigned wincoordFile)
 {
    struct pstip_transform_context transform;
    const uint newLen = tgsi_num_tokens(tokens) + NUM_NEW_TOKENS;
@@ -370,6 +387,7 @@ util_pstipple_create_fragment_shader(const struct tgsi_token *tokens,
     */
    memset(&transform, 0, sizeof(transform));
    transform.wincoordInput = -1;
+   transform.wincoordFile = wincoordFile;
    transform.maxInput = -1;
    transform.coordOrigin = TGSI_FS_COORD_ORIGIN_UPPER_LEFT;
    transform.hasFixedUnit = !samplerUnitOut;
diff --git a/src/gallium/auxiliary/util/u_pstipple.h b/src/gallium/auxiliary/util/u_pstipple.h
index 249c58be95f..ef8396f4318 100644
--- a/src/gallium/auxiliary/util/u_pstipple.h
+++ b/src/gallium/auxiliary/util/u_pstipple.h
@@ -50,7 +50,8 @@ util_pstipple_create_sampler(struct pipe_context *pipe);
 struct tgsi_token *
 util_pstipple_create_fragment_shader(const struct tgsi_token *tokens,
                                      unsigned *samplerUnitOut,
-                                     unsigned fixed_unit);
+                                     unsigned fixed_unit,
+                                     unsigned wincoordFile);
 
 
 #endif
diff --git a/src/gallium/auxiliary/util/u_pwr8.h b/src/gallium/auxiliary/util/u_pwr8.h
new file mode 100644
index 00000000000..1eca6d6df2c
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_pwr8.h
@@ -0,0 +1,310 @@
+/*
+ * Copyright 2015 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Oded Gabbay <[email protected]>
+ */
+
+/**
+ * @file
+ * POWER8 intrinsics portability header.
+ *
+ */
+
+#ifndef U_PWR8_H_
+#define U_PWR8_H_
+
+#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+
+#define VECTOR_ALIGN_16 __attribute__ ((__aligned__ (16)))
+
+typedef VECTOR_ALIGN_16 vector unsigned char __m128i;
+
+typedef VECTOR_ALIGN_16 union m128i {
+   __m128i m128i;
+   vector signed int m128si;
+   vector unsigned int m128ui;
+   ubyte ub[16];
+   ushort us[8];
+   int i[4];
+   uint ui[4];
+} __m128i_union;
+
+static inline __m128i
+vec_set_epi32 (int i3, int i2, int i1, int i0)
+{
+   __m128i_union vdst;
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   vdst.i[0] = i0;
+   vdst.i[1] = i1;
+   vdst.i[2] = i2;
+   vdst.i[3] = i3;
+#else
+   vdst.i[3] = i0;
+   vdst.i[2] = i1;
+   vdst.i[1] = i2;
+   vdst.i[0] = i3;
+#endif
+
+   return (__m128i) vdst.m128si;
+}
+
+static inline __m128i
+vec_setr_epi32 (int i0, int i1, int i2, int i3)
+{
+  return vec_set_epi32 (i3, i2, i1, i0);
+}
+
+static inline __m128i
+vec_unpacklo_epi32 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      { 0,  1,  2,  3, 16, 17, 18, 19,  4,  5,  6,  7, 20, 21, 22, 23};
+#else
+      {24, 25, 26, 27,  8,  9, 10, 11, 28, 29, 30, 31, 12, 13, 14, 15};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_unpackhi_epi32 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      { 8,  9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
+#else
+      {16, 17, 18, 19,  0,  1,  2,  3, 20, 21, 22, 23,  4,  5,  6,  7};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_unpacklo_epi64 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      { 0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23};
+#else
+      {24, 25, 26, 27, 28, 29, 30, 31,  8,  9, 10, 11, 12, 13, 14, 15};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_unpackhi_epi64 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      { 8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+#else
+      {16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_add_epi32 (__m128i a, __m128i b)
+{
+   return (__m128i) vec_add ((vector signed int) a, (vector signed int) b);
+}
+
+static inline __m128i
+vec_sub_epi32 (__m128i a, __m128i b)
+{
+   return (__m128i) vec_sub ((vector signed int) a, (vector signed int) b);
+}
+
+/* Call this function ONLY on POWER8 and newer platforms */
+static inline __m128i
+vec_mullo_epi32 (__m128i a, __m128i b)
+{
+   __m128i v;
+
+   __asm__(
+           "vmuluwm %0, %1, %2   \n"
+           : "=v" (v)
+           : "v" (a), "v" (b)
+           );
+
+   return v;
+}
+
+static inline void
+transpose4_epi32(const __m128i * restrict a,
+                 const __m128i * restrict b,
+                 const __m128i * restrict c,
+                 const __m128i * restrict d,
+                 __m128i * restrict o,
+                 __m128i * restrict p,
+                 __m128i * restrict q,
+                 __m128i * restrict r)
+{
+   __m128i t0 = vec_unpacklo_epi32(*a, *b);
+   __m128i t1 = vec_unpacklo_epi32(*c, *d);
+   __m128i t2 = vec_unpackhi_epi32(*a, *b);
+   __m128i t3 = vec_unpackhi_epi32(*c, *d);
+
+   *o = vec_unpacklo_epi64(t0, t1);
+   *p = vec_unpackhi_epi64(t0, t1);
+   *q = vec_unpacklo_epi64(t2, t3);
+   *r = vec_unpackhi_epi64(t2, t3);
+}
+
+static inline __m128i
+vec_slli_epi32 (__m128i vsrc, unsigned int count)
+{
+   __m128i_union vec_count;
+
+   if (count >= 32)
+      return (__m128i) vec_splats (0);
+   else if (count == 0)
+      return vsrc;
+
+   /* In VMX, all shift count fields must contain the same value */
+   vec_count.m128si = (vector signed int) vec_splats (count);
+   return (__m128i) vec_sl ((vector signed int) vsrc, vec_count.m128ui);
+}
+
+static inline __m128i
+vec_srli_epi32 (__m128i vsrc, unsigned int count)
+{
+   __m128i_union vec_count;
+
+   if (count >= 32)
+      return (__m128i) vec_splats (0);
+   else if (count == 0)
+      return vsrc;
+
+   /* In VMX, all shift count fields must contain the same value */
+   vec_count.m128si = (vector signed int) vec_splats (count);
+   return (__m128i) vec_sr ((vector signed int) vsrc, vec_count.m128ui);
+}
+
+static inline __m128i
+vec_srai_epi32 (__m128i vsrc, unsigned int count)
+{
+   __m128i_union vec_count;
+
+   if (count >= 32)
+      return (__m128i) vec_splats (0);
+   else if (count == 0)
+      return vsrc;
+
+   /* In VMX, all shift count fields must contain the same value */
+   vec_count.m128si = (vector signed int) vec_splats (count);
+   return (__m128i) vec_sra ((vector signed int) vsrc, vec_count.m128ui);
+}
+
+static inline __m128i
+vec_cmpeq_epi32 (__m128i a, __m128i b)
+{
+   return (__m128i) vec_cmpeq ((vector signed int) a, (vector signed int) b);
+}
+
+static inline __m128i
+vec_loadu_si128 (const uint32_t* src)
+{
+   __m128i_union vsrc;
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+
+   vsrc.m128ui = *((vector unsigned int *) src);
+
+#else
+
+   __m128i vmask, tmp1, tmp2;
+
+   vmask = vec_lvsl(0, src);
+
+   tmp1 = (__m128i) vec_ld (0, src);
+   tmp2 = (__m128i) vec_ld (15, src);
+   vsrc.m128ui = (vector unsigned int) vec_perm (tmp1, tmp2, vmask);
+
+#endif
+
+   return vsrc.m128i;
+}
+
+static inline __m128i
+vec_load_si128 (const uint32_t* src)
+{
+   __m128i_union vsrc;
+
+   vsrc.m128ui = *((vector unsigned int *) src);
+
+   return vsrc.m128i;
+}
+
+static inline void
+vec_store_si128 (uint32_t* dest, __m128i vdata)
+{
+   vec_st ((vector unsigned int) vdata, 0, dest);
+}
+
+/* Call this function ONLY on POWER8 and newer platforms */
+static inline int
+vec_movemask_epi8 (__m128i vsrc)
+{
+   __m128i_union vtemp;
+   int result;
+
+   vtemp.m128i = vec_vgbbd(vsrc);
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   result = vtemp.ub[15] << 8 | vtemp.ub[7];
+#else
+   result = vtemp.ub[0] << 8 | vtemp.ub[8];
+#endif
+
+   return result;
+}
+
+static inline __m128i
+vec_packs_epi16 (__m128i a, __m128i b)
+{
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   return (__m128i) vec_packs ((vector signed short) a,
+                               (vector signed short) b);
+#else
+   return (__m128i) vec_packs ((vector signed short) b,
+                               (vector signed short) a);
+#endif
+}
+
+static inline __m128i
+vec_packs_epi32 (__m128i a, __m128i b)
+{
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   return (__m128i) vec_packs ((vector signed int) a, (vector signed int) b);
+#else
+   return (__m128i) vec_packs ((vector signed int) b, (vector signed int) a);
+#endif
+}
+
+#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */
+
+#endif /* U_PWR8_H_ */
diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
index 7f8e5a1a3cf..cae4138ba01 100644
--- a/src/gallium/auxiliary/util/u_sse.h
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -166,14 +166,49 @@ _mm_shuffle_epi8(__m128i a, __m128i mask)
 #endif /* !PIPE_ARCH_SSSE3 */
 
 
+/*
+ * Provide an SSE implementation of _mm_mul_epi32() in terms of
+ * _mm_mul_epu32().
+ *
+ * Basically, albeit surprising at first (and second, and third...) look
+ * if a * b is done signed instead of unsigned, can just
+ * subtract b from the high bits of the result if a is negative
+ * (and the same for a if b is negative). Modular arithmetic at its best!
+ *
+ * So for int32 a,b in crude pseudo-code ("*" here denoting a widening mul)
+ * fixupb = (signmask(b) & a) << 32ULL
+ * fixupa = (signmask(a) & b) << 32ULL
+ * a * b = (unsigned)a * (unsigned)b - fixupb - fixupa
+ * = (unsigned)a * (unsigned)b -(fixupb + fixupa)
+ *
+ * This does both lo (dwords 0/2) and hi parts (1/3) at the same time due
+ * to some optimization potential.
+ */
+static inline __m128i
+mm_mullohi_epi32(const __m128i a, const __m128i b, __m128i *res13)
+{
+   __m128i a13, b13, mul02, mul13;
+   __m128i anegmask, bnegmask, fixup, fixup02, fixup13;
+   a13 = _mm_shuffle_epi32(a, _MM_SHUFFLE(2,3,0,1));
+   b13 = _mm_shuffle_epi32(b, _MM_SHUFFLE(2,3,0,1));
+   anegmask = _mm_srai_epi32(a, 31);
+   bnegmask = _mm_srai_epi32(b, 31);
+   fixup = _mm_add_epi32(_mm_and_si128(anegmask, b),
+                         _mm_and_si128(bnegmask, a));
+   mul02 = _mm_mul_epu32(a, b);
+   mul13 = _mm_mul_epu32(a13, b13);
+   fixup02 = _mm_slli_epi64(fixup, 32);
+   fixup13 = _mm_and_si128(fixup, _mm_set_epi32(-1,0,-1,0));
+   *res13 = _mm_sub_epi64(mul13, fixup13);
+   return _mm_sub_epi64(mul02, fixup02);
+}
 
 
 /* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of
  * _mm_mul_epu32().
  *
- * I suspect this works fine for us because one of our operands is
- * always positive, but not sure that this can be used for general
- * signed integer multiplication.
+ * This always works regardless the signs of the operands, since
+ * the high bits (which would be different) aren't used.
  *
  * This seems close enough to the speed of SSE4 and the real
  * _mm_mullo_epi32() intrinsic as to not justify adding an sse4
@@ -188,6 +223,12 @@ static inline __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
 
    /* Interleave the results, either with shuffles or (slightly
     * faster) direct bit operations:
+    * XXX: might be only true for some cpus (in particular 65nm
+    * Core 2). On most cpus (including that Core 2, but not Nehalem...)
+    * using _mm_shuffle_ps/_mm_shuffle_epi32 might also be faster
+    * than using the 3 instructions below. But logic should be fine
+    * as well, we can't have optimal solution for all cpus (if anything,
+    * should just use _mm_mullo_epi32() if sse41 is available...).
     */
 #if 0
    __m128i ba8             = _mm_shuffle_epi32(ba, 8);
@@ -214,17 +255,44 @@ transpose4_epi32(const __m128i * restrict a,
                  __m128i * restrict q,
                  __m128i * restrict r)
 {
-  __m128i t0 = _mm_unpacklo_epi32(*a, *b);
-  __m128i t1 = _mm_unpacklo_epi32(*c, *d);
-  __m128i t2 = _mm_unpackhi_epi32(*a, *b);
-  __m128i t3 = _mm_unpackhi_epi32(*c, *d);
-
-  *o = _mm_unpacklo_epi64(t0, t1);
-  *p = _mm_unpackhi_epi64(t0, t1);
-  *q = _mm_unpacklo_epi64(t2, t3);
-  *r = _mm_unpackhi_epi64(t2, t3);
+   __m128i t0 = _mm_unpacklo_epi32(*a, *b);
+   __m128i t1 = _mm_unpacklo_epi32(*c, *d);
+   __m128i t2 = _mm_unpackhi_epi32(*a, *b);
+   __m128i t3 = _mm_unpackhi_epi32(*c, *d);
+
+   *o = _mm_unpacklo_epi64(t0, t1);
+   *p = _mm_unpackhi_epi64(t0, t1);
+   *q = _mm_unpacklo_epi64(t2, t3);
+   *r = _mm_unpackhi_epi64(t2, t3);
 }
 
+
+/*
+ * Same as above, except the first two values are already interleaved
+ * (i.e. contain 64bit values).
+ */
+static inline void
+transpose2_64_2_32(const __m128i * restrict a01,
+                   const __m128i * restrict a23,
+                   const __m128i * restrict c,
+                   const __m128i * restrict d,
+                   __m128i * restrict o,
+                   __m128i * restrict p,
+                   __m128i * restrict q,
+                   __m128i * restrict r)
+{
+   __m128i t0 = *a01;
+   __m128i t1 = _mm_unpacklo_epi32(*c, *d);
+   __m128i t2 = *a23;
+   __m128i t3 = _mm_unpackhi_epi32(*c, *d);
+
+   *o = _mm_unpacklo_epi64(t0, t1);
+   *p = _mm_unpackhi_epi64(t0, t1);
+   *q = _mm_unpacklo_epi64(t2, t3);
+   *r = _mm_unpackhi_epi64(t2, t3);
+}
+
+
 #define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))
 
 
diff --git a/src/gallium/auxiliary/util/u_surface.c b/src/gallium/auxiliary/util/u_surface.c
index 6aa44f9602a..c150d92b967 100644
--- a/src/gallium/auxiliary/util/u_surface.c
+++ b/src/gallium/auxiliary/util/u_surface.c
@@ -600,7 +600,8 @@ is_box_inside_resource(const struct pipe_resource *res,
       depth = res->array_size;
       assert(res->array_size % 6 == 0);
       break;
-   case PIPE_MAX_TEXTURE_TYPES:;
+   case PIPE_MAX_TEXTURE_TYPES:
+      break;
    }
 
    return box->x >= 0 &&
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.c b/src/gallium/auxiliary/util/u_upload_mgr.c
index b672fad6bf0..aa31ef2a4bd 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.c
+++ b/src/gallium/auxiliary/util/u_upload_mgr.c
@@ -42,8 +42,8 @@ struct u_upload_mgr {
    struct pipe_context *pipe;
 
    unsigned default_size;  /* Minimum size of the upload buffer, in bytes. */
-   unsigned alignment;     /* Alignment of each sub-allocation. */
    unsigned bind;          /* Bitmask of PIPE_BIND_* flags. */
+   unsigned usage;         /* PIPE_USAGE_* */
    unsigned map_flags;     /* Bitmask of PIPE_TRANSFER_* flags. */
    boolean map_persistent; /* If persistent mappings are supported. */
 
@@ -55,10 +55,9 @@ struct u_upload_mgr {
 };
 
 
-struct u_upload_mgr *u_upload_create( struct pipe_context *pipe,
-                                      unsigned default_size,
-                                      unsigned alignment,
-                                      unsigned bind )
+struct u_upload_mgr *
+u_upload_create(struct pipe_context *pipe, unsigned default_size,
+                unsigned bind, unsigned usage)
 {
    struct u_upload_mgr *upload = CALLOC_STRUCT( u_upload_mgr );
    if (!upload)
@@ -66,8 +65,8 @@ struct u_upload_mgr *u_upload_create( struct pipe_context *pipe,
 
    upload->pipe = pipe;
    upload->default_size = default_size;
-   upload->alignment = alignment;
    upload->bind = bind;
+   upload->usage = usage;
 
    upload->map_persistent =
       pipe->screen->get_param(pipe->screen,
@@ -149,7 +148,7 @@ u_upload_alloc_buffer(struct u_upload_mgr *upload,
    buffer.target = PIPE_BUFFER;
    buffer.format = PIPE_FORMAT_R8_UNORM; /* want TYPELESS or similar */
    buffer.bind = upload->bind;
-   buffer.usage = PIPE_USAGE_STREAM;
+   buffer.usage = upload->usage;
    buffer.width0 = size;
    buffer.height0 = 1;
    buffer.depth0 = 1;
@@ -181,19 +180,24 @@ void
 u_upload_alloc(struct u_upload_mgr *upload,
                unsigned min_out_offset,
                unsigned size,
+               unsigned alignment,
                unsigned *out_offset,
                struct pipe_resource **outbuf,
                void **ptr)
 {
-   unsigned alloc_size = align(size, upload->alignment);
-   unsigned alloc_offset = align(min_out_offset, upload->alignment);
    unsigned buffer_size = upload->buffer ? upload->buffer->width0 : 0;
    unsigned offset;
 
+   min_out_offset = align(min_out_offset, alignment);
+
+   offset = align(upload->offset, alignment);
+   offset = MAX2(offset, min_out_offset);
+
    /* Make sure we have enough space in the upload buffer
-    * for the sub-allocation. */
-   if (unlikely(MAX2(upload->offset, alloc_offset) + alloc_size > buffer_size)) {
-      u_upload_alloc_buffer(upload, alloc_offset + alloc_size);
+    * for the sub-allocation.
+    */
+   if (unlikely(!upload->buffer || offset + size > buffer_size)) {
+      u_upload_alloc_buffer(upload, min_out_offset + size);
 
       if (unlikely(!upload->buffer)) {
          *out_offset = ~0;
@@ -202,11 +206,10 @@ u_upload_alloc(struct u_upload_mgr *upload,
          return;
       }
 
+      offset = min_out_offset;
       buffer_size = upload->buffer->width0;
    }
 
-   offset = MAX2(upload->offset, alloc_offset);
-
    if (unlikely(!upload->map)) {
       upload->map = pipe_buffer_map_range(upload->pipe, upload->buffer,
                                           offset,
@@ -224,8 +227,8 @@ u_upload_alloc(struct u_upload_mgr *upload,
       upload->map -= offset;
    }
 
-   assert(offset < upload->buffer->width0);
-   assert(offset + size <= upload->buffer->width0);
+   assert(offset < buffer_size);
+   assert(offset + size <= buffer_size);
    assert(size);
 
    /* Emit the return values: */
@@ -233,19 +236,20 @@ u_upload_alloc(struct u_upload_mgr *upload,
    pipe_resource_reference(outbuf, upload->buffer);
    *out_offset = offset;
 
-   upload->offset = offset + alloc_size;
+   upload->offset = offset + size;
 }
 
 void u_upload_data(struct u_upload_mgr *upload,
                    unsigned min_out_offset,
                    unsigned size,
+                   unsigned alignment,
                    const void *data,
                    unsigned *out_offset,
                    struct pipe_resource **outbuf)
 {
    uint8_t *ptr;
 
-   u_upload_alloc(upload, min_out_offset, size,
+   u_upload_alloc(upload, min_out_offset, size, alignment,
                   out_offset, outbuf,
                   (void**)&ptr);
    if (ptr)
@@ -257,6 +261,7 @@ void u_upload_buffer(struct u_upload_mgr *upload,
                      unsigned min_out_offset,
                      unsigned offset,
                      unsigned size,
+                     unsigned alignment,
                      struct pipe_resource *inbuf,
                      unsigned *out_offset,
                      struct pipe_resource **outbuf)
@@ -278,6 +283,7 @@ void u_upload_buffer(struct u_upload_mgr *upload,
    if (0)
       debug_printf("upload ptr %p ofs %d sz %d\n", map, offset, size);
 
-   u_upload_data(upload, min_out_offset, size, map, out_offset, outbuf);
+   u_upload_data(upload, min_out_offset, size, alignment,
+                 map, out_offset, outbuf);
    pipe_buffer_unmap( upload->pipe, transfer );
 }
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.h b/src/gallium/auxiliary/util/u_upload_mgr.h
index 67c6daa4e7f..1d933d754ae 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.h
+++ b/src/gallium/auxiliary/util/u_upload_mgr.h
@@ -43,13 +43,12 @@ struct pipe_resource;
  *
  * \param pipe          Pipe driver.
  * \param default_size  Minimum size of the upload buffer, in bytes.
- * \param alignment     Alignment of each suballocation in the upload buffer.
  * \param bind          Bitmask of PIPE_BIND_* flags.
+ * \param usage         PIPE_USAGE_*
  */
-struct u_upload_mgr *u_upload_create( struct pipe_context *pipe,
-                                      unsigned default_size,
-                                      unsigned alignment,
-                                      unsigned bind );
+struct u_upload_mgr *
+u_upload_create(struct pipe_context *pipe, unsigned default_size,
+                unsigned bind, unsigned usage);
 
 /**
  * Destroy the upload manager.
@@ -74,6 +73,7 @@ void u_upload_unmap( struct u_upload_mgr *upload );
  * \param upload           Upload manager
  * \param min_out_offset   Minimum offset that should be returned in out_offset.
  * \param size             Size of the allocation.
+ * \param alignment        Alignment of the suballocation within the buffer
  * \param out_offset       Pointer to where the new buffer offset will be returned.
  * \param outbuf           Pointer to where the upload buffer will be returned.
  * \param ptr              Pointer to the allocated memory that is returned.
@@ -81,6 +81,7 @@ void u_upload_unmap( struct u_upload_mgr *upload );
 void u_upload_alloc(struct u_upload_mgr *upload,
                     unsigned min_out_offset,
                     unsigned size,
+                    unsigned alignment,
                     unsigned *out_offset,
                     struct pipe_resource **outbuf,
                     void **ptr);
@@ -95,6 +96,7 @@ void u_upload_alloc(struct u_upload_mgr *upload,
 void u_upload_data(struct u_upload_mgr *upload,
                    unsigned min_out_offset,
                    unsigned size,
+                   unsigned alignment,
                    const void *data,
                    unsigned *out_offset,
                    struct pipe_resource **outbuf);
@@ -110,6 +112,7 @@ void u_upload_buffer(struct u_upload_mgr *upload,
                      unsigned min_out_offset,
                      unsigned offset,
                      unsigned size,
+                     unsigned alignment,
                      struct pipe_resource *inbuf,
                      unsigned *out_offset,
                      struct pipe_resource **outbuf);
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c
index 54e9e717104..e16ee3651e6 100644
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -314,8 +314,9 @@ u_vbuf_create(struct pipe_context *pipe,
    mgr->translate_cache = translate_cache_create();
    memset(mgr->fallback_vbs, ~0, sizeof(mgr->fallback_vbs));
 
-   mgr->uploader = u_upload_create(pipe, 1024 * 1024, 4,
-                                   PIPE_BIND_VERTEX_BUFFER);
+   mgr->uploader = u_upload_create(pipe, 1024 * 1024,
+                                   PIPE_BIND_VERTEX_BUFFER,
+                                   PIPE_USAGE_STREAM);
 
    return mgr;
 }
@@ -454,7 +455,7 @@ u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
 
       /* Create and map the output buffer. */
       u_upload_alloc(mgr->uploader, 0,
-                     key->output_stride * num_indices,
+                     key->output_stride * num_indices, 4,
                      &out_offset, &out_buffer,
                      (void**)&out_map);
       if (!out_buffer)
@@ -487,7 +488,7 @@ u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
       /* Create and map the output buffer. */
       u_upload_alloc(mgr->uploader,
                      key->output_stride * start_vertex,
-                     key->output_stride * num_vertices,
+                     key->output_stride * num_vertices, 4,
                      &out_offset, &out_buffer,
                      (void**)&out_map);
       if (!out_buffer)
@@ -987,7 +988,7 @@ u_vbuf_upload_buffers(struct u_vbuf *mgr,
       real_vb = &mgr->real_vertex_buffer[i];
       ptr = mgr->vertex_buffer[i].user_buffer;
 
-      u_upload_data(mgr->uploader, start, end - start, ptr + start,
+      u_upload_data(mgr->uploader, start, end - start, 4, ptr + start,
                     &real_vb->buffer_offset, &real_vb->buffer);
       if (!real_vb->buffer)
          return PIPE_ERROR_OUT_OF_MEMORY;
diff --git a/src/gallium/auxiliary/vl/vl_compositor.c b/src/gallium/auxiliary/vl/vl_compositor.c
index afe53063b48..77688f0f99f 100644
--- a/src/gallium/auxiliary/vl/vl_compositor.c
+++ b/src/gallium/auxiliary/vl/vl_compositor.c
@@ -716,6 +716,7 @@ gen_vertex_data(struct vl_compositor *c, struct vl_compositor_state *s, struct u
    /* Allocate new memory for vertices. */
    u_upload_alloc(c->upload, 0,
                   c->vertex_buf.stride * VL_COMPOSITOR_MAX_LAYERS * 4, /* size */
+                  4, /* alignment */
                   &c->vertex_buf.buffer_offset, &c->vertex_buf.buffer,
                   (void**)&vb);
 
@@ -1090,7 +1091,8 @@ vl_compositor_init(struct vl_compositor *c, struct pipe_context *pipe)
 
    c->pipe = pipe;
 
-   c->upload = u_upload_create(pipe, 128 * 1024, 4, PIPE_BIND_VERTEX_BUFFER);
+   c->upload = u_upload_create(pipe, 128 * 1024, PIPE_BIND_VERTEX_BUFFER,
+                               PIPE_USAGE_STREAM);
 
    if (!c->upload)
       return false;
diff --git a/src/gallium/auxiliary/vl/vl_mc.c b/src/gallium/auxiliary/vl/vl_mc.c
index 6c317bbe04a..eb703a90445 100644
--- a/src/gallium/auxiliary/vl/vl_mc.c
+++ b/src/gallium/auxiliary/vl/vl_mc.c
@@ -79,14 +79,18 @@ calc_position(struct vl_mc *r, struct ureg_program *shader, struct ureg_src bloc
 }
 
 static struct ureg_dst
-calc_line(struct ureg_program *shader)
+calc_line(struct pipe_screen *screen, struct ureg_program *shader)
 {
    struct ureg_dst tmp;
    struct ureg_src pos;
 
    tmp = ureg_DECL_temporary(shader);
 
-   pos = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_POSITION, VS_O_VPOS, TGSI_INTERPOLATE_LINEAR);
+   if (screen->get_param(screen, PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL))
+      pos = ureg_DECL_system_value(shader, TGSI_SEMANTIC_POSITION, 0);
+   else
+      pos = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_POSITION, VS_O_VPOS,
+                               TGSI_INTERPOLATE_LINEAR);
 
    /*
     * tmp.y = fraction(pos.y / 2) >= 0.5 ? 1 : 0
@@ -177,7 +181,7 @@ create_ref_frag_shader(struct vl_mc *r)
 
    fragment = ureg_DECL_output(shader, TGSI_SEMANTIC_COLOR, 0);
 
-   field = calc_line(shader);
+   field = calc_line(r->pipe->screen, shader);
 
    /*
     * ref = field.z ? tc[1] : tc[0]
@@ -324,7 +328,7 @@ create_ycbcr_frag_shader(struct vl_mc *r, float scale, bool invert,
 
    fragment = ureg_DECL_output(shader, TGSI_SEMANTIC_COLOR, 0);
 
-   tmp = calc_line(shader);
+   tmp = calc_line(r->pipe->screen, shader);
 
    /*
     * if (field == tc.w)
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
index f5bb3a0106f..b5c70451ce8 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
@@ -792,7 +792,7 @@ vl_mpeg12_end_frame(struct pipe_video_codec *decoder,
       for (j = 0; j < VL_MAX_REF_FRAMES; ++j) {
          if (!ref_frames[j] || !ref_frames[j][i]) continue;
 
-         vb[2] = vl_vb_get_mv(&buf->vertex_stream, j);;
+         vb[2] = vl_vb_get_mv(&buf->vertex_stream, j);
          dec->context->set_vertex_buffers(dec->context, 0, 3, vb);
 
          vl_mc_render_ref(i ? &dec->mc_c : &dec->mc_y, &buf->mc[i], ref_frames[j][i]);
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index e900283f731..c8f5f6a461e 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -213,6 +213,11 @@ The integer capabilities:
 * ``PIPE_CAP_DRAW_INDIRECT``: Whether the driver supports taking draw arguments
   { count, instance_count, start, index_bias } from a PIPE_BUFFER resource.
   See pipe_draw_info.
+* ``PIPE_CAP_MULTI_DRAW_INDIRECT``: Whether the driver supports
+  pipe_draw_info::indirect_stride and ::indirect_count
+* ``PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS``: Whether the driver supports
+  taking the number of indirect draws from a separate parameter
+  buffer, see pipe_draw_info::indirect_params.
 * ``PIPE_CAP_TGSI_FS_FINE_DERIVATIVE``: Whether the fragment shader supports
   the FINE versions of DDX/DDY.
 * ``PIPE_CAP_VENDOR_ID``: The vendor ID of the underlying hardware. If it's
@@ -239,8 +244,7 @@ The integer capabilities:
   will need to lower TGSI_SEMANTIC_VERTEXID to TGSI_SEMANTIC_VERTEXID_NOBASE
   and TGSI_SEMANTIC_BASEVERTEX, so drivers setting this must handle both these
   semantics. Only relevant if geometry shaders are supported.
-  (Currently not possible to query availability of these two semantics outside
-  this, at least BASEVERTEX should be exposed separately too).
+  (BASEVERTEX could be exposed separately too via ``PIPE_CAP_DRAW_PARAMETERS``).
 * ``PIPE_CAP_POLYGON_OFFSET_CLAMP``: If true, the driver implements support
   for ``pipe_rasterizer_state::offset_clamp``.
 * ``PIPE_CAP_MULTISAMPLE_Z_RESOLVE``: Whether the driver supports blitting
@@ -283,6 +287,20 @@ The integer capabilities:
   a compressed block is copied to/from a plain pixel of the same size.
 * ``PIPE_CAP_CLEAR_TEXTURE``: Whether `clear_texture` will be
   available in contexts.
+* ``PIPE_CAP_DRAW_PARAMETERS``: Whether ``TGSI_SEMANTIC_BASEVERTEX``,
+  ``TGSI_SEMANTIC_BASEINSTANCE``, and ``TGSI_SEMANTIC_DRAWID`` are
+  supported in vertex shaders.
+* ``PIPE_CAP_TGSI_PACK_HALF_FLOAT``: Whether the ``UP2H`` and ``PK2H``
+  TGSI opcodes are supported.
+* ``PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL``: If state trackers should use
+  a system value for the POSITION fragment shader input.
+* ``PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL``: If state trackers should use
+  a system value for the FACE fragment shader input.
+  Also, the FACE system value is integer, not float.
+* ``PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT``: Describes the required
+  alignment for pipe_shader_buffer::buffer_offset, in bytes. Maximum
+  value allowed is 256 (for GL conformance). 0 is only allowed if
+  shader buffers are not supported.
 
 
 .. _pipe_capf:
@@ -375,6 +393,10 @@ to be 0.
   of iterations that loops are allowed to have to be unrolled. It is only
   a hint to state trackers. Whether any loops will be unrolled is not
   guaranteed.
+* ``PIPE_SHADER_CAP_MAX_SHADER_BUFFERS``: Maximum number of memory buffers
+  (also used to implement atomic counters). Having this be non-0 also
+  implies support for the ``LOAD``, ``STORE``, and ``ATOM*`` TGSI
+  opcodes.
 
 
 .. _pipe_compute_cap:
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index e7b0c2f6377..7810a3eb915 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -458,7 +458,11 @@ while DDY is allowed to be the same for the entire 2x2 quad.
 
 .. opcode:: PK2H - Pack Two 16-bit Floats
 
-  TBD
+This instruction replicates its result.
+
+.. math::
+
+  dst = f32\_to\_f16(src.x) | f32\_to\_f16(src.y) << 16
 
 
 .. opcode:: PK2US - Pack Two Unsigned 16-bit Scalars
@@ -615,7 +619,15 @@ This instruction replicates its result.
 
 .. opcode:: UP2H - Unpack Two 16-Bit Floats
 
-  TBD
+.. math::
+
+  dst.x = f16\_to\_f32(src0.x \& 0xffff)
+
+  dst.y = f16\_to\_f32(src0.x >> 16)
+
+  dst.z = f16\_to\_f32(src0.x \& 0xffff)
+
+  dst.w = f16\_to\_f32(src0.x >> 16)
 
 .. note::
 
@@ -2252,11 +2264,11 @@ after lookup.
 Resource Access Opcodes
 ^^^^^^^^^^^^^^^^^^^^^^^
 
-.. opcode:: LOAD - Fetch data from a shader resource
+.. opcode:: LOAD - Fetch data from a shader buffer or image
 
                Syntax: ``LOAD dst, resource, address``
 
-               Example: ``LOAD TEMP[0], RES[0], TEMP[1]``
+               Example: ``LOAD TEMP[0], BUFFER[0], TEMP[1]``
 
                Using the provided integer address, LOAD fetches data
                from the specified buffer or texture without any
@@ -2280,7 +2292,7 @@ Resource Access Opcodes
 
                Syntax: ``STORE resource, address, src``
 
-               Example: ``STORE RES[0], TEMP[0], TEMP[1]``
+               Example: ``STORE BUFFER[0], TEMP[0], TEMP[1]``
 
                Using the provided integer address, STORE writes data
                to the specified buffer or texture.
@@ -2299,6 +2311,18 @@ Resource Access Opcodes
                texture arrays and 2D textures.  address.w is always
                ignored.
 
+.. opcode:: RESQ - Query information about a resource
+
+  Syntax: ``RESQ dst, resource``
+
+  Example: ``RESQ TEMP[0], BUFFER[0]``
+
+  Returns information about the buffer or image resource. For buffer
+  resources, the size (in bytes) is returned in the x component. For
+  image resources, .xyz will contain the width/height/layers of the
+  image, while .w will contain the number of samples for multi-sampled
+  images.
+
 
 .. _threadsyncopcodes:
 
@@ -2358,158 +2382,159 @@ These opcodes provide atomic variants of some common arithmetic and
 logical operations.  In this context atomicity means that another
 concurrent memory access operation that affects the same memory
 location is guaranteed to be performed strictly before or after the
-entire execution of the atomic operation.
-
-For the moment they're only valid in compute programs.
+entire execution of the atomic operation. The resource may be a buffer
+or an image. In the case of an image, the offset works the same as for
+``LOAD`` and ``STORE``, specified above. These atomic operations may
+only be used with 32-bit integer image formats.
 
 .. opcode:: ATOMUADD - Atomic integer addition
 
   Syntax: ``ATOMUADD dst, resource, offset, src``
 
-  Example: ``ATOMUADD TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMUADD TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = dst_i + src_i
+  resource[offset] = dst_x + src_x
 
 
 .. opcode:: ATOMXCHG - Atomic exchange
 
   Syntax: ``ATOMXCHG dst, resource, offset, src``
 
-  Example: ``ATOMXCHG TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMXCHG TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = src_i
+  resource[offset] = src_x
 
 
 .. opcode:: ATOMCAS - Atomic compare-and-exchange
 
   Syntax: ``ATOMCAS dst, resource, offset, cmp, src``
 
-  Example: ``ATOMCAS TEMP[0], RES[0], TEMP[1], TEMP[2], TEMP[3]``
+  Example: ``ATOMCAS TEMP[0], BUFFER[0], TEMP[1], TEMP[2], TEMP[3]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = (dst_i == cmp_i ? src_i : dst_i)
+  resource[offset] = (dst_x == cmp_x ? src_x : dst_x)
 
 
 .. opcode:: ATOMAND - Atomic bitwise And
 
   Syntax: ``ATOMAND dst, resource, offset, src``
 
-  Example: ``ATOMAND TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMAND TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = dst_i \& src_i
+  resource[offset] = dst_x \& src_x
 
 
 .. opcode:: ATOMOR - Atomic bitwise Or
 
   Syntax: ``ATOMOR dst, resource, offset, src``
 
-  Example: ``ATOMOR TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMOR TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = dst_i | src_i
+  resource[offset] = dst_x | src_x
 
 
 .. opcode:: ATOMXOR - Atomic bitwise Xor
 
   Syntax: ``ATOMXOR dst, resource, offset, src``
 
-  Example: ``ATOMXOR TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMXOR TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = dst_i \oplus src_i
+  resource[offset] = dst_x \oplus src_x
 
 
 .. opcode:: ATOMUMIN - Atomic unsigned minimum
 
   Syntax: ``ATOMUMIN dst, resource, offset, src``
 
-  Example: ``ATOMUMIN TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMUMIN TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = (dst_i < src_i ? dst_i : src_i)
+  resource[offset] = (dst_x < src_x ? dst_x : src_x)
 
 
 .. opcode:: ATOMUMAX - Atomic unsigned maximum
 
   Syntax: ``ATOMUMAX dst, resource, offset, src``
 
-  Example: ``ATOMUMAX TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMUMAX TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = (dst_i > src_i ? dst_i : src_i)
+  resource[offset] = (dst_x > src_x ? dst_x : src_x)
 
 
 .. opcode:: ATOMIMIN - Atomic signed minimum
 
   Syntax: ``ATOMIMIN dst, resource, offset, src``
 
-  Example: ``ATOMIMIN TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMIMIN TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = (dst_i < src_i ? dst_i : src_i)
+  resource[offset] = (dst_x < src_x ? dst_x : src_x)
 
 
 .. opcode:: ATOMIMAX - Atomic signed maximum
 
   Syntax: ``ATOMIMAX dst, resource, offset, src``
 
-  Example: ``ATOMIMAX TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMIMAX TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = (dst_i > src_i ? dst_i : src_i)
+  resource[offset] = (dst_x > src_x ? dst_x : src_x)
 
 
 
@@ -2646,7 +2671,8 @@ space coordinate system.  After clipping, the X, Y and Z components of the
 vertex will be divided by the W value to get normalized device coordinates.
 
 For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
-fragment shader input contains the fragment's window position.  The X
+fragment shader input (or system value, depending on which one is
+supported by the driver) contains the fragment's window position.  The X
 component starts at zero and always increases from left to right.
 The Y component starts at zero and always increases but Y=0 may either
 indicate the top of the window or the bottom depending on the fragment
@@ -2758,11 +2784,17 @@ typically only used for legacy graphics APIs.
 TGSI_SEMANTIC_FACE
 """"""""""""""""""
 
-This label applies to fragment shader inputs only and indicates that
-the register contains front/back-face information of the form (F, 0,
-0, 1).  The first component will be positive when the fragment belongs
-to a front-facing polygon, and negative when the fragment belongs to a
-back-facing polygon.
+This label applies to fragment shader inputs (or system values,
+depending on which one is supported by the driver) and indicates that
+the register contains front/back-face information.
+
+If it is an input, it will be a floating-point vector in the form (F, 0, 0, 1),
+where F will be positive when the fragment belongs to a front-facing polygon,
+and negative when the fragment belongs to a back-facing polygon.
+
+If it is a system value, it will be an integer vector in the form (F, 0, 0, 1),
+where F is 0xffffffff when the fragment belongs to a front-facing polygon and
+0 when the fragment belongs to a back-facing polygon.
 
 
 TGSI_SEMANTIC_EDGEFLAG
@@ -2949,6 +2981,19 @@ invocation is covered or not. Helper invocations are created in order
 to properly compute derivatives, however it may be desirable to skip
 some of the logic in those cases. See ``gl_HelperInvocation`` documentation.
 
+TGSI_SEMANTIC_BASEINSTANCE
+""""""""""""""""""""""""""
+
+For vertex shaders, the base instance argument supplied for this
+draw. This is an integer value, and only the X component is used.
+
+TGSI_SEMANTIC_DRAWID
+""""""""""""""""""""
+
+For vertex shaders, the zero-based index of the current draw in a
+``glMultiDraw*`` invocation. This is an integer value, and only the X
+component is used.
+
 
 Declaration Interpolate
 ^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/src/gallium/drivers/freedreno/.gitignore b/src/gallium/drivers/freedreno/.gitignore
new file mode 100644
index 00000000000..150f5d19f5b
--- /dev/null
+++ b/src/gallium/drivers/freedreno/.gitignore
@@ -0,0 +1 @@
+ir3_compiler
diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources
index baae9144005..74ef4168655 100644
--- a/src/gallium/drivers/freedreno/Makefile.sources
+++ b/src/gallium/drivers/freedreno/Makefile.sources
@@ -128,6 +128,7 @@ ir3_SOURCES := \
 	ir3/ir3_group.c \
 	ir3/ir3.h \
 	ir3/ir3_legalize.c \
+	ir3/ir3_nir.c \
 	ir3/ir3_nir.h \
 	ir3/ir3_nir_lower_if_else.c \
 	ir3/ir3_print.c \
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.c b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
index 74cbbf2edd8..e47bbff5643 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
@@ -171,8 +171,8 @@ fd3_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
 
 	fd3_query_context_init(pctx);
 
-	fd3_ctx->border_color_uploader = u_upload_create(pctx, 4096,
-			2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, 0);
+	fd3_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0,
+                                                         PIPE_USAGE_STREAM);
 
 	return pctx;
 }
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 24afbc9e956..e65a352e7f6 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -145,7 +145,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	void *ptr;
 
 	u_upload_alloc(fd3_ctx->border_color_uploader,
-			0, 2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, &off,
+			0, BORDER_COLOR_UPLOAD_SIZE,
+		       BORDER_COLOR_UPLOAD_SIZE, &off,
 			&fd3_ctx->border_color_buf,
 			&ptr);
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.c b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
index e53e0c56c9a..7d6365bbb6d 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
@@ -171,8 +171,8 @@ fd4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
 
 	fd4_query_context_init(pctx);
 
-	fd4_ctx->border_color_uploader = u_upload_create(pctx, 4096,
-			2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, 0);
+	fd4_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0,
+                                                         PIPE_USAGE_STREAM);
 
 	return pctx;
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index b9a28149722..bc62a5d9a4b 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -133,7 +133,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	void *ptr;
 
 	u_upload_alloc(fd4_ctx->border_color_uploader,
-			0, 2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, &off,
+			0, BORDER_COLOR_UPLOAD_SIZE,
+		       BORDER_COLOR_UPLOAD_SIZE, &off,
 			&fd4_ctx->border_color_buf,
 			&ptr);
 
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index 571c8142bf7..418b71b95de 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -40,6 +40,8 @@
 #include "freedreno_gmem.h"
 #include "freedreno_util.h"
 
+#define BORDER_COLOR_UPLOAD_SIZE (2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE)
+
 struct fd_vertex_stateobj;
 
 struct fd_texture_stateobj {
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 5bbe4016a2a..9d0cdd8e545 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -226,6 +226,8 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
 	case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
 	case PIPE_CAP_DRAW_INDIRECT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
 	case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
 	case PIPE_CAP_POLYGON_OFFSET_CLAMP:
 	case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
@@ -238,6 +240,11 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_SHAREABLE_SHADERS:
 	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
 	case PIPE_CAP_CLEAR_TEXTURE:
+	case PIPE_CAP_DRAW_PARAMETERS:
+	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+	case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
@@ -414,6 +421,8 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 		return PIPE_SHADER_IR_TGSI;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
+	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+		return 0;
 	}
 	debug_printf("unknown shader param %d\n", param);
 	return 0;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index d55daeefe06..481859efb17 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -40,6 +40,7 @@
 #include "freedreno_util.h"
 
 #include "ir3_compiler.h"
+#include "ir3_nir.h"
 #include "instr-a3xx.h"
 #include "ir3.h"
 
@@ -105,10 +106,10 @@ int main(int argc, char **argv)
 	const char *filename;
 	struct tgsi_token toks[65536];
 	struct tgsi_parse_context parse;
-	struct ir3_compiler *compiler;
 	struct ir3_shader_variant v;
 	struct ir3_shader s;
 	struct ir3_shader_key key = {};
+	/* TODO cmdline option to target different gpus: */
 	unsigned gpu_id = 320;
 	const char *info;
 	void *ptr;
@@ -228,7 +229,12 @@ int main(int argc, char **argv)
 	if (!tgsi_text_translate(ptr, toks, Elements(toks)))
 		errx(1, "could not parse `%s'", filename);
 
-	s.tokens = toks;
+	if (fd_mesa_debug & FD_DBG_OPTMSGS)
+		tgsi_dump(toks, 0);
+
+	nir_shader *nir = ir3_tgsi_to_nir(toks);
+	s.compiler = ir3_compiler_create(gpu_id);
+	s.nir = ir3_optimize_nir(&s, nir, NULL);
 
 	v.key = key;
 	v.shader = &s;
@@ -246,11 +252,8 @@ int main(int argc, char **argv)
 		break;
 	}
 
-	/* TODO cmdline option to target different gpus: */
-	compiler = ir3_compiler_create(gpu_id);
-
 	info = "NIR compiler";
-	ret = ir3_compile_shader_nir(compiler, &v);
+	ret = ir3_compile_shader_nir(s.compiler, &v);
 	if (ret) {
 		fprintf(stderr, "compiler failed!\n");
 		return ret;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 224f7806b3c..86afda4ba08 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -32,10 +32,6 @@
 #include "util/u_string.h"
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
-#include "tgsi/tgsi_lowering.h"
-#include "tgsi/tgsi_strings.h"
-
-#include "nir/tgsi_to_nir.h"
 
 #include "freedreno_util.h"
 
@@ -123,97 +119,10 @@ struct ir3_compile {
 static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
 static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock);
 
-static struct nir_shader *to_nir(struct ir3_compile *ctx,
-		const struct tgsi_token *tokens, struct ir3_shader_variant *so)
-{
-	static const nir_shader_compiler_options options = {
-			.lower_fpow = true,
-			.lower_fsat = true,
-			.lower_scmp = true,
-			.lower_flrp = true,
-			.lower_ffract = true,
-			.native_integers = true,
-	};
-	struct nir_lower_tex_options tex_options = {
-			.lower_rect = 0,
-	};
-	bool progress;
-
-	switch (so->type) {
-	case SHADER_FRAGMENT:
-	case SHADER_COMPUTE:
-		tex_options.saturate_s = so->key.fsaturate_s;
-		tex_options.saturate_t = so->key.fsaturate_t;
-		tex_options.saturate_r = so->key.fsaturate_r;
-		break;
-	case SHADER_VERTEX:
-		tex_options.saturate_s = so->key.vsaturate_s;
-		tex_options.saturate_t = so->key.vsaturate_t;
-		tex_options.saturate_r = so->key.vsaturate_r;
-		break;
-	}
-
-	if (ctx->compiler->gpu_id >= 400) {
-		/* a4xx seems to have *no* sam.p */
-		tex_options.lower_txp = ~0;  /* lower all txp */
-	} else {
-		/* a3xx just needs to avoid sam.p for 3d tex */
-		tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
-	}
-
-	struct nir_shader *s = tgsi_to_nir(tokens, &options);
-
-	if (fd_mesa_debug & FD_DBG_DISASM) {
-		debug_printf("----------------------\n");
-		nir_print_shader(s, stdout);
-		debug_printf("----------------------\n");
-	}
-
-	nir_opt_global_to_local(s);
-	nir_convert_to_ssa(s);
-	if (s->stage == MESA_SHADER_VERTEX) {
-		nir_lower_clip_vs(s, so->key.ucp_enables);
-	} else if (s->stage == MESA_SHADER_FRAGMENT) {
-		nir_lower_clip_fs(s, so->key.ucp_enables);
-	}
-	nir_lower_tex(s, &tex_options);
-	if (so->key.color_two_side)
-		nir_lower_two_sided_color(s);
-	nir_lower_idiv(s);
-	nir_lower_load_const_to_scalar(s);
-
-	do {
-		progress = false;
-
-		nir_lower_vars_to_ssa(s);
-		nir_lower_alu_to_scalar(s);
-		nir_lower_phis_to_scalar(s);
-
-		progress |= nir_copy_prop(s);
-		progress |= nir_opt_dce(s);
-		progress |= nir_opt_cse(s);
-		progress |= ir3_nir_lower_if_else(s);
-		progress |= nir_opt_algebraic(s);
-		progress |= nir_opt_constant_folding(s);
-
-	} while (progress);
-
-	nir_remove_dead_variables(s);
-	nir_validate_shader(s);
-
-	if (fd_mesa_debug & FD_DBG_DISASM) {
-		debug_printf("----------------------\n");
-		nir_print_shader(s, stdout);
-		debug_printf("----------------------\n");
-	}
-
-	return s;
-}
 
 static struct ir3_compile *
 compile_init(struct ir3_compiler *compiler,
-		struct ir3_shader_variant *so,
-		const struct tgsi_token *tokens)
+		struct ir3_shader_variant *so)
 {
 	struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile);
 
@@ -239,7 +148,28 @@ compile_init(struct ir3_compiler *compiler,
 	ctx->block_ht = _mesa_hash_table_create(ctx,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
 
-	ctx->s = to_nir(ctx, tokens, so);
+	/* TODO: maybe generate some sort of bitmask of what key
+	 * lowers vs what shader has (ie. no need to lower
+	 * texture clamp lowering if no texture sample instrs)..
+	 * although should be done further up the stack to avoid
+	 * creating duplicate variants..
+	 */
+
+	if (ir3_key_lowers_nir(&so->key)) {
+		nir_shader *s = nir_shader_clone(ctx, so->shader->nir);
+		ctx->s = ir3_optimize_nir(so->shader, s, &so->key);
+	} else {
+		/* fast-path for shader key that lowers nothing in NIR: */
+		ctx->s = so->shader->nir;
+	}
+
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		DBG("dump nir%dv%d: type=%d, k={bp=%u,cts=%u,hp=%u}",
+			so->shader->id, so->id, so->type,
+			so->key.binning_pass, so->key.color_two_side,
+			so->key.half_precision);
+		nir_print_shader(ctx->s, stdout);
+	}
 
 	so->first_driver_param = so->first_immediate = ctx->s->num_uniforms;
 
@@ -1954,8 +1884,6 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr)
 		case nir_texop_query_levels:
 			emit_tex_query_levels(ctx, tex);
 			break;
-		case nir_texop_samples_identical:
-			unreachable("nir_texop_samples_identical");
 		default:
 			emit_tex(ctx, tex);
 			break;
@@ -2170,6 +2098,8 @@ emit_stream_out(struct ir3_compile *ctx)
 static void
 emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
 {
+	nir_metadata_require(impl, nir_metadata_block_index);
+
 	emit_cf_list(ctx, &impl->body);
 	emit_block(ctx, impl->end_block);
 
@@ -2499,7 +2429,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 
 	assert(!so->ir);
 
-	ctx = compile_init(compiler, so, so->shader->tokens);
+	ctx = compile_init(compiler, so);
 	if (!ctx) {
 		DBG("INIT failed!");
 		ret = -1;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
new file mode 100644
index 00000000000..565b9c32c1d
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
@@ -0,0 +1,153 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2015 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+
+#include "freedreno_util.h"
+
+#include "ir3_nir.h"
+#include "ir3_compiler.h"
+#include "ir3_shader.h"
+
+#include "nir/tgsi_to_nir.h"
+
+struct nir_shader *
+ir3_tgsi_to_nir(const struct tgsi_token *tokens)
+{
+	static const nir_shader_compiler_options options = {
+			.lower_fpow = true,
+			.lower_fsat = true,
+			.lower_scmp = true,
+			.lower_flrp = true,
+			.lower_ffract = true,
+			.native_integers = true,
+	};
+	return tgsi_to_nir(tokens, &options);
+}
+
+/* for given shader key, are any steps handled in nir? */
+bool
+ir3_key_lowers_nir(const struct ir3_shader_key *key)
+{
+	return key->fsaturate_s | key->fsaturate_t | key->fsaturate_r |
+			key->vsaturate_s | key->vsaturate_t | key->vsaturate_r |
+			key->ucp_enables | key->color_two_side;
+}
+
+#define OPT(nir, pass, ...) ({                             \
+   bool this_progress = false;                             \
+   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
+   this_progress;                                          \
+})
+
+#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
+
+struct nir_shader *
+ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
+		const struct ir3_shader_key *key)
+{
+	struct nir_lower_tex_options tex_options = {
+			.lower_rect = 0,
+	};
+	bool progress;
+
+	if (key) {
+		switch (shader->type) {
+		case SHADER_FRAGMENT:
+		case SHADER_COMPUTE:
+			tex_options.saturate_s = key->fsaturate_s;
+			tex_options.saturate_t = key->fsaturate_t;
+			tex_options.saturate_r = key->fsaturate_r;
+			break;
+		case SHADER_VERTEX:
+			tex_options.saturate_s = key->vsaturate_s;
+			tex_options.saturate_t = key->vsaturate_t;
+			tex_options.saturate_r = key->vsaturate_r;
+			break;
+		}
+	}
+
+	if (shader->compiler->gpu_id >= 400) {
+		/* a4xx seems to have *no* sam.p */
+		tex_options.lower_txp = ~0;  /* lower all txp */
+	} else {
+		/* a3xx just needs to avoid sam.p for 3d tex */
+		tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
+	}
+
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		debug_printf("----------------------\n");
+		nir_print_shader(s, stdout);
+		debug_printf("----------------------\n");
+	}
+
+	OPT_V(s, nir_opt_global_to_local);
+	OPT_V(s, nir_convert_to_ssa);
+
+	if (key) {
+		if (s->stage == MESA_SHADER_VERTEX) {
+			OPT_V(s, nir_lower_clip_vs, key->ucp_enables);
+		} else if (s->stage == MESA_SHADER_FRAGMENT) {
+			OPT_V(s, nir_lower_clip_fs, key->ucp_enables);
+		}
+		if (key->color_two_side) {
+			OPT_V(s, nir_lower_two_sided_color);
+		}
+	}
+
+	OPT_V(s, nir_lower_tex, &tex_options);
+	OPT_V(s, nir_lower_idiv);
+	OPT_V(s, nir_lower_load_const_to_scalar);
+
+	do {
+		progress = false;
+
+		OPT_V(s, nir_lower_vars_to_ssa);
+		OPT_V(s, nir_lower_alu_to_scalar);
+		OPT_V(s, nir_lower_phis_to_scalar);
+
+		progress |= OPT(s, nir_copy_prop);
+		progress |= OPT(s, nir_opt_dce);
+		progress |= OPT(s, nir_opt_cse);
+		progress |= OPT(s, ir3_nir_lower_if_else);
+		progress |= OPT(s, nir_opt_algebraic);
+		progress |= OPT(s, nir_opt_constant_folding);
+
+	} while (progress);
+
+	OPT_V(s, nir_remove_dead_variables);
+
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		debug_printf("----------------------\n");
+		nir_print_shader(s, stdout);
+		debug_printf("----------------------\n");
+	}
+
+	nir_sweep(s);
+
+	return s;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.h b/src/gallium/drivers/freedreno/ir3/ir3_nir.h
index 9950782dc38..534199d3744 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.h
@@ -32,6 +32,13 @@
 #include "glsl/nir/nir.h"
 #include "glsl/nir/shader_enums.h"
 
+#include "ir3_shader.h"
+
 bool ir3_nir_lower_if_else(nir_shader *shader);
 
+struct nir_shader * ir3_tgsi_to_nir(const struct tgsi_token *tokens);
+bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
+struct nir_shader * ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
+		const struct ir3_shader_key *key);
+
 #endif /* IR3_NIR_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 7b565332256..7d17f426ad3 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -39,7 +39,7 @@
 
 #include "ir3_shader.h"
 #include "ir3_compiler.h"
-
+#include "ir3_nir.h"
 
 static void
 delete_variant(struct ir3_shader_variant *v)
@@ -187,12 +187,6 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 	v->key = key;
 	v->type = shader->type;
 
-	if (fd_mesa_debug & FD_DBG_DISASM) {
-		DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", shader->type,
-			key.binning_pass, key.color_two_side, key.half_precision);
-		tgsi_dump(shader->tokens, 0);
-	}
-
 	ret = ir3_compile_shader_nir(shader->compiler, v);
 	if (ret) {
 		debug_error("compile failed!");
@@ -267,7 +261,7 @@ ir3_shader_destroy(struct ir3_shader *shader)
 		v = v->next;
 		delete_variant(t);
 	}
-	free((void *)shader->tokens);
+	ralloc_free(shader->nir);
 	free(shader);
 }
 
@@ -281,14 +275,24 @@ ir3_shader_create(struct pipe_context *pctx,
 	shader->id = ++shader->compiler->shader_count;
 	shader->pctx = pctx;
 	shader->type = type;
-	shader->tokens = tgsi_dup_tokens(cso->tokens);
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		DBG("dump tgsi: type=%d", shader->type);
+		tgsi_dump(cso->tokens, 0);
+	}
+	nir_shader *nir = ir3_tgsi_to_nir(cso->tokens);
+	/* do first pass optimization, ignoring the key: */
+	shader->nir = ir3_optimize_nir(shader, nir, NULL);
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		DBG("dump nir%d: type=%d", shader->id, shader->type);
+		nir_print_shader(shader->nir, stdout);
+	}
 	shader->stream_output = cso->stream_output;
 	if (fd_mesa_debug & FD_DBG_SHADERDB) {
 		/* if shader-db run, create a standard variant immediately
 		 * (as otherwise nothing will trigger the shader to be
 		 * actually compiled)
 		 */
-		static struct ir3_shader_key key = {};
+		static struct ir3_shader_key key = {0};
 		ir3_shader_variant(shader, key);
 	}
 	return shader;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index cf99a4c05ed..b3c28a41387 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -230,6 +230,8 @@ struct ir3_shader_variant {
 	struct ir3_shader *shader;
 };
 
+typedef struct nir_shader nir_shader;
+
 struct ir3_shader {
 	enum shader_t type;
 
@@ -240,7 +242,7 @@ struct ir3_shader {
 	struct ir3_compiler *compiler;
 
 	struct pipe_context *pctx;    /* TODO replace w/ pipe_screen */
-	const struct tgsi_token *tokens;
+	nir_shader *nir;
 	struct pipe_stream_output_info stream_output;
 
 	struct ir3_shader_variant *variants;
diff --git a/src/gallium/drivers/i915/i915_context.h b/src/gallium/drivers/i915/i915_context.h
index 1ed685188db..2adaee30fb9 100644
--- a/src/gallium/drivers/i915/i915_context.h
+++ b/src/gallium/drivers/i915/i915_context.h
@@ -195,7 +195,6 @@ struct i915_rasterizer_state {
 
    unsigned light_twoside : 1;
    unsigned st;
-   enum interp_mode color_interp;
 
    unsigned LIS4;
    unsigned LIS7;
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index a5b161882cd..e2a493bc1b5 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -254,6 +254,11 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
@@ -264,6 +269,8 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
    case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_SAMPLER_VIEW_TARGET:
       return 0;
diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index 6ba9646f7ab..b54a9fbf4f9 100644
--- a/src/gallium/drivers/i915/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -423,7 +423,7 @@ i915_prepare_vertex_sampling(struct i915_context *i915)
          for (j = view->u.tex.first_level; j <= tex->last_level; j++) {
             mip_offsets[j] = i915_texture_offset(i915_tex, j , 0 /* FIXME depth */);
             row_stride[j] = i915_tex->stride;
-            img_stride[j] = 0; /* FIXME */;
+            img_stride[j] = 0; /* FIXME */
          }
 
          draw_set_mapped_texture(i915->draw,
@@ -920,7 +920,6 @@ i915_create_rasterizer_state(struct pipe_context *pipe,
    struct i915_rasterizer_state *cso = CALLOC_STRUCT( i915_rasterizer_state );
 
    cso->templ = *rasterizer;
-   cso->color_interp = rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
    cso->light_twoside = rasterizer->light_twoside;
    cso->ds[0].u = _3DSTATE_DEPTH_OFFSET_SCALE;
    cso->ds[1].f = rasterizer->offset_scale;
diff --git a/src/gallium/drivers/i915/i915_state_derived.c b/src/gallium/drivers/i915/i915_state_derived.c
index 7ad88a1ce01..bd0f448f645 100644
--- a/src/gallium/drivers/i915/i915_state_derived.c
+++ b/src/gallium/drivers/i915/i915_state_derived.c
@@ -57,7 +57,6 @@ static uint find_mapping(const struct i915_fragment_shader* fs, int unit)
 static void calculate_vertex_layout(struct i915_context *i915)
 {
    const struct i915_fragment_shader *fs = i915->fs;
-   const enum interp_mode colorInterp = i915->rasterizer->color_interp;
    struct vertex_info vinfo;
    boolean texCoords[I915_TEX_UNITS], colors[2], fog, needW, face;
    uint i;
@@ -107,12 +106,12 @@ static void calculate_vertex_layout(struct i915_context *i915)
    /* pos */
    src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_POSITION, 0);
    if (needW) {
-      draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_4F, src);
       vinfo.hwfmt[0] |= S4_VFMT_XYZW;
       vinfo.attrib[0].emit = EMIT_4F;
    }
    else {
-      draw_emit_vertex_attr(&vinfo, EMIT_3F, INTERP_LINEAR, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_3F, src);
       vinfo.hwfmt[0] |= S4_VFMT_XYZ;
       vinfo.attrib[0].emit = EMIT_3F;
    }
@@ -123,21 +122,21 @@ static void calculate_vertex_layout(struct i915_context *i915)
    /* primary color */
    if (colors[0]) {
       src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_COLOR, 0);
-      draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, colorInterp, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, src);
       vinfo.hwfmt[0] |= S4_VFMT_COLOR;
    }
 
    /* secondary color */
    if (colors[1]) {
       src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_COLOR, 1);
-      draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, colorInterp, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, src);
       vinfo.hwfmt[0] |= S4_VFMT_SPEC_FOG;
    }
 
    /* fog coord, not fog blend factor */
    if (fog) {
       src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_FOG, 0);
-      draw_emit_vertex_attr(&vinfo, EMIT_1F, INTERP_PERSPECTIVE, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_1F, src);
       vinfo.hwfmt[0] |= S4_VFMT_FOG_PARAM;
    }
 
@@ -147,7 +146,7 @@ static void calculate_vertex_layout(struct i915_context *i915)
       if (texCoords[i]) {
          hwtc = TEXCOORDFMT_4D;
          src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_GENERIC, fs->generic_mapping[i]);
-         draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+         draw_emit_vertex_attr(&vinfo, EMIT_4F, src);
       }
       else {
          hwtc = TEXCOORDFMT_NOT_PRESENT;
@@ -164,7 +163,7 @@ static void calculate_vertex_layout(struct i915_context *i915)
        * module by adding an extra shader output.
        */
       src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_FACE, 0);
-      draw_emit_vertex_attr(&vinfo, EMIT_1F, INTERP_CONSTANT, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_1F, src);
       vinfo.hwfmt[1] &= ~(TEXCOORDFMT_NOT_PRESENT << (slot * 4));
       vinfo.hwfmt[1] |= TEXCOORDFMT_1D << (slot * 4);
    }
@@ -185,7 +184,7 @@ static void calculate_vertex_layout(struct i915_context *i915)
 struct i915_tracked_state i915_update_vertex_layout = {
    "vertex_layout",
    calculate_vertex_layout,
-   I915_NEW_RASTERIZER | I915_NEW_FS | I915_NEW_VS
+   I915_NEW_FS | I915_NEW_VS
 };
 
 
diff --git a/src/gallium/drivers/ilo/core/ilo_builder.c b/src/gallium/drivers/ilo/core/ilo_builder.c
index 9d5195129b7..079872f4306 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder.c
+++ b/src/gallium/drivers/ilo/core/ilo_builder.c
@@ -333,7 +333,7 @@ ilo_builder_init(struct ilo_builder *builder,
                  const struct ilo_dev *dev,
                  struct intel_winsys *winsys)
 {
-   int i;
+   unsigned i;
 
    assert(ilo_is_zeroed(builder, sizeof(*builder)));
 
@@ -366,7 +366,7 @@ ilo_builder_init(struct ilo_builder *builder,
 void
 ilo_builder_reset(struct ilo_builder *builder)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < ILO_BUILDER_WRITER_COUNT; i++)
       ilo_builder_writer_reset(builder, i);
@@ -382,7 +382,7 @@ ilo_builder_reset(struct ilo_builder *builder)
 bool
 ilo_builder_begin(struct ilo_builder *builder)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < ILO_BUILDER_WRITER_COUNT; i++) {
       if (!ilo_builder_writer_alloc_and_map(builder, i)) {
@@ -407,7 +407,7 @@ struct intel_bo *
 ilo_builder_end(struct ilo_builder *builder, unsigned *used)
 {
    struct ilo_builder_writer *bat;
-   int i;
+   unsigned i;
 
    ilo_builder_batch_patch_sba(builder);
 
diff --git a/src/gallium/drivers/ilo/ilo_context.c b/src/gallium/drivers/ilo/ilo_context.c
index 2a00cf1c93c..6bcd0bcb8f5 100644
--- a/src/gallium/drivers/ilo/ilo_context.c
+++ b/src/gallium/drivers/ilo/ilo_context.c
@@ -189,8 +189,9 @@ ilo_context_create(struct pipe_screen *screen, void *priv, unsigned flags)
     * These must be called last as u_upload/u_blitter are clients of the pipe
     * context.
     */
-   ilo->uploader = u_upload_create(&ilo->base, 1024 * 1024, 16,
-         PIPE_BIND_CONSTANT_BUFFER | PIPE_BIND_INDEX_BUFFER);
+   ilo->uploader = u_upload_create(&ilo->base, 1024 * 1024,
+         PIPE_BIND_CONSTANT_BUFFER | PIPE_BIND_INDEX_BUFFER,
+                                   PIPE_USAGE_STREAM);
    if (!ilo->uploader) {
       ilo_context_destroy(&ilo->base);
       return NULL;
diff --git a/src/gallium/drivers/ilo/ilo_gpgpu.c b/src/gallium/drivers/ilo/ilo_gpgpu.c
index 9a2ca007f80..b7415901a88 100644
--- a/src/gallium/drivers/ilo/ilo_gpgpu.c
+++ b/src/gallium/drivers/ilo/ilo_gpgpu.c
@@ -92,7 +92,7 @@ ilo_launch_grid(struct pipe_context *pipe,
    input_buf.buffer_size =
       ilo_shader_get_kernel_param(cs, ILO_KERNEL_CS_INPUT_SIZE);
    if (input_buf.buffer_size) {
-      u_upload_data(ilo->uploader, 0, input_buf.buffer_size, input,
+      u_upload_data(ilo->uploader, 0, input_buf.buffer_size, 16, input,
             &input_buf.buffer_offset, &input_buf.buffer);
    }
 
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index cfa2fb41152..d5a82ce80ae 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -463,6 +463,8 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_MAX_VERTEX_STREAMS:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
    case PIPE_CAP_SAMPLER_VIEW_TARGET:
@@ -476,6 +478,11 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/ilo/ilo_state.c b/src/gallium/drivers/ilo/ilo_state.c
index d89765a9d23..8dc2d38e039 100644
--- a/src/gallium/drivers/ilo/ilo_state.c
+++ b/src/gallium/drivers/ilo/ilo_state.c
@@ -376,7 +376,7 @@ finalize_cbuf_state(struct ilo_context *ilo,
       if (cbuf->cso[i].resource)
          continue;
 
-      u_upload_data(ilo->uploader, 0, cbuf->cso[i].info.size,
+      u_upload_data(ilo->uploader, 0, cbuf->cso[i].info.size, 16,
             cbuf->cso[i].user_buffer, &offset, &cbuf->cso[i].resource);
 
       cbuf->cso[i].info.vma = ilo_resource_get_vma(cbuf->cso[i].resource);
@@ -426,12 +426,12 @@ finalize_index_buffer(struct ilo_context *ilo)
       unsigned hw_offset;
 
       if (vec->ib.state.user_buffer) {
-         u_upload_data(ilo->uploader, 0, size,
+         u_upload_data(ilo->uploader, 0, size, 16,
                vec->ib.state.user_buffer + offset,
                &hw_offset, &vec->ib.hw_resource);
       } else {
          u_upload_buffer(ilo->uploader, 0,
-               vec->ib.state.offset + offset, size, vec->ib.state.buffer,
+               vec->ib.state.offset + offset, size, 16, vec->ib.state.buffer,
                &hw_offset, &vec->ib.hw_resource);
       }
 
diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_fs.c b/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
index 5250115a893..f46126e8427 100644
--- a/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
+++ b/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
@@ -266,7 +266,7 @@ fs_lower_opcode_tgsi_indirect_const(struct fs_compile_context *fcc,
    struct toy_inst *inst;
    struct toy_src desc, real_src[4];
    struct toy_dst tmp, real_dst[4];
-   int i;
+   unsigned i;
 
    tsrc_transpose(idx, real_src);
 
@@ -319,7 +319,7 @@ fs_lower_opcode_tgsi_const_pcb(struct fs_compile_context *fcc,
    const int grf_subreg = (idx.val32 & 1) * 16;
    struct toy_src src;
    struct toy_dst real_dst[4];
-   int i;
+   unsigned i;
 
    if (!fcc->variant->use_pcb || dim != 0 || idx.file != TOY_FILE_IMM ||
        grf >= fcc->first_attr_grf)
@@ -350,7 +350,7 @@ fs_lower_opcode_tgsi_const_gen6(struct fs_compile_context *fcc,
    struct toy_inst *inst;
    struct toy_src desc;
    struct toy_dst tmp, real_dst[4];
-   int i;
+   unsigned i;
 
    if (fs_lower_opcode_tgsi_const_pcb(fcc, dst, dim, idx))
       return;
@@ -396,7 +396,7 @@ fs_lower_opcode_tgsi_const_gen7(struct fs_compile_context *fcc,
    struct toy_src desc;
    struct toy_inst *inst;
    struct toy_dst tmp, real_dst[4];
-   int i;
+   unsigned i;
 
    if (fs_lower_opcode_tgsi_const_pcb(fcc, dst, dim, idx))
       return;
@@ -1168,7 +1168,7 @@ fs_lower_opcode_derivative(struct toy_compiler *tc, struct toy_inst *inst)
 {
    struct toy_dst dst[4];
    struct toy_src src[4];
-   int i;
+   unsigned i;
 
    tdst_transpose(inst->dst, dst);
    tsrc_transpose(inst->src[0], src);
@@ -1257,7 +1257,7 @@ fs_lower_opcode_kil(struct toy_compiler *tc, struct toy_inst *inst)
    }
    else {
       struct toy_src src[4];
-      int i;
+      unsigned i;
 
       tsrc_transpose(inst->src[0], src);
       /* mask out killed pixels */
@@ -1583,7 +1583,7 @@ fs_write_fb(struct fs_compile_context *fcc)
 static void
 fs_setup_shader_out(struct ilo_shader *sh, const struct toy_tgsi *tgsi)
 {
-   int i;
+   unsigned i;
 
    sh->out.count = tgsi->num_outputs;
    for (i = 0; i < tgsi->num_outputs; i++) {
@@ -1603,7 +1603,7 @@ static void
 fs_setup_shader_in(struct ilo_shader *sh, const struct toy_tgsi *tgsi,
                    bool flatshade)
 {
-   int i;
+   unsigned i;
 
    sh->in.count = tgsi->num_inputs;
    for (i = 0; i < tgsi->num_inputs; i++) {
diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
index a29baab10c1..0df0afc706b 100644
--- a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
+++ b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
@@ -126,7 +126,7 @@ vs_lower_opcode_tgsi_const_gen6(struct vs_compile_context *vcc,
    tc_MOV(tc, block_offsets, idx);
 
    msg_type = GEN6_MSG_DP_OWORD_DUAL_BLOCK_READ;
-   msg_ctrl = GEN6_MSG_DP_OWORD_DUAL_BLOCK_SIZE_1;;
+   msg_ctrl = GEN6_MSG_DP_OWORD_DUAL_BLOCK_SIZE_1;
    msg_len = 2;
 
    desc = tsrc_imm_mdesc_data_port(tc, false, msg_len, 1, true, false,
@@ -522,7 +522,7 @@ vs_prepare_tgsi_sampling(struct vs_compile_context *vcc,
       if (num_coords >= 3) {
          struct toy_dst tmp, max;
          struct toy_src abs_coords[3];
-         int i;
+         unsigned i;
 
          tmp = tc_alloc_tmp(tc);
          max = tdst_writemask(tmp, TOY_WRITEMASK_W);
@@ -804,7 +804,7 @@ static int
 vs_collect_outputs(struct vs_compile_context *vcc, struct toy_src *outs)
 {
    const struct toy_tgsi *tgsi = &vcc->tgsi;
-   int i;
+   unsigned i;
 
    for (i = 0; i < vcc->shader->out.count; i++) {
       const int slot = vcc->output_map[i];
diff --git a/src/gallium/drivers/ilo/shader/toy_legalize_ra.c b/src/gallium/drivers/ilo/shader/toy_legalize_ra.c
index b725375fb67..1874faa6be3 100644
--- a/src/gallium/drivers/ilo/shader/toy_legalize_ra.c
+++ b/src/gallium/drivers/ilo/shader/toy_legalize_ra.c
@@ -70,7 +70,7 @@ struct linear_scan {
 static void
 linear_scan_free_regs(struct linear_scan *ls, int reg, int count)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < count; i++)
       ls->free_regs[ls->num_free_regs++] = reg + count - 1 - i;
@@ -221,7 +221,7 @@ linear_scan_spill(struct linear_scan *ls,
 static void
 linear_scan_spill_range(struct linear_scan *ls, int first, int count)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < count; i++) {
       struct linear_scan_live_interval *interval = &ls->intervals[first + i];
diff --git a/src/gallium/drivers/ilo/shader/toy_tgsi.c b/src/gallium/drivers/ilo/shader/toy_tgsi.c
index d38585f1475..9a7140b9a9b 100644
--- a/src/gallium/drivers/ilo/shader/toy_tgsi.c
+++ b/src/gallium/drivers/ilo/shader/toy_tgsi.c
@@ -1593,7 +1593,7 @@ ra_get_type(struct toy_tgsi *tgsi, const struct tgsi_full_instruction *tgsi_inst
       tgsi_inst->Src[operand].Register.File;
    switch (file) {
    case TGSI_FILE_SAMPLER:
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_IMAGE:
    case TGSI_FILE_SAMPLER_VIEW:
       type = TOY_TYPE_D;
       break;
@@ -1834,7 +1834,7 @@ ra_get_src_indirect(struct toy_tgsi *tgsi,
       src = tsrc_null();
       break;
    case TGSI_FILE_SAMPLER:
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_IMAGE:
    case TGSI_FILE_SAMPLER_VIEW:
       is_resource = true;
       /* fall through */
@@ -1918,7 +1918,7 @@ ra_get_src(struct toy_tgsi *tgsi,
       need_vrf = true;
       break;
    case TGSI_FILE_SAMPLER:
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_IMAGE:
    case TGSI_FILE_SAMPLER_VIEW:
       assert(!s->Register.Dimension);
       src = tsrc_imm_d(s->Register.Index);
@@ -2256,7 +2256,7 @@ parse_declaration(struct toy_tgsi *tgsi,
    case TGSI_FILE_SAMPLER:
    case TGSI_FILE_PREDICATE:
    case TGSI_FILE_ADDRESS:
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_IMAGE:
    case TGSI_FILE_SAMPLER_VIEW:
       /* nothing to do */
       break;
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
index 0a52642e395..9029d2a4180 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
@@ -63,8 +63,7 @@ enum lp_interp {
    LP_INTERP_LINEAR,
    LP_INTERP_PERSPECTIVE,
    LP_INTERP_POSITION,
-   LP_INTERP_FACING,
-   LP_INTERP_ZERO
+   LP_INTERP_FACING
 };
 
 struct lp_shader_input {
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index 9dcc102e758..62d99bbaac8 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -108,28 +108,22 @@ struct llvmpipe_context {
    struct vertex_info vertex_info;
    
    /** Which vertex shader output slot contains color */
-   uint8_t color_slot[2];
+   int8_t color_slot[2];
 
    /** Which vertex shader output slot contains bcolor */
-   uint8_t bcolor_slot[2];
+   int8_t bcolor_slot[2];
 
    /** Which vertex shader output slot contains point size */
-   uint8_t psize_slot;
+   int8_t psize_slot;
 
    /** Which vertex shader output slot contains viewport index */
-   uint8_t viewport_index_slot;
+   int8_t viewport_index_slot;
 
    /** Which geometry shader output slot contains layer */
-   uint8_t layer_slot;
+   int8_t layer_slot;
 
    /** A fake frontface output for unfilled primitives */
-   uint8_t face_slot;
-
-   /** Which output slot is used for the fake vp index info */
-   uint8_t fake_vpindex_slot;
-
-   /** Which output slot is used for the fake layer info */
-   uint8_t fake_layer_slot;
+   int8_t face_slot;
 
    /** Depth format and bias settings. */
    boolean floating_point_depth;
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index c19f9318006..db45cbbb057 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -115,7 +115,7 @@ struct lp_rast_plane {
    int32_t dcdy;
 
    /* one-pixel sized trivial reject offsets for each plane */
-   int64_t eo;
+   uint32_t eo;
 };
 
 /**
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index c9b9221d87c..232c8599e42 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -133,36 +133,8 @@ lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
    lp_rast_triangle_4(task, arg2);
 }
 
-#if !defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_SSE)
 
-void
-lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
-                         const union lp_rast_cmd_arg arg)
-{
-   union lp_rast_cmd_arg arg2;
-   arg2.triangle.tri = arg.triangle.tri;
-   arg2.triangle.plane_mask = (1<<3)-1;
-   lp_rast_triangle_32_3(task, arg2);
-}
-
-void
-lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
-                         const union lp_rast_cmd_arg arg)
-{
-   union lp_rast_cmd_arg arg2;
-   arg2.triangle.tri = arg.triangle.tri;
-   arg2.triangle.plane_mask = (1<<4)-1;
-   lp_rast_triangle_32_4(task, arg2);
-}
-
-void
-lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
-                      const union lp_rast_cmd_arg arg)
-{
-   lp_rast_triangle_32_3_16(task, arg);
-}
-
-#else
 #include <emmintrin.h>
 #include "util/u_sse.h"
 
@@ -265,12 +237,6 @@ sign_bits4(const __m128i *cstep, int cdiff)
 
 #define NR_PLANES 3
 
-
-
-
-
-
-
 void
 lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                       const union lp_rast_cmd_arg arg)
@@ -381,10 +347,6 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                                0xffff & ~out[i].mask);
 }
 
-
-
-
-
 void
 lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
@@ -471,6 +433,254 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
 }
 
 #undef NR_PLANES
+
+#else
+
+#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+
+#include <altivec.h>
+#include "util/u_pwr8.h"
+
+static inline void
+build_masks_32(int c,
+               int cdiff,
+               int dcdx,
+               int dcdy,
+               unsigned *outmask,
+               unsigned *partmask)
+{
+   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
+   __m128i xdcdy = (__m128i) vec_splats(dcdy);
+
+   /* Get values across the quad
+    */
+   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
+   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
+   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);
+
+   {
+      __m128i cstep01, cstep23, result;
+
+      cstep01 = vec_packs_epi32(cstep0, cstep1);
+      cstep23 = vec_packs_epi32(cstep2, cstep3);
+      result = vec_packs_epi16(cstep01, cstep23);
+
+      *outmask |= vec_movemask_epi8(result);
+   }
+
+
+   {
+      __m128i cio4 = (__m128i) vec_splats(cdiff);
+      __m128i cstep01, cstep23, result;
+
+      cstep0 = vec_add_epi32(cstep0, cio4);
+      cstep1 = vec_add_epi32(cstep1, cio4);
+      cstep2 = vec_add_epi32(cstep2, cio4);
+      cstep3 = vec_add_epi32(cstep3, cio4);
+
+      cstep01 = vec_packs_epi32(cstep0, cstep1);
+      cstep23 = vec_packs_epi32(cstep2, cstep3);
+      result = vec_packs_epi16(cstep01, cstep23);
+
+      *partmask |= vec_movemask_epi8(result);
+   }
+}
+
+static inline unsigned
+build_mask_linear_32(int c, int dcdx, int dcdy)
+{
+   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
+   __m128i xdcdy = (__m128i) vec_splats(dcdy);
+
+   /* Get values across the quad
+    */
+   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
+   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
+   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);
+
+   /* pack pairs of results into epi16
+    */
+   __m128i cstep01 = vec_packs_epi32(cstep0, cstep1);
+   __m128i cstep23 = vec_packs_epi32(cstep2, cstep3);
+
+   /* pack into epi8, preserving sign bits
+    */
+   __m128i result = vec_packs_epi16(cstep01, cstep23);
+
+   /* extract sign bits to create mask
+    */
+   return vec_movemask_epi8(result);
+}
+
+static inline __m128i
+lp_plane_to_m128i(const struct lp_rast_plane *plane)
+{
+   return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
+                         (int32_t)plane->dcdy, (int32_t)plane->eo);
+}
+
+#define NR_PLANES 3
+
+void
+lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = GET_PLANES(tri);
+   int x = (arg.triangle.plane_mask & 0xff) + task->x;
+   int y = (arg.triangle.plane_mask >> 8) + task->y;
+   unsigned i, j;
+
+   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
+   unsigned nr = 0;
+
+   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
+   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
+   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
+   __m128i zero = vec_splats((unsigned char) 0);
+
+   __m128i c;
+   __m128i dcdx;
+   __m128i dcdy;
+   __m128i rej4;
+
+   __m128i dcdx2;
+   __m128i dcdx3;
+
+   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
+   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
+   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
+   __m128i unused;
+
+   __m128i vshuf_mask0;
+   __m128i vshuf_mask1;
+   __m128i vshuf_mask2;
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100);
+   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504);
+   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908);
+#else
+   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F);
+   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B);
+   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607);
+#endif
+
+   transpose4_epi32(&p0, &p1, &p2, &zero,
+                    &c, &dcdx, &dcdy, &rej4);
+
+   /* Adjust dcdx;
+    */
+   dcdx = vec_sub_epi32(zero, dcdx);
+
+   c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x)));
+   c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y)));
+   rej4 = vec_slli_epi32(rej4, 2);
+
+   /*
+    * Adjust so we can just check the sign bit (< 0 comparison),
+    * instead of having to do a less efficient <= 0 comparison
+    */
+   c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1));
+   rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1));
+
+   dcdx2 = vec_add_epi32(dcdx, dcdx);
+   dcdx3 = vec_add_epi32(dcdx2, dcdx);
+
+   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
+                    &span_0, &span_1, &span_2, &unused);
+
+   for (i = 0; i < 4; i++) {
+      __m128i cx = c;
+
+      for (j = 0; j < 4; j++) {
+         __m128i c4rej = vec_add_epi32(cx, rej4);
+         __m128i rej_masks = vec_srai_epi32(c4rej, 31);
+
+         /* if (is_zero(rej_masks)) */
+         if (vec_movemask_epi8(rej_masks) == 0) {
+            __m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0);
+            __m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1);
+            __m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2);
+
+            __m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0);
+
+            __m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0));
+            __m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1));
+            __m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2));
+
+            __m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1);
+            __m128i c_01 = vec_packs_epi32(c_0, c_1);
+
+            __m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0));
+            __m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1));
+            __m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2));
+
+            __m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2);
+
+            __m128i c0_3 = vec_add_epi32(c0_2, vec_perm(dcdy, dcdy, vshuf_mask0));
+            __m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1));
+            __m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2));
+
+            __m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3);
+            __m128i c_23 = vec_packs_epi32(c_2, c_3);
+            __m128i c_0123 = vec_packs_epi16(c_01, c_23);
+
+            unsigned mask = vec_movemask_epi8(c_0123);
+
+            out[nr].i = i;
+            out[nr].j = j;
+            out[nr].mask = mask;
+            if (mask != 0xffff)
+               nr++;
+         }
+         cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2));
+      }
+
+      c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2));
+   }
+
+   for (i = 0; i < nr; i++)
+      lp_rast_shade_quads_mask(task,
+                               &tri->inputs,
+                               x + 4 * out[i].j,
+                               y + 4 * out[i].i,
+                               0xffff & ~out[i].mask);
+}
+
+#undef NR_PLANES
+
+#else
+
+void
+lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
+                         const union lp_rast_cmd_arg arg)
+{
+   union lp_rast_cmd_arg arg2;
+   arg2.triangle.tri = arg.triangle.tri;
+   arg2.triangle.plane_mask = (1<<3)-1;
+   lp_rast_triangle_32_3(task, arg2);
+}
+
+#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */
+
+void
+lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
+                         const union lp_rast_cmd_arg arg)
+{
+   union lp_rast_cmd_arg arg2;
+   arg2.triangle.tri = arg.triangle.tri;
+   arg2.triangle.plane_mask = (1<<4)-1;
+   lp_rast_triangle_32_4(task, arg2);
+}
+
+void
+lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg)
+{
+   lp_rast_triangle_32_3_16(task, arg);
+}
+
 #endif
 
 
@@ -512,7 +722,7 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
 #define NR_PLANES 8
 #include "lp_rast_tri_tmp.h"
 
-#ifdef PIPE_ARCH_SSE
+#if defined(PIPE_ARCH_SSE) || (defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN))
 #undef BUILD_MASKS
 #undef BUILD_MASK_LINEAR
 #define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index 52f6e999683..e0aea94205e 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -82,7 +82,7 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
       const int64_t dcdx = -IMUL64(plane[j].dcdx, 4);
       const int64_t dcdy = IMUL64(plane[j].dcdy, 4);
       const int64_t cox = IMUL64(plane[j].eo, 4);
-      const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
+      const int64_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo;
       const int64_t cio = IMUL64(ei, 4) - 1;
 
       BUILD_MASKS(c[j] + cox,
@@ -182,7 +182,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
          const int64_t dcdx = -IMUL64(plane[j].dcdx, 16);
          const int64_t dcdy = IMUL64(plane[j].dcdy, 16);
          const int64_t cox = IMUL64(plane[j].eo, 16);
-         const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
+         const int64_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo;
          const int64_t cio = IMUL64(ei, 16) - 1;
 
          BUILD_MASKS(c[j] + cox,
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 899f28da7d3..e29b008c7e8 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -301,6 +301,13 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index ddbb88eb107..bd850519468 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -486,6 +486,11 @@ lp_setup_try_clear_zs(struct lp_setup_context *setup,
                                    depth,
                                    stencil);
 
+   /*
+    * XXX: should make a full mask here for things like D24X8,
+    * otherwise we'll do a read-modify-write clear later which
+    * should be unnecessary.
+    */
    zsmask = util_pack64_mask_z_stencil(setup->fb.zsbuf->format,
                                        zmask32,
                                        smask8);
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h
index 4451284c303..80acd74bddd 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h
@@ -105,10 +105,10 @@ struct lp_setup_context
    float pixel_offset;
    float line_width;
    float point_size;
-   uint8_t psize_slot;
-   uint8_t viewport_index_slot;
-   uint8_t layer_slot;
-   uint8_t face_slot;
+   int8_t psize_slot;
+   int8_t viewport_index_slot;
+   int8_t layer_slot;
+   int8_t face_slot;
 
    struct pipe_framebuffer_state fb;
    struct u_rect framebuffer;
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index fac1cd61d77..a0de599c9c6 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -644,19 +644,25 @@ try_setup_line( struct lp_setup_context *setup,
    line->inputs.layer = layer;
    line->inputs.viewport_index = viewport_index;
 
+   /*
+    * XXX: this code is mostly identical to the one in lp_setup_tri, except it
+    * uses 4 planes instead of 3. Could share the code (including the sse
+    * assembly, in fact we'd get the 4th plane for free).
+    * The only difference apart from storing the 4th plane would be some
+    * different shuffle for calculating dcdx/dcdy.
+    */
    for (i = 0; i < 4; i++) {
 
-      /* half-edge constants, will be interated over the whole render
+      /* half-edge constants, will be iterated over the whole render
        * target.
        */
       plane[i].c = IMUL64(plane[i].dcdx, x[i]) - IMUL64(plane[i].dcdy, y[i]);
 
-      
-      /* correct for top-left vs. bottom-left fill convention.  
-       */         
+      /* correct for top-left vs. bottom-left fill convention.
+       */
       if (plane[i].dcdx < 0) {
          /* both fill conventions want this - adjust for left edges */
-         plane[i].c++;            
+         plane[i].c++;
       }
       else if (plane[i].dcdx == 0) {
          if (setup->pixel_offset == 0) {
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index b1671dd0ae2..358da442ea7 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -46,6 +46,9 @@
 
 #if defined(PIPE_ARCH_SSE)
 #include <emmintrin.h>
+#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+#include <altivec.h>
+#include "util/u_pwr8.h"
 #endif
 
 static inline int
@@ -387,25 +390,21 @@ do_triangle_ccw(struct lp_setup_context *setup,
    plane = GET_PLANES(tri);
 
 #if defined(PIPE_ARCH_SSE)
-   if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
-       setup->fb.height <= MAX_FIXED_LENGTH32 &&
-       (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
-       (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) {
+   if (1) {
       __m128i vertx, verty;
       __m128i shufx, shufy;
-      __m128i dcdx, dcdy, c;
-      __m128i unused;
+      __m128i dcdx, dcdy;
+      __m128i cdx02, cdx13, cdy02, cdy13, c02, c13;
+      __m128i c01, c23, unused;
       __m128i dcdx_neg_mask;
       __m128i dcdy_neg_mask;
       __m128i dcdx_zero_mask;
-      __m128i top_left_flag;
-      __m128i c_inc_mask, c_inc;
+      __m128i top_left_flag, c_dec;
       __m128i eo, p0, p1, p2;
       __m128i zero = _mm_setzero_si128();
-      PIPE_ALIGN_VAR(16) int32_t temp_vec[4];
 
-      vertx = _mm_loadu_si128((__m128i *)position->x); /* vertex x coords */
-      verty = _mm_loadu_si128((__m128i *)position->y); /* vertex y coords */
+      vertx = _mm_load_si128((__m128i *)position->x); /* vertex x coords */
+      verty = _mm_load_si128((__m128i *)position->y); /* vertex y coords */
 
       shufx = _mm_shuffle_epi32(vertx, _MM_SHUFFLE(3,0,2,1));
       shufy = _mm_shuffle_epi32(verty, _MM_SHUFFLE(3,0,2,1));
@@ -419,42 +418,161 @@ do_triangle_ccw(struct lp_setup_context *setup,
 
       top_left_flag = _mm_set1_epi32((setup->bottom_edge_rule == 0) ? ~0 : 0);
 
-      c_inc_mask = _mm_or_si128(dcdx_neg_mask,
-                                _mm_and_si128(dcdx_zero_mask,
-                                              _mm_xor_si128(dcdy_neg_mask,
-                                                            top_left_flag)));
-
-      c_inc = _mm_srli_epi32(c_inc_mask, 31);
-
-      c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx),
-                        mm_mullo_epi32(dcdy, verty));
+      c_dec = _mm_or_si128(dcdx_neg_mask,
+                           _mm_and_si128(dcdx_zero_mask,
+                                         _mm_xor_si128(dcdy_neg_mask,
+                                                       top_left_flag)));
 
-      c = _mm_add_epi32(c, c_inc);
+      /*
+       * 64 bit arithmetic.
+       * Note we need _signed_ mul (_mm_mul_epi32) which we emulate.
+       */
+      cdx02 = mm_mullohi_epi32(dcdx, vertx, &cdx13);
+      cdy02 = mm_mullohi_epi32(dcdy, verty, &cdy13);
+      c02 = _mm_sub_epi64(cdx02, cdy02);
+      c13 = _mm_sub_epi64(cdx13, cdy13);
+      c02 = _mm_sub_epi64(c02, _mm_shuffle_epi32(c_dec,
+                                                 _MM_SHUFFLE(2,2,0,0)));
+      c13 = _mm_sub_epi64(c13, _mm_shuffle_epi32(c_dec,
+                                                 _MM_SHUFFLE(3,3,1,1)));
+
+      /*
+       * Useful for very small fbs/tris (or fewer subpixel bits) only:
+       * c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx),
+       *                   mm_mullo_epi32(dcdy, verty));
+       *
+       * c = _mm_sub_epi32(c, c_dec);
+       */
 
       /* Scale up to match c:
        */
       dcdx = _mm_slli_epi32(dcdx, FIXED_ORDER);
       dcdy = _mm_slli_epi32(dcdy, FIXED_ORDER);
 
-      /* Calculate trivial reject values:
+      /*
+       * Calculate trivial reject values:
+       * Note eo cannot overflow even if dcdx/dcdy would already have
+       * 31 bits (which they shouldn't have). This is because eo
+       * is never negative (albeit if we rely on that need to be careful...)
        */
       eo = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
                          _mm_and_si128(dcdx_neg_mask, dcdx));
 
       /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
 
+      /*
+       * Pointless transpose which gets undone immediately in
+       * rasterization.
+       * It is actually difficult to do away with it - would essentially
+       * need GET_PLANES_DX, GET_PLANES_DY etc., but the calculations
+       * for this then would need to depend on the number of planes.
+       * The transpose is quite special here due to c being 64bit...
+       * The store has to be unaligned (unless we'd make the plane size
+       * a multiple of 128), and of course storing eo separately...
+       */
+      c01 = _mm_unpacklo_epi64(c02, c13);
+      c23 = _mm_unpackhi_epi64(c02, c13);
+      transpose2_64_2_32(&c01, &c23, &dcdx, &dcdy,
+                         &p0, &p1, &p2, &unused);
+      _mm_storeu_si128((__m128i *)&plane[0], p0);
+      plane[0].eo = (uint32_t)_mm_cvtsi128_si32(eo);
+      _mm_storeu_si128((__m128i *)&plane[1], p1);
+      eo = _mm_shuffle_epi32(eo, _MM_SHUFFLE(3,2,0,1));
+      plane[1].eo = (uint32_t)_mm_cvtsi128_si32(eo);
+      _mm_storeu_si128((__m128i *)&plane[2], p2);
+      eo = _mm_shuffle_epi32(eo, _MM_SHUFFLE(0,0,0,2));
+      plane[2].eo = (uint32_t)_mm_cvtsi128_si32(eo);
+   } else
+#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+   /*
+    * XXX this code is effectively disabled for all practical purposes,
+    * as the allowed fb size is tiny if FIXED_ORDER is 8.
+    */
+   if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
+       setup->fb.height <= MAX_FIXED_LENGTH32 &&
+       (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
+       (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) {
+      unsigned int bottom_edge;
+      __m128i vertx, verty;
+      __m128i shufx, shufy;
+      __m128i dcdx, dcdy, c;
+      __m128i unused;
+      __m128i dcdx_neg_mask;
+      __m128i dcdy_neg_mask;
+      __m128i dcdx_zero_mask;
+      __m128i top_left_flag;
+      __m128i c_inc_mask, c_inc;
+      __m128i eo, p0, p1, p2;
+      __m128i_union vshuf_mask;
+      __m128i zero = vec_splats((unsigned char) 0);
+      PIPE_ALIGN_VAR(16) int32_t temp_vec[4];
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      vshuf_mask.i[0] = 0x07060504;
+      vshuf_mask.i[1] = 0x0B0A0908;
+      vshuf_mask.i[2] = 0x03020100;
+      vshuf_mask.i[3] = 0x0F0E0D0C;
+#else
+      vshuf_mask.i[0] = 0x00010203;
+      vshuf_mask.i[1] = 0x0C0D0E0F;
+      vshuf_mask.i[2] = 0x04050607;
+      vshuf_mask.i[3] = 0x08090A0B;
+#endif
+
+      /* vertex x coords */
+      vertx = vec_load_si128((const uint32_t *) position->x);
+      /* vertex y coords */
+      verty = vec_load_si128((const uint32_t *) position->y);
+
+      shufx = vec_perm (vertx, vertx, vshuf_mask.m128i);
+      shufy = vec_perm (verty, verty, vshuf_mask.m128i);
+
+      dcdx = vec_sub_epi32(verty, shufy);
+      dcdy = vec_sub_epi32(vertx, shufx);
+
+      dcdx_neg_mask = vec_srai_epi32(dcdx, 31);
+      dcdx_zero_mask = vec_cmpeq_epi32(dcdx, zero);
+      dcdy_neg_mask = vec_srai_epi32(dcdy, 31);
+
+      bottom_edge = (setup->bottom_edge_rule == 0) ? ~0 : 0;
+      top_left_flag = (__m128i) vec_splats(bottom_edge);
+
+      c_inc_mask = vec_or(dcdx_neg_mask,
+                                vec_and(dcdx_zero_mask,
+                                              vec_xor(dcdy_neg_mask,
+                                                            top_left_flag)));
+
+      c_inc = vec_srli_epi32(c_inc_mask, 31);
+
+      c = vec_sub_epi32(vec_mullo_epi32(dcdx, vertx),
+                        vec_mullo_epi32(dcdy, verty));
+
+      c = vec_add_epi32(c, c_inc);
+
+      /* Scale up to match c:
+       */
+      dcdx = vec_slli_epi32(dcdx, FIXED_ORDER);
+      dcdy = vec_slli_epi32(dcdy, FIXED_ORDER);
+
+      /* Calculate trivial reject values:
+       */
+      eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy),
+                         vec_and(dcdx_neg_mask, dcdx));
+
+      /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
+
       /* Pointless transpose which gets undone immediately in
        * rasterization:
        */
       transpose4_epi32(&c, &dcdx, &dcdy, &eo,
                        &p0, &p1, &p2, &unused);
 
-#define STORE_PLANE(plane, vec) do {                 \
-         _mm_store_si128((__m128i *)&temp_vec, vec); \
-         plane.c    = (int64_t)temp_vec[0];          \
-         plane.dcdx = temp_vec[1];                   \
-         plane.dcdy = temp_vec[2];                   \
-         plane.eo   = temp_vec[3];                   \
+#define STORE_PLANE(plane, vec) do {                  \
+         vec_store_si128((uint32_t *)&temp_vec, vec); \
+         plane.c    = (int64_t)temp_vec[0];           \
+         plane.dcdx = temp_vec[1];                    \
+         plane.dcdy = temp_vec[2];                    \
+         plane.eo   = temp_vec[3];                    \
       } while(0)
 
       STORE_PLANE(plane[0], p0);
@@ -473,17 +591,17 @@ do_triangle_ccw(struct lp_setup_context *setup,
       plane[2].dcdx = position->dy20;
   
       for (i = 0; i < 3; i++) {
-         /* half-edge constants, will be interated over the whole render
+         /* half-edge constants, will be iterated over the whole render
           * target.
           */
          plane[i].c = IMUL64(plane[i].dcdx, position->x[i]) -
-               IMUL64(plane[i].dcdy, position->y[i]);
+                      IMUL64(plane[i].dcdy, position->y[i]);
 
          /* correct for top-left vs. bottom-left fill convention.
-          */         
+          */
          if (plane[i].dcdx < 0) {
             /* both fill conventions want this - adjust for left edges */
-            plane[i].c++;            
+            plane[i].c++;
          }
          else if (plane[i].dcdx == 0) {
             if (setup->bottom_edge_rule == 0){
@@ -517,19 +635,19 @@ do_triangle_ccw(struct lp_setup_context *setup,
    }
 
    if (0) {
-      debug_printf("p0: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
+      debug_printf("p0: %"PRIx64"/%08x/%08x/%08x\n",
                    plane[0].c,
                    plane[0].dcdx,
                    plane[0].dcdy,
                    plane[0].eo);
-      
-      debug_printf("p1: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
+
+      debug_printf("p1: %"PRIx64"/%08x/%08x/%08x\n",
                    plane[1].c,
                    plane[1].dcdx,
                    plane[1].dcdy,
                    plane[1].eo);
-      
-      debug_printf("p2: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
+
+      debug_printf("p2: %"PRIx64"/%08x/%08x/%08x\n",
                    plane[2].c,
                    plane[2].dcdx,
                    plane[2].dcdy,
@@ -590,7 +708,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
 static inline uint32_t 
 floor_pot(uint32_t n)
 {
-#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
+#if defined(PIPE_CC_GCC) && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64))
    if (n == 0)
       return 0;
 
@@ -738,9 +856,9 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
 
          ei[i] = (plane[i].dcdy - 
                   plane[i].dcdx - 
-                  plane[i].eo) << TILE_ORDER;
+                  (int64_t)plane[i].eo) << TILE_ORDER;
 
-         eo[i] = plane[i].eo << TILE_ORDER;
+         eo[i] = (int64_t)plane[i].eo << TILE_ORDER;
          xstep[i] = -(((int64_t)plane[i].dcdx) << TILE_ORDER);
          ystep[i] = ((int64_t)plane[i].dcdy) << TILE_ORDER;
       }
@@ -932,12 +1050,12 @@ rotate_fixed_position_12( struct fixed_position* position )
 /**
  * Draw triangle if it's CW, cull otherwise.
  */
-static void triangle_cw( struct lp_setup_context *setup,
-			 const float (*v0)[4],
-			 const float (*v1)[4],
-			 const float (*v2)[4] )
+static void triangle_cw(struct lp_setup_context *setup,
+                        const float (*v0)[4],
+                        const float (*v1)[4],
+                        const float (*v2)[4])
 {
-   struct fixed_position position;
+   PIPE_ALIGN_VAR(16) struct fixed_position position;
 
    calc_fixed_position(setup, &position, v0, v1, v2);
 
@@ -953,12 +1071,12 @@ static void triangle_cw( struct lp_setup_context *setup,
 }
 
 
-static void triangle_ccw( struct lp_setup_context *setup,
-                          const float (*v0)[4],
-                          const float (*v1)[4],
-                          const float (*v2)[4])
+static void triangle_ccw(struct lp_setup_context *setup,
+                         const float (*v0)[4],
+                         const float (*v1)[4],
+                         const float (*v2)[4])
 {
-   struct fixed_position position;
+   PIPE_ALIGN_VAR(16) struct fixed_position position;
 
    calc_fixed_position(setup, &position, v0, v1, v2);
 
@@ -969,12 +1087,12 @@ static void triangle_ccw( struct lp_setup_context *setup,
 /**
  * Draw triangle whether it's CW or CCW.
  */
-static void triangle_both( struct lp_setup_context *setup,
-			   const float (*v0)[4],
-			   const float (*v1)[4],
-			   const float (*v2)[4] )
+static void triangle_both(struct lp_setup_context *setup,
+                          const float (*v0)[4],
+                          const float (*v1)[4],
+                          const float (*v2)[4])
 {
-   struct fixed_position position;
+   PIPE_ALIGN_VAR(16) struct fixed_position position;
    struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe;
 
    if (lp_context->active_statistics_queries &&
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index f5bcfb2b511..34961cbbac5 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -48,21 +48,26 @@
 static void
 compute_vertex_info(struct llvmpipe_context *llvmpipe)
 {
-   const struct lp_fragment_shader *lpfs = llvmpipe->fs;
+   const struct tgsi_shader_info *fsInfo = &llvmpipe->fs->info.base;
    struct vertex_info *vinfo = &llvmpipe->vertex_info;
    int vs_index;
    uint i;
 
    draw_prepare_shader_outputs(llvmpipe->draw);
 
-   llvmpipe->color_slot[0] = 0;
-   llvmpipe->color_slot[1] = 0;
-   llvmpipe->bcolor_slot[0] = 0;
-   llvmpipe->bcolor_slot[1] = 0;
-   llvmpipe->viewport_index_slot = 0;
-   llvmpipe->layer_slot = 0;
-   llvmpipe->face_slot = 0;
-   llvmpipe->psize_slot = 0;
+   /*
+    * Those can't actually be 0 (because pos is always at 0).
+    * But use ints anyway to avoid confusion (in vs outputs, they
+    * can very well be at pos 0).
+    */
+   llvmpipe->color_slot[0] = -1;
+   llvmpipe->color_slot[1] = -1;
+   llvmpipe->bcolor_slot[0] = -1;
+   llvmpipe->bcolor_slot[1] = -1;
+   llvmpipe->viewport_index_slot = -1;
+   llvmpipe->layer_slot = -1;
+   llvmpipe->face_slot = -1;
+   llvmpipe->psize_slot = -1;
 
    /*
     * Match FS inputs against VS outputs, emitting the necessary
@@ -73,60 +78,49 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
    vinfo->num_attribs = 0;
 
    vs_index = draw_find_shader_output(llvmpipe->draw,
-                                      TGSI_SEMANTIC_POSITION,
-                                      0);
+                                      TGSI_SEMANTIC_POSITION, 0);
 
-   draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
+   draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
 
-   for (i = 0; i < lpfs->info.base.num_inputs; i++) {
+   for (i = 0; i < fsInfo->num_inputs; i++) {
       /*
        * Search for each input in current vs output:
        */
-
       vs_index = draw_find_shader_output(llvmpipe->draw,
-                                         lpfs->info.base.input_semantic_name[i],
-                                         lpfs->info.base.input_semantic_index[i]);
+                                         fsInfo->input_semantic_name[i],
+                                         fsInfo->input_semantic_index[i]);
 
-      if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_COLOR &&
-          lpfs->info.base.input_semantic_index[i] < 2) {
-         int idx = lpfs->info.base.input_semantic_index[i];
-         llvmpipe->color_slot[idx] = vinfo->num_attribs;
+      if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR &&
+          fsInfo->input_semantic_index[i] < 2) {
+         int idx = fsInfo->input_semantic_index[i];
+         llvmpipe->color_slot[idx] = (int)vinfo->num_attribs;
       }
 
-      if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_FACE) {
-         llvmpipe->face_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
-      } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_PRIMID) {
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+      if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_FACE) {
+         llvmpipe->face_slot = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       /*
        * For vp index and layer, if the fs requires them but the vs doesn't
-       * provide them, store the slot - we'll later replace the data directly
-       * with zero (as required by ARB_fragment_layer_viewport). This is
-       * because draw itself just redirects them to whatever was at output 0.
-       * We'll also store the real vpindex/layer slot for setup use.
+       * provide them, draw (vbuf) will give us the required 0 (slot -1).
+       * (This means in this case we'll also use those slots in setup, which
+       * isn't necessary but they'll contain the correct (0) value.)
        */
-      } else if (lpfs->info.base.input_semantic_name[i] ==
+      } else if (fsInfo->input_semantic_name[i] ==
                  TGSI_SEMANTIC_VIEWPORT_INDEX) {
-         if (vs_index >= 0) {
-            llvmpipe->viewport_index_slot = vinfo->num_attribs;
-         }
-         else {
-            llvmpipe->fake_vpindex_slot = vinfo->num_attribs;
-         }
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
-      } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_LAYER) {
-         if (vs_index >= 0) {
-            llvmpipe->layer_slot = vinfo->num_attribs;
-         }
-         else {
-            llvmpipe->fake_layer_slot = vinfo->num_attribs;
-         }
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         llvmpipe->viewport_index_slot = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+      } else if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_LAYER) {
+         llvmpipe->layer_slot = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       } else {
          /*
-          * Emit the requested fs attribute for all but position.
+          * Note that we'd actually want to skip position (as we won't use
+          * the attribute in the fs) but can't. The reason is that we don't
+          * actually have a input/output map for setup (even though it looks
+          * like we do...). Could adjust for this though even without a map
+          * (in llvmpipe_create_fs_state()).
           */
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
 
@@ -137,8 +131,8 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
                                          TGSI_SEMANTIC_BCOLOR, i);
 
       if (vs_index >= 0) {
-         llvmpipe->bcolor_slot[i] = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
+         llvmpipe->bcolor_slot[i] = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
 
@@ -148,29 +142,29 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
                                       TGSI_SEMANTIC_PSIZE, 0);
 
    if (vs_index >= 0) {
-      llvmpipe->psize_slot = vinfo->num_attribs;
-      draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+      llvmpipe->psize_slot = (int)vinfo->num_attribs;
+      draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
    }
 
    /* Figure out if we need viewport index (if it wasn't already in fs input) */
-   if (llvmpipe->viewport_index_slot == 0) {
+   if (llvmpipe->viewport_index_slot < 0) {
       vs_index = draw_find_shader_output(llvmpipe->draw,
                                          TGSI_SEMANTIC_VIEWPORT_INDEX,
                                          0);
       if (vs_index >= 0) {
-         llvmpipe->viewport_index_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         llvmpipe->viewport_index_slot =(int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
 
    /* Figure out if we need layer (if it wasn't already in fs input) */
-   if (llvmpipe->layer_slot == 0) {
+   if (llvmpipe->layer_slot < 0) {
       vs_index = draw_find_shader_output(llvmpipe->draw,
                                          TGSI_SEMANTIC_LAYER,
                                          0);
       if (vs_index >= 0) {
-         llvmpipe->layer_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         llvmpipe->layer_slot = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
 
@@ -197,10 +191,9 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
       llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW;
    }
       
-   if (llvmpipe->dirty & (LP_NEW_RASTERIZER |
-                          LP_NEW_FS |
+   if (llvmpipe->dirty & (LP_NEW_FS |
                           LP_NEW_VS))
-      compute_vertex_info( llvmpipe );
+      compute_vertex_info(llvmpipe);
 
    if (llvmpipe->dirty & (LP_NEW_FS |
                           LP_NEW_FRAMEBUFFER |
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 079083e9601..83ff97659fb 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -2695,34 +2695,35 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
 
       switch (shader->info.base.input_interpolate[i]) {
       case TGSI_INTERPOLATE_CONSTANT:
-	 shader->inputs[i].interp = LP_INTERP_CONSTANT;
-	 break;
+         shader->inputs[i].interp = LP_INTERP_CONSTANT;
+         break;
       case TGSI_INTERPOLATE_LINEAR:
-	 shader->inputs[i].interp = LP_INTERP_LINEAR;
-	 break;
+         shader->inputs[i].interp = LP_INTERP_LINEAR;
+         break;
       case TGSI_INTERPOLATE_PERSPECTIVE:
-	 shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
-	 break;
+         shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
+         break;
       case TGSI_INTERPOLATE_COLOR:
-	 shader->inputs[i].interp = LP_INTERP_COLOR;
-	 break;
+         shader->inputs[i].interp = LP_INTERP_COLOR;
+         break;
       default:
-	 assert(0);
-	 break;
+         assert(0);
+         break;
       }
 
       switch (shader->info.base.input_semantic_name[i]) {
       case TGSI_SEMANTIC_FACE:
-	 shader->inputs[i].interp = LP_INTERP_FACING;
-	 break;
+         shader->inputs[i].interp = LP_INTERP_FACING;
+         break;
       case TGSI_SEMANTIC_POSITION:
-	 /* Position was already emitted above
-	  */
-	 shader->inputs[i].interp = LP_INTERP_POSITION;
-	 shader->inputs[i].src_index = 0;
-	 continue;
+         /* Position was already emitted above
+          */
+         shader->inputs[i].interp = LP_INTERP_POSITION;
+         shader->inputs[i].src_index = 0;
+         continue;
       }
 
+      /* XXX this is a completely pointless index map... */
       shader->inputs[i].src_index = i+1;
    }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c b/src/gallium/drivers/llvmpipe/lp_state_setup.c
index d7ba5c8ad8e..6a4fbbbf202 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c
@@ -372,9 +372,9 @@ load_attribute(struct gallivm_state *gallivm,
    /* Potentially modify it according to twoside, etc:
     */
    if (key->twoside) {
-      if (vert_attr == key->color_slot && key->bcolor_slot > 0)
+      if (vert_attr == key->color_slot && key->bcolor_slot >= 0)
          lp_twoside(gallivm, args, key, key->bcolor_slot, attribv);
-      else if (vert_attr == key->spec_slot && key->bspec_slot > 0)
+      else if (vert_attr == key->spec_slot && key->bspec_slot >= 0)
          lp_twoside(gallivm, args, key, key->bspec_slot, attribv);
    }
 }
@@ -602,13 +602,6 @@ emit_tri_coef( struct gallivm_state *gallivm,
           */
          break;
 
-      case LP_INTERP_ZERO:
-         /*
-          * The information we get from the output is bogus, replace it
-          * with zero.
-          */
-         emit_constant_coef4(gallivm, args, slot+1, args->bld.zero);
-         break;
       case LP_INTERP_FACING:
          emit_facing_coef(gallivm, args, slot+1);
          break;
@@ -879,13 +872,7 @@ lp_make_setup_variant_key(struct llvmpipe_context *lp,
    key->pad = 0;
    memcpy(key->inputs, fs->inputs, key->num_inputs * sizeof key->inputs[0]);
    for (i = 0; i < key->num_inputs; i++) {
-      if (key->inputs[i].interp == LP_INTERP_CONSTANT) {
-         if (key->inputs[i].src_index == lp->fake_vpindex_slot ||
-             key->inputs[i].src_index == lp->fake_layer_slot) {
-            key->inputs[i].interp = LP_INTERP_ZERO;
-         }
-      }
-      else if (key->inputs[i].interp == LP_INTERP_COLOR) {
+      if (key->inputs[i].interp == LP_INTERP_COLOR) {
          if (lp->rasterizer->flatshade)
             key->inputs[i].interp = LP_INTERP_CONSTANT;
          else
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.h b/src/gallium/drivers/llvmpipe/lp_state_setup.h
index 6cee6fe5eb5..9ad244482de 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.h
@@ -17,10 +17,10 @@ struct lp_setup_variant_list_item
 struct lp_setup_variant_key {
    unsigned size:16;
    unsigned num_inputs:8;
-   unsigned color_slot:8;
-   unsigned bcolor_slot:8;
-   unsigned spec_slot:8;
-   unsigned bspec_slot:8;
+   int color_slot:8;
+   int bcolor_slot:8;
+   int spec_slot:8;
+   int bspec_slot:8;
    unsigned flatshade_first:1;
    unsigned pixel_center_half:1;
    unsigned twoside:1;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index 7b19174f345..9139b83f05a 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -184,7 +184,7 @@ add_blend_test(struct gallivm_state *gallivm,
 
    LLVMBuildStore(builder, res, res_ptr);
 
-   LLVMBuildRetVoid(builder);;
+   LLVMBuildRetVoid(builder);
 
    gallivm_verify_function(gallivm, func);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index a30f35c8149..02a63193af5 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -140,7 +140,7 @@ add_conv_test(struct gallivm_state *gallivm,
       LLVMBuildStore(builder, dst[i], ptr);
    }
 
-   LLVMBuildRetVoid(builder);;
+   LLVMBuildRetVoid(builder);
 
    gallivm_verify_function(gallivm, func);
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index d09a0ab0610..d1fdd75495f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -390,6 +390,9 @@ enum SVSemantic
    SV_VERTEX_STRIDE,
    SV_INVOCATION_INFO,
    SV_THREAD_KILL,
+   SV_BASEVERTEX,
+   SV_BASEINSTANCE,
+   SV_DRAWID,
    SV_UNDEFINED,
    SV_LAST
 };
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index b49bf9d53bc..4504240ac5e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -124,6 +124,7 @@ struct nv50_ir_prog_info
    union {
       struct {
          uint32_t inputMask[4]; /* mask of attributes read (1 bit per scalar) */
+         bool usesDrawParameters;
       } vp;
       struct {
          uint8_t inputPatchSize;
@@ -160,8 +161,9 @@ struct nv50_ir_prog_info
       uint8_t clipDistances;     /* number of clip distance outputs */
       uint8_t cullDistances;     /* number of cull distance outputs */
       int8_t genUserClip;        /* request user clip planes for ClipVertex */
+      uint8_t auxCBSlot;         /* constant buffer index of UCP/draw data */
       uint16_t ucpBase;          /* base address for UCPs */
-      uint8_t ucpCBSlot;         /* constant buffer index of UCP data */
+      uint16_t drawInfoBase;     /* base address for draw parameters */
       uint8_t pointSize;         /* output index for PointSize */
       uint8_t instanceId;        /* system value index of InstanceID */
       uint8_t vertexId;          /* system value index of VertexID */
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index e9ddd366391..ec74e7ac811 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -740,6 +740,7 @@ CodeEmitterGM107::emitF2F()
    emitCC   (0x2f);
    emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg());
    emitFMZ  (0x2c, 1);
+   emitField(0x29, 1, insn->subOp);
    emitRND  (0x27, rnd, 0x2a);
    emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType)));
    emitField(0x08, 2, util_logbase2(typeSizeof(insn->dType)));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index 1d4f0d92f6b..0b28047e22b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -1030,7 +1030,10 @@ CodeEmitterNVC0::emitCVT(Instruction *i)
 
       // for 8/16 source types, the byte/word is in subOp. word 1 is
       // represented as 2.
-      code[1] |= i->subOp << 0x17;
+      if (!isFloatType(i->sType))
+         code[1] |= i->subOp << 0x17;
+      else
+         code[1] |= i->subOp << 0x18;
 
       if (sat)
          code[0] |= 0x20;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index b23386040a7..7b313f3c39c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -319,6 +319,10 @@ unsigned int Instruction::srcMask(unsigned int s) const
          x |= 2;
       return x;
    }
+   case TGSI_OPCODE_PK2H:
+      return 0x3;
+   case TGSI_OPCODE_UP2H:
+      return 0x1;
    default:
       break;
    }
@@ -348,7 +352,7 @@ static nv50_ir::DataFile translateFile(uint file)
    case TGSI_FILE_PREDICATE:       return nv50_ir::FILE_PREDICATE;
    case TGSI_FILE_IMMEDIATE:       return nv50_ir::FILE_IMMEDIATE;
    case TGSI_FILE_SYSTEM_VALUE:    return nv50_ir::FILE_SYSTEM_VALUE;
-   case TGSI_FILE_RESOURCE:        return nv50_ir::FILE_MEMORY_GLOBAL;
+   //case TGSI_FILE_RESOURCE:        return nv50_ir::FILE_MEMORY_GLOBAL;
    case TGSI_FILE_SAMPLER:
    case TGSI_FILE_NULL:
    default:
@@ -377,6 +381,9 @@ static nv50_ir::SVSemantic translateSysVal(uint sysval)
    case TGSI_SEMANTIC_TESSINNER:  return nv50_ir::SV_TESS_INNER;
    case TGSI_SEMANTIC_VERTICESIN: return nv50_ir::SV_VERTEX_COUNT;
    case TGSI_SEMANTIC_HELPER_INVOCATION: return nv50_ir::SV_THREAD_KILL;
+   case TGSI_SEMANTIC_BASEVERTEX: return nv50_ir::SV_BASEVERTEX;
+   case TGSI_SEMANTIC_BASEINSTANCE: return nv50_ir::SV_BASEINSTANCE;
+   case TGSI_SEMANTIC_DRAWID:     return nv50_ir::SV_DRAWID;
    default:
       assert(0);
       return nv50_ir::SV_CLOCK;
@@ -449,6 +456,7 @@ nv50_ir::DataType Instruction::inferSrcType() const
    case TGSI_OPCODE_ATOMUMAX:
    case TGSI_OPCODE_UBFE:
    case TGSI_OPCODE_UMSB:
+   case TGSI_OPCODE_UP2H:
       return nv50_ir::TYPE_U32;
    case TGSI_OPCODE_I2F:
    case TGSI_OPCODE_I2D:
@@ -513,10 +521,12 @@ nv50_ir::DataType Instruction::inferDstType() const
    case TGSI_OPCODE_DSGE:
    case TGSI_OPCODE_DSLT:
    case TGSI_OPCODE_DSNE:
+   case TGSI_OPCODE_PK2H:
       return nv50_ir::TYPE_U32;
    case TGSI_OPCODE_I2F:
    case TGSI_OPCODE_U2F:
    case TGSI_OPCODE_D2F:
+   case TGSI_OPCODE_UP2H:
       return nv50_ir::TYPE_F32;
    case TGSI_OPCODE_I2D:
    case TGSI_OPCODE_U2D:
@@ -861,7 +871,7 @@ bool Source::scanSource()
    clipVertexOutput = -1;
 
    textureViews.resize(scan.file_max[TGSI_FILE_SAMPLER_VIEW] + 1);
-   resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1);
+   //resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1);
 
    info->immd.bufSize = 0;
 
@@ -1128,6 +1138,11 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
       case TGSI_SEMANTIC_SAMPLEPOS:
          info->prop.fp.sampleInterp = 1;
          break;
+      case TGSI_SEMANTIC_BASEVERTEX:
+      case TGSI_SEMANTIC_BASEINSTANCE:
+      case TGSI_SEMANTIC_DRAWID:
+         info->prop.vp.usesDrawParameters = true;
+         break;
       default:
          break;
       }
@@ -1144,6 +1159,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
          }
       }
       break;
+/*
    case TGSI_FILE_RESOURCE:
       for (i = first; i <= last; ++i) {
          resources[i].target = decl->Resource.Resource;
@@ -1151,6 +1167,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
          resources[i].slot = i;
       }
       break;
+*/
    case TGSI_FILE_SAMPLER_VIEW:
       for (i = first; i <= last; ++i)
          textureViews[i].target = decl->SamplerView.Resource;
@@ -1216,11 +1233,13 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
          if (src.isIndirect(0))
             mainTempsInLMem = true;
       } else
+/*
       if (src.getFile() == TGSI_FILE_RESOURCE) {
          if (src.getIndex(0) == TGSI_RESOURCE_GLOBAL)
             info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ?
                0x1 : 0x2;
       } else
+*/
       if (src.getFile() == TGSI_FILE_OUTPUT) {
          if (src.isIndirect(0)) {
             // We don't know which one is accessed, just mark everything for
@@ -1271,9 +1290,11 @@ Instruction::getTexture(const tgsi::Source *code, int s) const
    unsigned int r;
 
    switch (getSrc(s).getFile()) {
+/*
    case TGSI_FILE_RESOURCE:
       r = getSrc(s).getIndex(0);
       return translateTexture(code->resources.at(r).target);
+*/
    case TGSI_FILE_SAMPLER_VIEW:
       r = getSrc(s).getIndex(0);
       return translateTexture(code->textureViews.at(r).target);
@@ -1639,8 +1660,6 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
          // don't load masked inputs, won't be assigned a slot
          if (!ptr && !(info->in[idx].mask & (1 << swz)))
             return loadImm(NULL, swz == TGSI_SWIZZLE_W ? 1.0f : 0.0f);
-         if (!ptr && info->in[idx].sn == TGSI_SEMANTIC_FACE)
-            return mkOp1v(OP_RDSV, TYPE_F32, getSSA(), mkSysVal(SV_FACE, 0));
          return interpolate(src, c, shiftAddress(ptr));
       } else
       if (prog->getType() == Program::TYPE_GEOMETRY) {
@@ -1681,7 +1700,7 @@ Converter::acquireDst(int d, int c)
    const int idx = dst.getIndex(0);
    const int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
 
-   if (dst.isMasked(c) || f == TGSI_FILE_RESOURCE)
+   if (dst.isMasked(c)/* || f == TGSI_FILE_RESOURCE*/)
       return NULL;
 
    if (dst.isIndirect(0) ||
@@ -2799,6 +2818,21 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
       FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
          mkCvt(OP_CVT, dstTy, dst0[c], srcTy, fetchSrc(0, c));
       break;
+   case TGSI_OPCODE_PK2H:
+      val0 = getScratch();
+      val1 = getScratch();
+      mkCvt(OP_CVT, TYPE_F16, val0, TYPE_F32, fetchSrc(0, 0));
+      mkCvt(OP_CVT, TYPE_F16, val1, TYPE_F32, fetchSrc(0, 1));
+      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+         mkOp3(OP_INSBF, TYPE_U32, dst0[c], val1, mkImm(0x1010), val0);
+      break;
+   case TGSI_OPCODE_UP2H:
+      src0 = fetchSrc(0, 0);
+      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+         geni = mkCvt(OP_CVT, TYPE_F32, dst0[c], TYPE_F16, src0);
+         geni->subOp = c & 1;
+      }
+      break;
    case TGSI_OPCODE_EMIT:
       /* export the saved viewport index */
       if (viewport != NULL) {
@@ -3252,7 +3286,7 @@ Converter::handleUserClipPlanes()
 
    for (c = 0; c < 4; ++c) {
       for (i = 0; i < info->io.genUserClip; ++i) {
-         Symbol *sym = mkSymbol(FILE_MEMORY_CONST, info->io.ucpCBSlot,
+         Symbol *sym = mkSymbol(FILE_MEMORY_CONST, info->io.auxCBSlot,
                                 TYPE_F32, info->io.ucpBase + i * 16 + c * 4);
          Value *ucp = mkLoadv(TYPE_F32, sym, NULL);
          if (c == 0)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index e67bf3eca84..6530078b938 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1576,6 +1576,17 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
       ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
       break;
+   case SV_BASEVERTEX:
+   case SV_BASEINSTANCE:
+   case SV_DRAWID:
+      ld = bld.mkLoad(TYPE_U32, i->getDef(0),
+                      bld.mkSymbol(FILE_MEMORY_CONST,
+                                   prog->driver->io.auxCBSlot,
+                                   TYPE_U32,
+                                   prog->driver->io.drawInfoBase +
+                                   4 * (sv - SV_BASEVERTEX)),
+                      NULL);
+      break;
    default:
       if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
          vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index c2842c2186f..f5c590eef10 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -676,23 +676,22 @@ ConstantFolding::expr(Instruction *i,
    switch (i->op) {
    case OP_MAD:
    case OP_FMA: {
-      i->op = OP_ADD;
+      ImmediateValue src0, src1 = *i->getSrc(0)->asImm();
 
-      /* Move the immediate to the second arg, otherwise the ADD operation
-       * won't be emittable
-       */
-      i->setSrc(1, i->getSrc(0));
+      // Move the immediate into position 1, where we know it might be
+      // emittable. However it might not be anyways, as there may be other
+      // restrictions, so move it into a separate LValue.
+      bld.setPosition(i, false);
+      i->op = OP_ADD;
+      i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
       i->setSrc(0, i->getSrc(2));
       i->src(0).mod = i->src(2).mod;
       i->setSrc(2, NULL);
 
-      ImmediateValue src0;
       if (i->src(0).getImmediate(src0))
-         expr(i, src0, *i->getSrc(1)->asImm());
-      if (i->saturate && !prog->getTarget()->isSatSupported(i)) {
-         bld.setPosition(i, false);
-         i->setSrc(1, bld.loadImm(NULL, res.data.u32));
-      }
+         expr(i, src0, src1);
+      else
+         opnd(i, src1, 1);
       break;
    }
    case OP_PFETCH:
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 19637ce33f5..014c652eede 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -295,6 +295,9 @@ TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const
    case SV_SAMPLE_INDEX:   return 0;
    case SV_SAMPLE_POS:     return 0;
    case SV_SAMPLE_MASK:    return 0;
+   case SV_BASEVERTEX:     return 0;
+   case SV_BASEINSTANCE:   return 0;
+   case SV_DRAWID:         return 0;
    default:
       return 0xffffffff;
    }
diff --git a/src/gallium/drivers/nouveau/nouveau_compiler.c b/src/gallium/drivers/nouveau/nouveau_compiler.c
index 670b0c8b135..cd44aa1e1d9 100644
--- a/src/gallium/drivers/nouveau/nouveau_compiler.c
+++ b/src/gallium/drivers/nouveau/nouveau_compiler.c
@@ -112,7 +112,7 @@ nouveau_codegen(int chipset, int type, struct tgsi_token tokens[],
    info.bin.sourceRep = NV50_PROGRAM_IR_TGSI;
    info.bin.source = tokens;
 
-   info.io.ucpCBSlot = 15;
+   info.io.auxCBSlot = 15;
    info.io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
 
    info.io.resInfoCBSlot = 15;
diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video.h b/src/gallium/drivers/nouveau/nouveau_vp3_video.h
index 58df5ee847f..809e971a678 100644
--- a/src/gallium/drivers/nouveau/nouveau_vp3_video.h
+++ b/src/gallium/drivers/nouveau/nouveau_vp3_video.h
@@ -114,6 +114,11 @@ struct nouveau_vp3_decoder {
    unsigned fence_seq, fw_sizes, last_frame_num, tmp_stride, ref_stride;
 
    unsigned bsp_idx, vp_idx, ppp_idx;
+
+   /* End of the bsp bo where new data should be appended between one begin/end
+    * frame.
+    */
+   char *bsp_ptr;
 };
 
 struct comm {
@@ -208,11 +213,15 @@ nouveau_vp3_load_firmware(struct nouveau_vp3_decoder *dec,
                           enum pipe_video_profile profile,
                           unsigned chipset);
 
+void
+nouveau_vp3_bsp_begin(struct nouveau_vp3_decoder *dec);
+
+void
+nouveau_vp3_bsp_next(struct nouveau_vp3_decoder *dec, unsigned num_buffers,
+                     const void *const *data, const unsigned *num_bytes);
+
 uint32_t
-nouveau_vp3_bsp(struct nouveau_vp3_decoder *dec,  union pipe_desc desc,
-                struct nouveau_vp3_video_buffer *target,
-                unsigned comm_seq, unsigned num_buffers,
-                const void *const *data, const unsigned *num_bytes);
+nouveau_vp3_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc);
 
 void
 nouveau_vp3_vp_caps(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c b/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c
index 692772e49d1..a3d07deeb18 100644
--- a/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c
@@ -230,20 +230,58 @@ nouveau_vp3_fill_picparm_h264_bsp(struct nouveau_vp3_decoder *dec,
    return caps | 3;
 }
 
+static inline struct strparm_bsp *strparm_bsp(struct nouveau_vp3_decoder *dec)
+{
+   unsigned comm_seq = dec->fence_seq;
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+   return (struct strparm_bsp *)(bsp_bo->map + 0x100);
+}
+
+void
+nouveau_vp3_bsp_begin(struct nouveau_vp3_decoder *dec)
+{
+   struct strparm_bsp *str_bsp = strparm_bsp(dec);
+
+   dec->bsp_ptr = (void *)str_bsp;
+   memset(str_bsp, 0, 0x80);
+   dec->bsp_ptr += 0x100;
+   /* Reserved for picparm_vp */
+   dec->bsp_ptr += 0x300;
+   /* Reserved for comm */
+#if !NOUVEAU_VP3_DEBUG_FENCE
+   memset(dec->bsp_ptr, 0, 0x200);
+#endif
+   dec->bsp_ptr += 0x200;
+}
+
+void
+nouveau_vp3_bsp_next(struct nouveau_vp3_decoder *dec, unsigned num_buffers,
+                     const void *const *data, const unsigned *num_bytes)
+{
+#ifndef NDEBUG
+   unsigned comm_seq = dec->fence_seq;
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+#endif
+   struct strparm_bsp *str_bsp = strparm_bsp(dec);
+   int i;
+
+   for (i = 0; i < num_buffers; ++i) {
+      assert(bsp_bo->size >= str_bsp->w0[0] + num_bytes[i]);
+      memcpy(dec->bsp_ptr, data[i], num_bytes[i]);
+      dec->bsp_ptr += num_bytes[i];
+      str_bsp->w0[0] += num_bytes[i];
+   }
+}
+
 uint32_t
-nouveau_vp3_bsp(struct nouveau_vp3_decoder *dec,  union pipe_desc desc,
-                struct nouveau_vp3_video_buffer *target,
-                unsigned comm_seq, unsigned num_buffers,
-                const void *const *data, const unsigned *num_bytes)
+nouveau_vp3_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc)
 {
    enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
+   unsigned comm_seq = dec->fence_seq;
    struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
-   char *bsp;
    uint32_t endmarker, caps;
-   struct strparm_bsp *str_bsp;
-   int i;
-
-   bsp = bsp_bo->map;
+   struct strparm_bsp *str_bsp = strparm_bsp(dec);
+   char *bsp = bsp_bo->map;
    /*
     * 0x000..0x100: picparm_bsp
     * 0x200..0x500: picparm_vp
@@ -277,34 +315,21 @@ nouveau_vp3_bsp(struct nouveau_vp3_decoder *dec,  union pipe_desc desc,
    caps |= 1 << 17; // enable watchdog
    caps |= 0 << 18; // do not report error to VP, so it can continue decoding what we have
    caps |= 0 << 19; // if enabled, use crypto crap?
-   bsp += 0x100;
 
-   str_bsp = (struct strparm_bsp *)bsp;
-   memset(str_bsp, 0, 0x80);
-   str_bsp->w0[0] = 16;
+   str_bsp = strparm_bsp(dec);
    str_bsp->w1[0] = 0x1;
-   bsp += 0x100;
-   /* Reserved for picparm_vp */
-   bsp += 0x300;
-   /* Reserved for comm */
-#if !NOUVEAU_VP3_DEBUG_FENCE
-   memset(bsp, 0, 0x200);
-#endif
-   bsp += 0x200;
-   for (i = 0; i < num_buffers; ++i) {
-      memcpy(bsp, data[i], num_bytes[i]);
-      bsp += num_bytes[i];
-      str_bsp->w0[0] += num_bytes[i];
-   }
 
    /* Append end sequence */
-   *(uint32_t *)bsp = endmarker;
-   bsp += 4;
-   *(uint32_t *)bsp = 0x00000000;
-   bsp += 4;
-   *(uint32_t *)bsp = endmarker;
-   bsp += 4;
-   *(uint32_t *)bsp = 0x00000000;
+   *(uint32_t *)dec->bsp_ptr = endmarker;
+   dec->bsp_ptr += 4;
+   *(uint32_t *)dec->bsp_ptr = 0x00000000;
+   dec->bsp_ptr += 4;
+   *(uint32_t *)dec->bsp_ptr = endmarker;
+   dec->bsp_ptr += 4;
+   *(uint32_t *)dec->bsp_ptr = 0x00000000;
+   str_bsp->w0[0] += 16;
+
+   dec->bsp_ptr = NULL;
 
    return caps;
 }
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_draw.c b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
index 098d6e499fa..7b0d0745766 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_draw.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
@@ -208,17 +208,16 @@ nv30_render_release_vertices(struct vbuf_render *render)
 
 static const struct {
    unsigned emit;
-   unsigned interp;
    unsigned vp30;
    unsigned vp40;
    unsigned ow40;
 } vroute [] = {
-   [TGSI_SEMANTIC_POSITION] = { EMIT_4F, INTERP_PERSPECTIVE, 0, 0, 0x00000000 },
-   [TGSI_SEMANTIC_COLOR   ] = { EMIT_4F, INTERP_LINEAR     , 3, 1, 0x00000001 },
-   [TGSI_SEMANTIC_BCOLOR  ] = { EMIT_4F, INTERP_LINEAR     , 1, 3, 0x00000004 },
-   [TGSI_SEMANTIC_FOG     ] = { EMIT_4F, INTERP_PERSPECTIVE, 5, 5, 0x00000010 },
-   [TGSI_SEMANTIC_PSIZE   ] = { EMIT_1F_PSIZE, INTERP_POS  , 6, 6, 0x00000020 },
-   [TGSI_SEMANTIC_TEXCOORD] = { EMIT_4F, INTERP_PERSPECTIVE, 8, 7, 0x00004000 },
+   [TGSI_SEMANTIC_POSITION] = { EMIT_4F, 0, 0, 0x00000000 },
+   [TGSI_SEMANTIC_COLOR   ] = { EMIT_4F, 3, 1, 0x00000001 },
+   [TGSI_SEMANTIC_BCOLOR  ] = { EMIT_4F, 1, 3, 0x00000004 },
+   [TGSI_SEMANTIC_FOG     ] = { EMIT_4F, 5, 5, 0x00000010 },
+   [TGSI_SEMANTIC_PSIZE   ] = { EMIT_1F_PSIZE, 6, 6, 0x00000020 },
+   [TGSI_SEMANTIC_TEXCOORD] = { EMIT_4F, 8, 7, 0x00004000 },
 };
 
 static bool
@@ -247,7 +246,7 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx)
    if (emit == EMIT_OMIT)
       return false;
 
-   draw_emit_vertex_attr(vinfo, emit, vroute[sem].interp, attrib);
+   draw_emit_vertex_attr(vinfo, emit, attrib);
    format = draw_translate_vinfo_format(emit);
 
    r->vtxfmt[attrib] = nv30_vtxfmt(&screen->base.base, format)->hw;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 854f70cf34c..d9c940232c4 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -157,6 +157,8 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_USER_VERTEX_BUFFERS:
    case PIPE_CAP_COMPUTE:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
    case PIPE_CAP_SAMPLER_VIEW_TARGET:
@@ -174,6 +176,11 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -265,6 +272,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+      case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
@@ -308,6 +316,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+      case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index 2cebcd99423..712d00ed2d3 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -134,9 +134,11 @@ struct nv50_context {
    struct nv50_constbuf constbuf[3][NV50_MAX_PIPE_CONSTBUFS];
    uint16_t constbuf_dirty[3];
    uint16_t constbuf_valid[3];
+   uint16_t constbuf_coherent[3];
 
    struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
    unsigned num_vtxbufs;
+   uint32_t vtxbufs_coherent;
    struct pipe_index_buffer idxbuf;
    uint32_t vbo_fifo; /* bitmask of vertex elements to be pushed to FIFO */
    uint32_t vbo_user; /* bitmask of vertex buffers pointing to user memory */
@@ -148,6 +150,7 @@ struct nv50_context {
 
    struct pipe_sampler_view *textures[3][PIPE_MAX_SAMPLERS];
    unsigned num_textures[3];
+   uint32_t textures_coherent[3];
    struct nv50_tsc_entry *samplers[3][PIPE_MAX_SAMPLERS];
    unsigned num_samplers[3];
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index a4b8ddfda95..888d62e1c52 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -148,7 +148,6 @@ nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
    for (m = 0, i = 0; i < info->numInputs; ++i) {
       switch (info->in[i].sn) {
       case TGSI_SEMANTIC_POSITION:
-      case TGSI_SEMANTIC_FACE:
          continue;
       default:
          m += info->in[i].flat ? 0 : 1;
@@ -166,9 +165,6 @@ nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
          for (c = 0; c < 4; ++c)
             if (info->in[i].mask & (1 << c))
                info->in[i].slot[c] = nintp++;
-      } else
-      if (info->in[i].sn == TGSI_SEMANTIC_FACE) {
-         info->in[i].slot[0] = 255;
       } else {
          unsigned j = info->in[i].flat ? m++ : n++;
 
@@ -335,7 +331,7 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
    info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
    info->bin.source = (void *)prog->pipe.tokens;
 
-   info->io.ucpCBSlot = 15;
+   info->io.auxCBSlot = 15;
    info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
    info->io.genUserClip = prog->vp.clpd_nr;
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 272e1d45bff..56c67e0ddfb 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -182,6 +182,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_COMPUTE:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return 1; /* class_3d >= NVA0_3D_CLASS; */
@@ -212,11 +213,17 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: /* potentially supported on some hw */
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -300,6 +307,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index de655971b66..cb040439139 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -664,6 +664,17 @@ nv50_stage_set_sampler_views(struct nv50_context *nv50, int s,
       if (old)
          nv50_screen_tic_unlock(nv50->screen, old);
 
+      if (views[i] && views[i]->texture) {
+         struct pipe_resource *res = views[i]->texture;
+         if (res->target == PIPE_BUFFER &&
+             (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT))
+            nv50->textures_coherent[s] |= 1 << i;
+         else
+            nv50->textures_coherent[s] &= ~(1 << i);
+      } else {
+         nv50->textures_coherent[s] &= ~(1 << i);
+      }
+
       pipe_sampler_view_reference(&nv50->textures[s][i], views[i]);
    }
 
@@ -847,13 +858,19 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
       nv50->constbuf[s][i].u.data = cb->user_buffer;
       nv50->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000);
       nv50->constbuf_valid[s] |= 1 << i;
+      nv50->constbuf_coherent[s] &= ~(1 << i);
    } else
    if (res) {
       nv50->constbuf[s][i].offset = cb->buffer_offset;
       nv50->constbuf[s][i].size = MIN2(align(cb->buffer_size, 0x100), 0x10000);
       nv50->constbuf_valid[s] |= 1 << i;
+      if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+         nv50->constbuf_coherent[s] |= 1 << i;
+      else
+         nv50->constbuf_coherent[s] &= ~(1 << i);
    } else {
       nv50->constbuf_valid[s] &= ~(1 << i);
+      nv50->constbuf_coherent[s] &= ~(1 << i);
    }
    nv50->constbuf_dirty[s] |= 1 << i;
 
@@ -1003,6 +1020,7 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
    if (!vb) {
       nv50->vbo_user &= ~(((1ull << count) - 1) << start_slot);
       nv50->vbo_constant &= ~(((1ull << count) - 1) << start_slot);
+      nv50->vtxbufs_coherent &= ~(((1ull << count) - 1) << start_slot);
       return;
    }
 
@@ -1015,9 +1033,16 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
             nv50->vbo_constant |= 1 << dst_index;
          else
             nv50->vbo_constant &= ~(1 << dst_index);
+         nv50->vtxbufs_coherent &= ~(1 << dst_index);
       } else {
          nv50->vbo_user &= ~(1 << dst_index);
          nv50->vbo_constant &= ~(1 << dst_index);
+
+         if (vb[i].buffer &&
+             vb[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+            nv50->vtxbufs_coherent |= (1 << dst_index);
+         else
+            nv50->vtxbufs_coherent &= ~(1 << dst_index);
       }
    }
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 7de2f1f1d0f..60fa2bc06a8 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -636,8 +636,8 @@ nv50_draw_elements(struct nv50_context *nv50, bool shorten,
          BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
          PUSH_DATA (push, prim);
 
-         PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
          nouveau_pushbuf_space(push, 8, 0, 1);
+         PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
 
          switch (index_size) {
          case 4:
@@ -765,7 +765,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    struct nv50_context *nv50 = nv50_context(pipe);
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
    bool tex_dirty = false;
-   int i, s;
+   int s;
 
    /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */
    nv50->vb_elt_first = info->min_index + info->index_bias;
@@ -794,27 +794,9 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 
    push->kick_notify = nv50_draw_vbo_kick_notify;
 
-   /* TODO: Instead of iterating over all the buffer resources looking for
-    * coherent buffers, keep track of a context-wide count.
-    */
    for (s = 0; s < 3 && !nv50->cb_dirty; ++s) {
-      uint32_t valid = nv50->constbuf_valid[s];
-
-      while (valid && !nv50->cb_dirty) {
-         const unsigned i = ffs(valid) - 1;
-         struct pipe_resource *res;
-
-         valid &= ~(1 << i);
-         if (nv50->constbuf[s][i].user)
-            continue;
-
-         res = nv50->constbuf[s][i].u.buf;
-         if (!res)
-            continue;
-
-         if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-            nv50->cb_dirty = true;
-      }
+      if (nv50->constbuf_coherent[s])
+         nv50->cb_dirty = true;
    }
 
    /* If there are any coherent constbufs, flush the cache */
@@ -825,15 +807,10 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    }
 
    for (s = 0; s < 3 && !tex_dirty; ++s) {
-      for (i = 0; i < nv50->num_textures[s] && !tex_dirty; ++i) {
-         if (!nv50->textures[s][i] ||
-             nv50->textures[s][i]->texture->target != PIPE_BUFFER)
-            continue;
-         if (nv50->textures[s][i]->texture->flags &
-             PIPE_RESOURCE_FLAG_MAP_COHERENT)
-            tex_dirty = true;
-      }
+      if (nv50->textures_coherent[s])
+         tex_dirty = true;
    }
+
    if (tex_dirty) {
       BEGIN_NV04(push, NV50_3D(TEX_CACHE_CTL), 1);
       PUSH_DATA (push, 0x20);
@@ -853,12 +830,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       PUSH_DATA (push, info->start_instance);
    }
 
-   for (i = 0; i < nv50->num_vtxbufs && !nv50->base.vbo_dirty; ++i) {
-      if (!nv50->vtxbuf[i].buffer)
-         continue;
-      if (nv50->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-         nv50->base.vbo_dirty = true;
-   }
+   nv50->base.vbo_dirty |= !!nv50->vtxbufs_coherent;
 
    if (nv50->base.vbo_dirty) {
       BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FLUSH), 1);
diff --git a/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c b/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
index dbde1bfcebe..4fe0e05c96b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
@@ -77,7 +77,7 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
       bsp_size += (1 << 20) - 1;
       bsp_size &= ~((1 << 20) - 1);
 
-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_size, NULL, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_size, NULL, &tmp_bo);
       if (ret) {
          debug_printf("reallocating bsp %u -> %u failed with %i\n",
                       bsp_bo ? (unsigned)bsp_bo->size : 0, bsp_size, ret);
@@ -90,7 +90,7 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
    if (!inter_bo || bsp_bo->size * 4 > inter_bo->size) {
       struct nouveau_bo *tmp_bo = NULL;
 
-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, NULL, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, NULL, &tmp_bo);
       if (ret) {
          debug_printf("reallocating inter %u -> %u failed with %i\n",
                       inter_bo ? (unsigned)inter_bo->size : 0, (unsigned)bsp_bo->size * 4, ret);
@@ -106,8 +106,9 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
       return -1;
    }
 
-   caps = nouveau_vp3_bsp(dec, desc, target, comm_seq,
-                          num_buffers, data, num_bytes);
+   nouveau_vp3_bsp_begin(dec);
+   nouveau_vp3_bsp_next(dec, num_buffers, data, num_bytes);
+   caps = nouveau_vp3_bsp_end(dec, desc);
 
    nouveau_vp3_vp_caps(dec, desc, target, comm_seq, vp_caps, is_ref, refs);
 
diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
index b2060d1fa53..4daa57d47bb 100644
--- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
+++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
@@ -230,27 +230,43 @@ locn_0f_ts:
  * Forcefully sets VERTEX_ID_BASE to the value of VB_ELEMENT_BASE.
  *
  * arg     = mode
- * parm[0] = count
- * parm[1] = instance_count
- * parm[2] = start
- * parm[3] = index_bias
- * parm[4] = start_instance
+ * parm[0] = start_drawid
+ * parm[1] = numparams
+ * parm[2 + 5n + 0] = count
+ * parm[2 + 5n + 1] = instance_count
+ * parm[2 + 5n + 2] = start
+ * parm[2 + 5n + 3] = index_bias
+ * parm[2 + 5n + 4] = start_instance
+ *
+ * SCRATCH[0] = saved VB_ELEMENT_BASE
+ * SCRATCH[1] = saved VB_INSTANCE_BASE
  */
 .section #mme9097_draw_elts_indirect
+   read $r6 0x50d /* VB_ELEMENT_BASE */
+   read $r7 0x50e /* VB_INSTANCE_BASE */
+   maddr 0x1d00
+   send $r6 /* SCRATCH[0] = VB_ELEMENT_BASE */
+   send $r7 /* SCRATCH[1] = VB_INSTANCE_BASE */
+   parm $r6 /* start_drawid */
+   parm $r7 /* numparams */
+dei_draw_again:
    parm $r3 /* count */
    parm $r2 /* instance_count */
    parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */
    parm $r4 send $r4 /* index_bias, send start */
+   maddr 0x18e3 /* CB_POS */
+   send 0x180 /* 256 + 128 */
    braz $r2 #dei_end
-   parm $r5 /* start_instance */
-   read $r6 0x50d /* VB_ELEMENT_BASE */
-   read $r7 0x50e /* VB_INSTANCE_BASE */
+   parm $r5 send $r4 /* start_instance, send index_bias */
+   send $r5 /* send start_instance */
+   send $r6 /* draw id */
    maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */
    send $r4
    send $r5
    maddr 0x446
    send $r4
    mov $r4 0x1
+   mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */
 dei_again:
    maddr 0x586 /* VERTEX_BEGIN_GL */
    send $r1 /* mode */
@@ -260,46 +276,218 @@ dei_again:
    maddrsend 0x585 /* VERTEX_END_GL */
    branz $r2 #dei_again
    mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */
+dei_end:
+   mov $r7 (add $r7 -1)
+   branz $r7 #dei_draw_again
+   mov $r6 (add $r6 1)
+   read $r6 0xd00
+   read $r7 0xd01
    maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */
    send $r6
    send $r7
    exit maddr 0x446
    send $r6
-dei_end:
-   exit
-   nop
 
 /* NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT:
  *
  * NOTE: Saves and restores VB_INSTANCE_BASE.
  *
  * arg     = mode
- * parm[0] = count
- * parm[1] = instance_count
- * parm[2] = start
- * parm[3] = start_instance
+ * parm[0] = start_drawid
+ * parm[1] = numparams
+ * parm[2 + 4n + 0] = count
+ * parm[2 + 4n + 1] = instance_count
+ * parm[2 + 4n + 2] = start
+ * parm[2 + 4n + 3] = start_instance
  */
 .section #mme9097_draw_arrays_indirect
+   read $r5 0x50e /* VB_INSTANCE_BASE */
+   parm $r6 /* start_drawid */
+   parm $r7 /* numparams */
+dai_draw_again:
    parm $r2 /* count */
    parm $r3 /* instance_count */
    parm $r4 maddr 0x35d /* VERTEX_BUFFER_FIRST, start */
-   parm $r4 send $r4 /* start_instance */
    braz $r3 #dai_end
-   read $r6 0x50e /* VB_INSTANCE_BASE */
+   parm $r4 send $r4 /* start_instance */
+   maddr 0x18e3 /* CB_POS */
+   send 0x180 /* 256 + 128 */
+   send 0x0 /* send 0 as base_vertex */
+   send $r4 /* send start_instance */
+   send $r6 /* draw id */
    maddr 0x50e /* VB_INSTANCE_BASE */
-   mov $r5 0x1
    send $r4
+   mov $r4 0x1
+   mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */
 dai_again:
    maddr 0x586 /* VERTEX_BEGIN_GL */
    send $r1 /* mode */
    maddr 0x35e /* VERTEX_BUFFER_COUNT */
    send $r2
-   mov $r3 (sub $r3 $r5)
+   mov $r3 (sub $r3 $r4)
    maddrsend 0x585 /* VERTEX_END_GL */
    branz $r3 #dai_again
-   mov $r1 (extrinsrt $r1 $r5 0 1 26) /* set INSTANCE_NEXT */
+   mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */
+dai_end:
+   mov $r7 (add $r7 -1)
+   branz $r7 #dai_draw_again
+   mov $r6 (add $r6 1)
    exit maddr 0x50e /* VB_INSTANCE_BASE to restore */
+   send $r5
+
+/* NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT
+ *
+ * NOTE: Saves and restores VB_ELEMENT,INSTANCE_BASE.
+ * Forcefully sets VERTEX_ID_BASE to the value of VB_ELEMENT_BASE.
+ *
+ * arg     = mode
+ * parm[0] = start_drawid
+ * parm[1] = numparams
+ * parm[2] = totaldraws
+ * parm[3 + 5n + 0] = count
+ * parm[3 + 5n + 1] = instance_count
+ * parm[3 + 5n + 2] = start
+ * parm[3 + 5n + 3] = index_bias
+ * parm[3 + 5n + 4] = start_instance
+ *
+ * SCRATCH[0] = saved VB_ELEMENT_BASE
+ * SCRATCH[1] = saved VB_INSTANCE_BASE
+ * SCRATCH[2] = draws left
+ */
+.section #mme9097_draw_elts_indirect_count
+   read $r6 0x50d /* VB_ELEMENT_BASE */
+   read $r7 0x50e /* VB_INSTANCE_BASE */
+   maddr 0x1d00
+   send $r6 /* SCRATCH[0] = VB_ELEMENT_BASE */
+   send $r7 /* SCRATCH[1] = VB_INSTANCE_BASE */
+   parm $r6 /* start_drawid */
+   parm $r7 /* numparams */
+   parm $r5 /* totaldraws */
+   mov $r5 (sub $r5 $r6) /* draws left */
+   braz $r5 #deic_runout
+   mov $r3 (extrinsrt 0x0 $r5 31 1 0) /* extract high bit */
+   branz $r3 #deic_runout
+   send $r5
+deic_draw_again:
+   parm $r3 /* count */
+   parm $r2 /* instance_count */
+   parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */
+   parm $r4 send $r4 /* index_bias, send start */
+   maddr 0x18e3 /* CB_POS */
+   send 0x180 /* 256 + 128 */
+   braz $r2 #deic_end
+   parm $r5 send $r4 /* start_instance, send index_bias */
+   send $r5 /* send start_instance */
+   send $r6 /* draw id */
+   maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */
+   send $r4
+   send $r5
+   maddr 0x446
+   send $r4
+   mov $r4 0x1
+   mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */
+deic_again:
+   maddr 0x586 /* VERTEX_BEGIN_GL */
+   send $r1 /* mode */
+   maddr 0x5f8 /* INDEX_BATCH_COUNT */
+   send $r3 /* count */
+   mov $r2 (sub $r2 $r4)
+   maddrsend 0x585 /* VERTEX_END_GL */
+   branz $r2 #deic_again
+   mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */
+deic_end:
+   read $r5 0xd02
+   mov $r5 (add $r5 -1)
+   braz $r5 #deic_runout_check
+   mov $r7 (add $r7 -1)
+   maddr 0xd02
+   send $r5
+   branz $r7 #deic_draw_again
+   mov $r6 (add $r6 1)
+deic_restore:
+   read $r6 0xd00
+   read $r7 0xd01
+   maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */
    send $r6
-dai_end:
-   exit
-   nop
+   send $r7
+   exit maddr 0x446
+   send $r6
+deic_runout:
+   parm $r2
+   parm $r2
+   parm $r2
+   parm $r2
+   parm $r2
+   mov $r7 (add $r7 -1)
+deic_runout_check:
+   branz annul $r7 #deic_runout
+   bra annul #deic_restore
+
+/* NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT:
+ *
+ * NOTE: Saves and restores VB_INSTANCE_BASE.
+ *
+ * arg     = mode
+ * parm[0] = start_drawid
+ * parm[1] = numparams
+ * parm[2] = totaldraws
+ * parm[3 + 4n + 0] = count
+ * parm[3 + 4n + 1] = instance_count
+ * parm[3 + 4n + 2] = start
+ * parm[3 + 4n + 3] = start_instance
+ *
+ * SCRATCH[0] = VB_INSTANCE_BASE
+ */
+.section #mme9097_draw_arrays_indirect_count
+   read $r5 0x50e /* VB_INSTANCE_BASE */
+   maddr 0xd00
+   parm $r6 send $r5 /* start_drawid, save VB_INSTANCE_BASE */
+   parm $r7 /* numparams */
+   parm $r5 /* totaldraws */
+   mov $r5 (sub $r5 $r6) /* draws left */
+   braz $r5 #daic_runout
+   mov $r3 (extrinsrt 0x0 $r5 31 1 0) /* extract high bit */
+   branz annul $r3 #daic_runout
+daic_draw_again:
+   parm $r2 /* count */
+   parm $r3 /* instance_count */
+   parm $r4 maddr 0x35d /* VERTEX_BUFFER_FIRST, start */
+   braz $r3 #daic_end
+   parm $r4 send $r4 /* start_instance */
+   maddr 0x18e3 /* CB_POS */
+   send 0x180 /* 256 + 128 */
+   send 0x0 /* send 0 as base_vertex */
+   send $r4 /* send start_instance */
+   send $r6 /* draw id */
+   maddr 0x50e /* VB_INSTANCE_BASE */
+   send $r4
+   mov $r4 0x1
+   mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */
+daic_again:
+   maddr 0x586 /* VERTEX_BEGIN_GL */
+   send $r1 /* mode */
+   maddr 0x35e /* VERTEX_BUFFER_COUNT */
+   send $r2
+   mov $r3 (sub $r3 $r4)
+   maddrsend 0x585 /* VERTEX_END_GL */
+   branz $r3 #daic_again
+   mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */
+daic_end:
+   mov $r5 (add $r5 -1)
+   braz $r5 #daic_runout_check
+   mov $r7 (add $r7 -1)
+   branz $r7 #daic_draw_again
+   mov $r6 (add $r6 1)
+daic_restore:
+   read $r5 0xd00
+   exit maddr 0x50e /* VB_INSTANCE_BASE to restore */
+   send $r5
+daic_runout:
+   parm $r2
+   parm $r2
+   parm $r2
+   parm $r2
+   mov $r7 (add $r7 -1)
+daic_runout_check:
+   branz annul $r7 #daic_runout
+   bra annul #daic_restore
diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
index bac9042c2df..bf8625e0584 100644
--- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
+++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
@@ -125,22 +125,33 @@ uint32_t mme9097_tep_select[] = {
 };
 
 uint32_t mme9097_draw_elts_indirect[] = {
+	0x01434615,
+/* 0x0007: dei_draw_again */
+	0x01438715,
+	0x07400021,
+	0x00003041,
+	0x00003841,
+	0x00000601,
+/* 0x0018: dei_again */
+	0x00000701,
 	0x00000301,
+/* 0x0020: dei_end */
 	0x00000201,
 	0x017dc451,
-/* 0x000e: dei_again */
 	0x00002431,
-	0x0005d007,
-	0x00000501,
-/* 0x001b: dei_end */
-	0x01434615,
-	0x01438715,
+	0x0638c021,
+	0x00600041,
+	0x0004d007,
+	0x00002531,
+	0x00002841,
+	0x00003041,
 	0x05434021,
 	0x00002041,
 	0x00002841,
 	0x01118021,
 	0x00002041,
 	0x00004411,
+	0xd0400912,
 	0x01618021,
 	0x00000841,
 	0x017e0021,
@@ -149,37 +160,175 @@ uint32_t mme9097_draw_elts_indirect[] = {
 	0x01614071,
 	0xfffe9017,
 	0xd0410912,
+	0xffffff11,
+	0xfff9b817,
+	0x00007611,
+	0x03400615,
+	0x03404715,
 	0x05434021,
 	0x00003041,
 	0x00003841,
 	0x011180a1,
 	0x00003041,
-	0x00000091,
-	0x00000011,
 };
 
 uint32_t mme9097_draw_arrays_indirect[] = {
+/* 0x0003: dai_draw_again */
+	0x01438515,
+	0x00000601,
+	0x00000701,
 	0x00000201,
+/* 0x0011: dai_again */
 	0x00000301,
-/* 0x0009: dai_again */
 	0x00d74451,
+/* 0x0019: dai_end */
+	0x0004d807,
 	0x00002431,
-/* 0x0013: dai_end */
-	0x0003d807,
-	0x01438615,
+	0x0638c021,
+	0x00600041,
+	0x00000041,
+	0x00002041,
+	0x00003041,
 	0x01438021,
-	0x00004511,
 	0x00002041,
+	0x00004411,
+	0xd0400912,
 	0x01618021,
 	0x00000841,
 	0x00d78021,
 	0x00001041,
-	0x00055b10,
+	0x00051b10,
 	0x01614071,
 	0xfffe9817,
-	0xd0414912,
+	0xd0410912,
+	0xffffff11,
+	0xfffa7817,
+	0x00007611,
 	0x014380a1,
+	0x00002841,
+};
+
+uint32_t mme9097_draw_elts_indirect_count[] = {
+	0x01434615,
+	0x01438715,
+	0x07400021,
+/* 0x000d: deic_draw_again */
 	0x00003041,
-	0x00000091,
-	0x00000011,
+	0x00003841,
+	0x00000601,
+	0x00000701,
+/* 0x001e: deic_again */
+	0x00000501,
+	0x0005ad10,
+/* 0x0026: deic_end */
+	0x000b2807,
+	0x007f4312,
+/* 0x002e: deic_restore */
+	0x000a9817,
+	0x00002841,
+/* 0x0035: deic_runout */
+	0x00000301,
+/* 0x003b: deic_runout_check */
+	0x00000201,
+	0x017dc451,
+	0x00002431,
+	0x0638c021,
+	0x00600041,
+	0x0004d007,
+	0x00002531,
+	0x00002841,
+	0x00003041,
+	0x05434021,
+	0x00002041,
+	0x00002841,
+	0x01118021,
+	0x00002041,
+	0x00004411,
+	0xd0400912,
+	0x01618021,
+	0x00000841,
+	0x017e0021,
+	0x00001841,
+	0x00051210,
+	0x01614071,
+	0xfffe9017,
+	0xd0410912,
+	0x03408515,
+	0xffffed11,
+	0x0004e807,
+	0xffffff11,
+	0x03408021,
+	0x00002841,
+	0xfff87817,
+	0x00007611,
+	0x03400615,
+	0x03404715,
+	0x05434021,
+	0x00003041,
+	0x00003841,
+	0x011180a1,
+	0x00003041,
+	0x00000201,
+	0x00000201,
+	0x00000201,
+	0x00000201,
+	0x00000201,
+	0xffffff11,
+	0xfffeb837,
+	0xfffc8027,
+};
+
+uint32_t mme9097_draw_arrays_indirect_count[] = {
+	0x01438515,
+	0x03400021,
+/* 0x0009: daic_draw_again */
+	0x00002e31,
+	0x00000701,
+	0x00000501,
+/* 0x0017: daic_again */
+	0x0005ad10,
+	0x00086807,
+/* 0x001f: daic_end */
+	0x007f4312,
+	0x0007d837,
+/* 0x0024: daic_restore */
+/* 0x0027: daic_runout */
+	0x00000201,
+	0x00000301,
+/* 0x002c: daic_runout_check */
+	0x00d74451,
+	0x0004d807,
+	0x00002431,
+	0x0638c021,
+	0x00600041,
+	0x00000041,
+	0x00002041,
+	0x00003041,
+	0x01438021,
+	0x00002041,
+	0x00004411,
+	0xd0400912,
+	0x01618021,
+	0x00000841,
+	0x00d78021,
+	0x00001041,
+	0x00051b10,
+	0x01614071,
+	0xfffe9817,
+	0xd0410912,
+	0xffffed11,
+	0x00032807,
+	0xffffff11,
+	0xfff9f817,
+	0x00007611,
+	0x03400515,
+	0x014380a1,
+	0x00002841,
+	0x00000201,
+	0x00000201,
+	0x00000201,
+	0x00000201,
+	0xffffff11,
+	0xfffef837,
+	0xfffdc027,
 };
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 39b73ecb0c2..12195489691 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -134,10 +134,12 @@ struct nvc0_context {
    struct nvc0_constbuf constbuf[6][NVC0_MAX_PIPE_CONSTBUFS];
    uint16_t constbuf_dirty[6];
    uint16_t constbuf_valid[6];
+   uint16_t constbuf_coherent[6];
    bool cb_dirty;
 
    struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
    unsigned num_vtxbufs;
+   uint32_t vtxbufs_coherent;
    struct pipe_index_buffer idxbuf;
    uint32_t constant_vbos;
    uint32_t vbo_user; /* bitmask of vertex buffers pointing to user memory */
@@ -149,6 +151,7 @@ struct nvc0_context {
    struct pipe_sampler_view *textures[6][PIPE_MAX_SAMPLERS];
    unsigned num_textures[6];
    uint32_t textures_dirty[6];
+   uint32_t textures_coherent[6];
    struct nv50_tsc_entry *samplers[6][PIPE_MAX_SAMPLERS];
    unsigned num_samplers[6];
    uint16_t samplers_dirty[6];
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h b/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
index bf2798a44a0..27c026b8b30 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
@@ -29,4 +29,8 @@
 
 #define NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT			0x00003840
 
+#define NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT		0x00003848
+
+#define NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT		0x00003850
+
 #endif /* __NVC0_MACROS_H__ */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 67a25acf778..c3b53621630 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -55,7 +55,6 @@ nvc0_shader_input_address(unsigned sn, unsigned si)
    case TGSI_SEMANTIC_INSTANCEID:   return 0x2f8;
    case TGSI_SEMANTIC_VERTEXID:     return 0x2fc;
    case TGSI_SEMANTIC_TEXCOORD:     return 0x300 + si * 0x10;
-   case TGSI_SEMANTIC_FACE:         return 0x3fc;
    default:
       assert(!"invalid TGSI input semantic");
       return ~0;
@@ -285,8 +284,6 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
       break;
    case PIPE_PRIM_TRIANGLES:
       tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_TRIANGLES;
-      if (info->prop.tp.winding > 0)
-         tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
       break;
    case PIPE_PRIM_QUADS:
       tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_QUADS;
@@ -295,6 +292,10 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
       tp->tp.tess_mode = ~0;
       return;
    }
+
+   if (info->prop.tp.winding > 0)
+      tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
+
    if (info->prop.tp.outputPrim != PIPE_PRIM_POINTS)
       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED;
 
@@ -533,8 +534,9 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
    info->bin.source = (void *)prog->pipe.tokens;
 
    info->io.genUserClip = prog->vp.num_ucps;
+   info->io.auxCBSlot = 15;
    info->io.ucpBase = 256;
-   info->io.ucpCBSlot = 15;
+   info->io.drawInfoBase = 256 + 128;
 
    if (prog->type == PIPE_SHADER_COMPUTE) {
       if (chipset >= NVISA_GK104_CHIPSET) {
@@ -583,6 +585,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
    prog->num_barriers = info->numBarriers;
 
    prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
+   prog->vp.need_draw_parameters = info->prop.vp.usesDrawParameters;
 
    if (info->io.edgeFlagOut < PIPE_MAX_ATTRIBS)
       info->out[info->io.edgeFlagOut].mask = 0; /* for headergen */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
index 9c45e7b3e31..8b8d221edfc 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
@@ -42,6 +42,7 @@ struct nvc0_program {
       uint8_t num_ucps; /* also set to max if ClipDistance is used */
       uint8_t edgeflag; /* attribute index of edgeflag input */
       bool need_vertex_id;
+      bool need_draw_parameters;
    } vp;
    struct {
       uint8_t early_z;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
index a70d524ea85..1bed0162baf 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
@@ -470,10 +470,7 @@ nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *push,
 {
    struct nvc0_hw_query *hq = nvc0_hw_query(q);
 
-#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
-
    PUSH_REFN(push, hq->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART);
-   nouveau_pushbuf_space(push, 0, 0, 1);
    nouveau_pushbuf_data(push, hq->bo, hq->offset + result_offset, 4 |
                         NVC0_IB_ENTRY_1_NO_PREFETCH);
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 39954464b9c..33dd17ebeca 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -184,6 +184,11 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
@@ -206,6 +211,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -311,6 +318,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
       return 16; /* would be 32 in linked (OpenGL-style) mode */
@@ -1025,6 +1033,8 @@ nvc0_screen_create(struct nouveau_device *dev)
    MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_BACK, mme9097_poly_mode_back);
    MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT, mme9097_draw_arrays_indirect);
    MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT, mme9097_draw_elts_indirect);
+   MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT, mme9097_draw_arrays_indirect_count);
+   MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mme9097_draw_elts_indirect_count);
 
    BEGIN_NVC0(push, NVC0_3D(RASTERIZE_ENABLE), 1);
    PUSH_DATA (push, 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index 5e84ca9e0ea..dc02b011bdf 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -317,6 +317,7 @@ nvc0_tfb_validate(struct nvc0_context *nvc0)
 
       if (!targ->clean)
          nvc0_hw_query_fifo_wait(push, nvc0_query(targ->pq));
+      nouveau_pushbuf_space(push, 0, 0, 1);
       BEGIN_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 5);
       PUSH_DATA (push, 1);
       PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 41a824a97a0..24a6c222dd5 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -554,6 +554,17 @@ nvc0_stage_set_sampler_views(struct nvc0_context *nvc0, int s,
          continue;
       nvc0->textures_dirty[s] |= 1 << i;
 
+      if (views[i] && views[i]->texture) {
+         struct pipe_resource *res = views[i]->texture;
+         if (res->target == PIPE_BUFFER &&
+             (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT))
+            nvc0->textures_coherent[s] |= 1 << i;
+         else
+            nvc0->textures_coherent[s] &= ~(1 << i);
+      } else {
+         nvc0->textures_coherent[s] &= ~(1 << i);
+      }
+
       if (old) {
          nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(s, i));
          nvc0_screen_tic_unlock(nvc0->screen, old);
@@ -596,6 +607,17 @@ nvc0_stage_set_sampler_views_range(struct nvc0_context *nvc0, const unsigned s,
             continue;
          nvc0->textures_dirty[s] |= 1 << i;
 
+         if (views[p] && views[p]->texture) {
+            struct pipe_resource *res = views[p]->texture;
+            if (res->target == PIPE_BUFFER &&
+                (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT))
+               nvc0->textures_coherent[s] |= 1 << i;
+            else
+               nvc0->textures_coherent[s] &= ~(1 << i);
+         } else {
+            nvc0->textures_coherent[s] &= ~(1 << i);
+         }
+
          if (nvc0->textures[s][i]) {
             struct nv50_tic_entry *old = nv50_tic_entry(nvc0->textures[s][i]);
             nouveau_bufctx_reset(bctx, bin + i);
@@ -842,14 +864,20 @@ nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
       nvc0->constbuf[s][i].u.data = cb->user_buffer;
       nvc0->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000);
       nvc0->constbuf_valid[s] |= 1 << i;
+      nvc0->constbuf_coherent[s] &= ~(1 << i);
    } else
    if (cb) {
       nvc0->constbuf[s][i].offset = cb->buffer_offset;
       nvc0->constbuf[s][i].size = MIN2(align(cb->buffer_size, 0x100), 0x10000);
       nvc0->constbuf_valid[s] |= 1 << i;
+      if (res && res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+         nvc0->constbuf_coherent[s] |= 1 << i;
+      else
+         nvc0->constbuf_coherent[s] &= ~(1 << i);
    }
    else {
       nvc0->constbuf_valid[s] &= ~(1 << i);
+      nvc0->constbuf_coherent[s] &= ~(1 << i);
    }
 }
 
@@ -1009,6 +1037,7 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe,
     if (!vb) {
        nvc0->vbo_user &= ~(((1ull << count) - 1) << start_slot);
        nvc0->constant_vbos &= ~(((1ull << count) - 1) << start_slot);
+       nvc0->vtxbufs_coherent &= ~(((1ull << count) - 1) << start_slot);
        return;
     }
 
@@ -1021,9 +1050,16 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe,
              nvc0->constant_vbos |= 1 << dst_index;
           else
              nvc0->constant_vbos &= ~(1 << dst_index);
+          nvc0->vtxbufs_coherent &= ~(1 << dst_index);
        } else {
           nvc0->vbo_user &= ~(1 << dst_index);
           nvc0->constant_vbos &= ~(1 << dst_index);
+
+          if (vb[i].buffer &&
+              vb[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+             nvc0->vtxbufs_coherent |= (1 << dst_index);
+          else
+             nvc0->vtxbufs_coherent &= ~(1 << dst_index);
        }
     }
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index 54443bdccc0..ad79d1cbb9c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -787,7 +787,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0,
    }
 
    while (num_instances--) {
-      PUSH_SPACE(push, 8);
+      nouveau_pushbuf_space(push, 9, 0, 1);
       BEGIN_NVC0(push, NVC0_3D(VERTEX_BEGIN_GL), 1);
       PUSH_DATA (push, mode);
       BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_BASE), 1);
@@ -807,19 +807,31 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nv04_resource *buf = nv04_resource(info->indirect);
-   unsigned size;
-   const uint32_t offset = buf->offset + info->indirect_offset;
+   struct nv04_resource *buf_count = nv04_resource(info->indirect_params);
+   unsigned size, macro, count = info->indirect_count, drawid = info->drawid;
+   uint32_t offset = buf->offset + info->indirect_offset;
 
    /* must make FIFO wait for engines idle before continuing to process */
-   if (buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr))
+   if ((buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr)) ||
+       (buf_count && buf_count->fence_wr &&
+        !nouveau_fence_signalled(buf_count->fence_wr))) {
       IMMED_NVC0(push, SUBC_3D(NV10_SUBCHAN_REF_CNT), 0);
+   }
+
+   /* Queue things up to let the macros write params to the driver constbuf */
+   BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+   PUSH_DATA (push, 512);
+   PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
+   PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
 
-   PUSH_SPACE(push, 8);
    if (info->indexed) {
       assert(nvc0->idxbuf.buffer);
       assert(nouveau_resource_mapped_by_gpu(nvc0->idxbuf.buffer));
-      size = 5 * 4;
-      BEGIN_1IC0(push, NVC0_3D(MACRO_DRAW_ELEMENTS_INDIRECT), 1 + size / 4);
+      size = 5;
+      if (buf_count)
+         macro = NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT;
+      else
+         macro = NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT;
    } else {
       if (nvc0->state.index_bias) {
          /* index_bias is implied 0 if !info->indexed (really ?) */
@@ -827,15 +839,59 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
          IMMED_NVC0(push, NVC0_3D(VERTEX_ID_BASE), 0);
          nvc0->state.index_bias = 0;
       }
-      size = 4 * 4;
-      BEGIN_1IC0(push, NVC0_3D(MACRO_DRAW_ARRAYS_INDIRECT), 1 + size / 4);
-   }
-   PUSH_DATA(push, nvc0_prim_gl(info->mode));
-#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
-   PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
-   nouveau_pushbuf_space(push, 0, 0, 1);
-   nouveau_pushbuf_data(push,
-                        buf->bo, offset, NVC0_IB_ENTRY_1_NO_PREFETCH | size);
+      size = 4;
+      if (buf_count)
+         macro = NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT;
+      else
+         macro = NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT;
+   }
+
+   /* If the stride is not the natural stride, we have to stick a separate
+    * push data reference for each draw. Otherwise it can all go in as one.
+    * Of course there is a maximum packet size, so we have to break things up
+    * along those borders as well.
+    */
+   while (count) {
+      unsigned draws = count, pushes, i;
+      if (info->indirect_stride == size * 4) {
+         draws = MIN2(draws, (NV04_PFIFO_MAX_PACKET_LEN - 4) / size);
+         pushes = 1;
+      } else {
+         draws = MIN2(draws, 32);
+         pushes = draws;
+      }
+
+      nouveau_pushbuf_space(push, 16, 0, pushes + !!buf_count);
+      PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
+      if (buf_count)
+         PUSH_REFN(push, buf_count->bo, NOUVEAU_BO_RD | buf_count->domain);
+      PUSH_DATA(push,
+                NVC0_FIFO_PKHDR_1I(0, macro, 3 + !!buf_count + draws * size));
+      PUSH_DATA(push, nvc0_prim_gl(info->mode));
+      PUSH_DATA(push, drawid);
+      PUSH_DATA(push, draws);
+      if (buf_count) {
+         nouveau_pushbuf_data(push,
+                              buf_count->bo,
+                              buf_count->offset + info->indirect_params_offset,
+                              NVC0_IB_ENTRY_1_NO_PREFETCH | 4);
+      }
+      if (pushes == 1) {
+         nouveau_pushbuf_data(push,
+                              buf->bo, offset,
+                              NVC0_IB_ENTRY_1_NO_PREFETCH | (size * 4 * draws));
+         offset += draws * info->indirect_stride;
+      } else {
+         for (i = 0; i < pushes; i++) {
+            nouveau_pushbuf_data(push,
+                                 buf->bo, offset,
+                                 NVC0_IB_ENTRY_1_NO_PREFETCH | (size * 4));
+            offset += info->indirect_stride;
+         }
+      }
+      count -= draws;
+      drawid += draws;
+   }
 }
 
 static inline void
@@ -864,7 +920,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   int i, s;
+   int s;
 
    /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */
    nvc0->vb_elt_first = info->min_index + info->index_bias;
@@ -901,29 +957,25 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    /* 8 as minimum to avoid immediate double validation of new buffers */
    nvc0_state_validate(nvc0, ~0, 8);
 
+   if (nvc0->vertprog->vp.need_draw_parameters) {
+      BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+      PUSH_DATA (push, 512);
+      PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
+      PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
+      if (!info->indirect) {
+         BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3);
+         PUSH_DATA (push, 256 + 128);
+         PUSH_DATA (push, info->index_bias);
+         PUSH_DATA (push, info->start_instance);
+         PUSH_DATA (push, info->drawid);
+      }
+   }
+
    push->kick_notify = nvc0_draw_vbo_kick_notify;
 
-   /* TODO: Instead of iterating over all the buffer resources looking for
-    * coherent buffers, keep track of a context-wide count.
-    */
    for (s = 0; s < 5 && !nvc0->cb_dirty; ++s) {
-      uint32_t valid = nvc0->constbuf_valid[s];
-
-      while (valid && !nvc0->cb_dirty) {
-         const unsigned i = ffs(valid) - 1;
-         struct pipe_resource *res;
-
-         valid &= ~(1 << i);
-         if (nvc0->constbuf[s][i].user)
-            continue;
-
-         res = nvc0->constbuf[s][i].u.buf;
-         if (!res)
-            continue;
-
-         if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-            nvc0->cb_dirty = true;
-      }
+      if (nvc0->constbuf_coherent[s])
+         nvc0->cb_dirty = true;
    }
 
    if (nvc0->cb_dirty) {
@@ -932,14 +984,12 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    }
 
    for (s = 0; s < 5; ++s) {
+      if (!nvc0->textures_coherent[s])
+         continue;
+
       for (int i = 0; i < nvc0->num_textures[s]; ++i) {
          struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
-         struct pipe_resource *res;
-         if (!tic)
-            continue;
-         res = nvc0->textures[s][i]->texture;
-         if (res->target != PIPE_BUFFER ||
-             !(res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT))
+         if (!(nvc0->textures_coherent[s] & (1 << i)))
             continue;
 
          BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1);
@@ -965,12 +1015,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       PUSH_DATA (push, info->start_instance);
    }
 
-   for (i = 0; i < nvc0->num_vtxbufs && !nvc0->base.vbo_dirty; ++i) {
-      if (!nvc0->vtxbuf[i].buffer)
-         continue;
-      if (nvc0->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-         nvc0->base.vbo_dirty = true;
-   }
+   nvc0->base.vbo_dirty |= !!nvc0->vtxbufs_coherent;
 
    if (!nvc0->base.vbo_dirty && nvc0->idxbuf.buffer &&
        nvc0->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
index 48ffac1b715..a9fd1d20942 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
@@ -26,6 +26,24 @@
 #include "util/u_format.h"
 
 static void
+nvc0_decoder_begin_frame(struct pipe_video_codec *decoder,
+                         struct pipe_video_buffer *target,
+                         struct pipe_picture_desc *picture)
+{
+   struct nouveau_vp3_decoder *dec = (struct nouveau_vp3_decoder *)decoder;
+   uint32_t comm_seq = ++dec->fence_seq;
+   unsigned ret = 0;
+
+   assert(dec);
+   assert(target);
+   assert(target->buffer_format == PIPE_FORMAT_NV12);
+
+   ret = nvc0_decoder_bsp_begin(dec, comm_seq);
+
+   assert(ret == 2);
+}
+
+static void
 nvc0_decoder_decode_bitstream(struct pipe_video_codec *decoder,
                               struct pipe_video_buffer *video_target,
                               struct pipe_picture_desc *picture,
@@ -34,8 +52,24 @@ nvc0_decoder_decode_bitstream(struct pipe_video_codec *decoder,
                               const unsigned *num_bytes)
 {
    struct nouveau_vp3_decoder *dec = (struct nouveau_vp3_decoder *)decoder;
+   uint32_t comm_seq = dec->fence_seq;
+   unsigned ret = 0;
+
+   assert(decoder);
+
+   ret = nvc0_decoder_bsp_next(dec, comm_seq, num_buffers, data, num_bytes);
+
+   assert(ret == 2);
+}
+
+static void
+nvc0_decoder_end_frame(struct pipe_video_codec *decoder,
+                       struct pipe_video_buffer *video_target,
+                       struct pipe_picture_desc *picture)
+{
+   struct nouveau_vp3_decoder *dec = (struct nouveau_vp3_decoder *)decoder;
    struct nouveau_vp3_video_buffer *target = (struct nouveau_vp3_video_buffer *)video_target;
-   uint32_t comm_seq = ++dec->fence_seq;
+   uint32_t comm_seq = dec->fence_seq;
    union pipe_desc desc;
 
    unsigned vp_caps, is_ref, ret;
@@ -43,11 +77,7 @@ nvc0_decoder_decode_bitstream(struct pipe_video_codec *decoder,
 
    desc.base = picture;
 
-   assert(target->base.buffer_format == PIPE_FORMAT_NV12);
-
-   ret = nvc0_decoder_bsp(dec, desc, target, comm_seq,
-                          num_buffers, data, num_bytes,
-                          &vp_caps, &is_ref, refs);
+   ret = nvc0_decoder_bsp_end(dec, desc, target, comm_seq, &vp_caps, &is_ref, refs);
 
    /* did we decode bitstream correctly? */
    assert(ret == 2);
@@ -164,14 +194,19 @@ nvc0_create_decoder(struct pipe_context *context,
    PUSH_DATA (push[2], dec->ppp->handle);
 
    dec->base.context = context;
+   dec->base.begin_frame = nvc0_decoder_begin_frame;
    dec->base.decode_bitstream = nvc0_decoder_decode_bitstream;
+   dec->base.end_frame = nvc0_decoder_end_frame;
 
    for (i = 0; i < NOUVEAU_VP3_VIDEO_QDEPTH && !ret; ++i)
       ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
                            0, 1 << 20, &cfg, &dec->bsp_bo[i]);
-   if (!ret)
+   if (!ret) {
+      /* total fudge factor... just has to be bigger for higher bitrates? */
+      unsigned inter_size = align(templ->width * templ->height * 2, 4 << 20);
       ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
-                           0x100, 4 << 20, &cfg, &dec->inter_bo[0]);
+                           0x100, inter_size, &cfg, &dec->inter_bo[0]);
+   }
    if (!ret) {
       ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
                            0x100, dec->inter_bo[0]->size, &cfg,
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video.h b/src/gallium/drivers/nouveau/nvc0/nvc0_video.h
index 9ee0280f8ea..cf3c942355b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video.h
@@ -30,12 +30,18 @@
 #include "util/u_video.h"
 
 extern unsigned
-nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
-                 struct nouveau_vp3_video_buffer *target,
-                 unsigned comm_seq, unsigned num_buffers,
-                 const void *const *data, const unsigned *num_bytes,
-                 unsigned *vp_caps, unsigned *is_ref,
-                 struct nouveau_vp3_video_buffer *refs[16]);
+nvc0_decoder_bsp_begin(struct nouveau_vp3_decoder *dec, unsigned comm_seq);
+
+extern unsigned
+nvc0_decoder_bsp_next(struct nouveau_vp3_decoder *dec,
+                      unsigned comm_seq, unsigned num_buffers,
+                      const void *const *data, const unsigned *num_bytes);
+
+extern unsigned
+nvc0_decoder_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
+                     struct nouveau_vp3_video_buffer *target,
+                     unsigned comm_seq, unsigned *vp_caps, unsigned *is_ref,
+                     struct nouveau_vp3_video_buffer *refs[16]);
 
 extern void
 nvc0_decoder_vp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
index 4392f62c530..c53f946a762 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
@@ -32,40 +32,34 @@ static void dump_comm_bsp(struct comm *comm)
 #endif
 
 unsigned
-nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
-                 struct nouveau_vp3_video_buffer *target,
-                 unsigned comm_seq, unsigned num_buffers,
-                 const void *const *data, const unsigned *num_bytes,
-                 unsigned *vp_caps, unsigned *is_ref,
-                 struct nouveau_vp3_video_buffer *refs[16])
+nvc0_decoder_bsp_begin(struct nouveau_vp3_decoder *dec, unsigned comm_seq)
 {
-   struct nouveau_pushbuf *push = dec->pushbuf[0];
-   enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
-   uint32_t bsp_addr, comm_addr, inter_addr;
-   uint32_t slice_size, bucket_size, ring_size, bsp_size;
-   uint32_t caps, i;
-   int ret;
    struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
-   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
-   unsigned fence_extra = 0;
-   struct nouveau_pushbuf_refn bo_refs[] = {
-      { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
-      { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
-#if NOUVEAU_VP3_DEBUG_FENCE
-      { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART },
-#endif
-      { dec->bitplane_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
-   };
-   int num_refs = ARRAY_SIZE(bo_refs);
+   unsigned ret = 0;
 
-   if (!dec->bitplane_bo)
-      num_refs--;
+   ret = nouveau_bo_map(bsp_bo, NOUVEAU_BO_WR, dec->client);
+   if (ret) {
+      debug_printf("map failed: %i %s\n", ret, strerror(-ret));
+      return -1;
+   }
 
-#if NOUVEAU_VP3_DEBUG_FENCE
-   fence_extra = 4;
-#endif
+   nouveau_vp3_bsp_begin(dec);
+
+   return 2;
+}
 
-   bsp_size = NOUVEAU_VP3_BSP_RESERVED_SIZE;
+unsigned
+nvc0_decoder_bsp_next(struct nouveau_vp3_decoder *dec,
+                      unsigned comm_seq, unsigned num_buffers,
+                      const void *const *data, const unsigned *num_bytes)
+{
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
+   uint32_t bsp_size = 0;
+   uint32_t i = 0;
+   unsigned ret = 0;
+
+   bsp_size = dec->bsp_ptr - (char *)bsp_bo->map;
    for (i = 0; i < num_buffers; i++)
       bsp_size += num_bytes[i];
    bsp_size += 256; /* the 4 end markers */
@@ -81,14 +75,29 @@ nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
       bsp_size += (1 << 20) - 1;
       bsp_size &= ~((1 << 20) - 1);
 
-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_size, &cfg, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_size, &cfg, &tmp_bo);
       if (ret) {
          debug_printf("reallocating bsp %u -> %u failed with %i\n",
                       bsp_bo ? (unsigned)bsp_bo->size : 0, bsp_size, ret);
          return -1;
       }
+
+      ret = nouveau_bo_map(tmp_bo, NOUVEAU_BO_WR, dec->client);
+      if (ret) {
+         debug_printf("map failed: %i %s\n", ret, strerror(-ret));
+         return -1;
+      }
+
+      /* Preserve previous buffer. */
+      /* TODO: offload this copy to the GPU, as otherwise we're reading and
+       * writing to VRAM. */
+      memcpy(tmp_bo->map, bsp_bo->map, bsp_bo->size);
+
+      /* update position to current chunk */
+      dec->bsp_ptr = tmp_bo->map + (dec->bsp_ptr - (char *)bsp_bo->map);
+
       nouveau_bo_ref(NULL, &bsp_bo);
-      bo_refs[0].bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH] = bsp_bo = tmp_bo;
+      dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH] = bsp_bo = tmp_bo;
    }
 
    if (!inter_bo || bsp_bo->size * 4 > inter_bo->size) {
@@ -98,24 +107,61 @@ nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
       cfg.nvc0.tile_mode = 0x10;
       cfg.nvc0.memtype = 0xfe;
 
-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, &cfg, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, &cfg, &tmp_bo);
       if (ret) {
          debug_printf("reallocating inter %u -> %u failed with %i\n",
                       inter_bo ? (unsigned)inter_bo->size : 0, (unsigned)bsp_bo->size * 4, ret);
          return -1;
       }
+
+      ret = nouveau_bo_map(tmp_bo, NOUVEAU_BO_WR, dec->client);
+      if (ret) {
+         debug_printf("map failed: %i %s\n", ret, strerror(-ret));
+         return -1;
+      }
+
       nouveau_bo_ref(NULL, &inter_bo);
-      bo_refs[1].bo = dec->inter_bo[comm_seq & 1] = inter_bo = tmp_bo;
+      dec->inter_bo[comm_seq & 1] = inter_bo = tmp_bo;
    }
 
-   ret = nouveau_bo_map(bsp_bo, NOUVEAU_BO_WR, dec->client);
-   if (ret) {
-      debug_printf("map failed: %i %s\n", ret, strerror(-ret));
-      return -1;
-   }
+   nouveau_vp3_bsp_next(dec, num_buffers, data, num_bytes);
+
+   return 2;
+}
+
+
+unsigned
+nvc0_decoder_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
+                     struct nouveau_vp3_video_buffer *target, unsigned comm_seq,
+                     unsigned *vp_caps, unsigned *is_ref,
+                     struct nouveau_vp3_video_buffer *refs[16])
+{
+   struct nouveau_pushbuf *push = dec->pushbuf[0];
+   enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
+   uint32_t bsp_addr, comm_addr, inter_addr;
+   uint32_t slice_size, bucket_size, ring_size;
+   uint32_t caps;
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
+   unsigned fence_extra = 0;
+   struct nouveau_pushbuf_refn bo_refs[] = {
+      { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
+      { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
+#if NOUVEAU_VP3_DEBUG_FENCE
+      { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART },
+#endif
+      { dec->bitplane_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+   };
+   int num_refs = ARRAY_SIZE(bo_refs);
+
+   if (!dec->bitplane_bo)
+      num_refs--;
+
+#if NOUVEAU_VP3_DEBUG_FENCE
+   fence_extra = 4;
+#endif
 
-   caps = nouveau_vp3_bsp(dec, desc, target, comm_seq,
-                          num_buffers, data, num_bytes);
+   caps = nouveau_vp3_bsp_end(dec, desc);
 
    nouveau_vp3_vp_caps(dec, desc, target, comm_seq, vp_caps, is_ref, refs);
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
index 4ea8ca3cfa2..79abe78b77a 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
@@ -68,6 +68,7 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 #define SUBC_SW(m) 7, (m)
 
 #define NVC0_3D_SERIALIZE NV50_GRAPH_SERIALIZE
+#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
 
 static inline uint32_t
 NVC0_FIFO_PKHDR_SQ(int subc, int mthd, unsigned size)
diff --git a/src/gallium/drivers/r300/compiler/r500_fragprog.c b/src/gallium/drivers/r300/compiler/r500_fragprog.c
index 88aad8a054f..4c415afcb05 100644
--- a/src/gallium/drivers/r300/compiler/r500_fragprog.c
+++ b/src/gallium/drivers/r300/compiler/r500_fragprog.c
@@ -384,7 +384,7 @@ void r500FragmentProgramDump(struct radeon_compiler *c, void *user)
     case R500_INST_TYPE_OUT: str = "OUT"; break;
     case R500_INST_TYPE_FC: str = "FC"; break;
     case R500_INST_TYPE_TEX: str = "TEX"; break;
-    };
+    }
     fprintf(stderr,"%s %s %s %s %s ", str,
 	    inst & R500_INST_TEX_SEM_WAIT ? "TEX_WAIT" : "",
 	    inst & R500_INST_LAST ? "LAST" : "",
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index b393769c861..82ba0435118 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -421,8 +421,8 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     r300->context.create_video_codec = vl_create_decoder;
     r300->context.create_video_buffer = vl_video_buffer_create;
 
-    r300->uploader = u_upload_create(&r300->context, 256 * 1024, 4,
-                                     PIPE_BIND_CUSTOM);
+    r300->uploader = u_upload_create(&r300->context, 256 * 1024,
+                                     PIPE_BIND_CUSTOM, PIPE_USAGE_STREAM);
 
     r300->blitter = util_blitter_create(&r300->context);
     if (r300->blitter == NULL)
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index b482fa140ed..7eda6753d0d 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -1010,7 +1010,7 @@ static void r300_render_draw_elements(struct vbuf_render* render,
     CS_LOCALS(r300);
     DBG(r300, DBG_DRAW, "r300: render_draw_elements (count: %d)\n", count);
 
-    u_upload_data(r300->uploader, 0, count * 2, indices,
+    u_upload_data(r300->uploader, 0, count * 2, 4, indices,
                   &index_buffer_offset, &index_buffer);
     if (!index_buffer) {
         return;
diff --git a/src/gallium/drivers/r300/r300_render_translate.c b/src/gallium/drivers/r300/r300_render_translate.c
index caeeec05909..7221211deea 100644
--- a/src/gallium/drivers/r300/r300_render_translate.c
+++ b/src/gallium/drivers/r300/r300_render_translate.c
@@ -37,7 +37,7 @@ void r300_translate_index_buffer(struct r300_context *r300,
     switch (*index_size) {
     case 1:
         *out_buffer = NULL;
-        u_upload_alloc(r300->uploader, 0, count * 2,
+        u_upload_alloc(r300->uploader, 0, count * 2, 4,
                        &out_offset, out_buffer, &ptr);
 
         util_shorten_ubyte_elts_to_userptr(
@@ -51,7 +51,7 @@ void r300_translate_index_buffer(struct r300_context *r300,
     case 2:
         if (index_offset) {
             *out_buffer = NULL;
-            u_upload_alloc(r300->uploader, 0, count * 2,
+            u_upload_alloc(r300->uploader, 0, count * 2, 4,
                            &out_offset, out_buffer, &ptr);
 
             util_rebuild_ushort_elts_to_userptr(&r300->context, ib,
@@ -65,7 +65,7 @@ void r300_translate_index_buffer(struct r300_context *r300,
     case 4:
         if (index_offset) {
             *out_buffer = NULL;
-            u_upload_alloc(r300->uploader, 0, count * 4,
+            u_upload_alloc(r300->uploader, 0, count * 4, 4,
                            &out_offset, out_buffer, &ptr);
 
             util_rebuild_uint_elts_to_userptr(&r300->context, ib,
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 606e25f915b..d1b59ab4345 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -183,6 +183,8 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_SAMPLE_SHADING:
         case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
         case PIPE_CAP_DRAW_INDIRECT:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
         case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
         case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
         case PIPE_CAP_SAMPLER_VIEW_TARGET:
@@ -200,6 +202,11 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_SHAREABLE_SHADERS:
         case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
         case PIPE_CAP_CLEAR_TEXTURE:
+        case PIPE_CAP_DRAW_PARAMETERS:
+        case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+        case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+        case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+        case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
             return 0;
 
         /* SWTCL-only features. */
@@ -304,6 +311,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
         case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
             return 0;
         case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
             return 32;
@@ -362,6 +370,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
         case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
             return 0;
         case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
             return 32;
diff --git a/src/gallium/drivers/r300/r300_screen_buffer.c b/src/gallium/drivers/r300/r300_screen_buffer.c
index 737a6f5e4f8..42c8e3a0fc5 100644
--- a/src/gallium/drivers/r300/r300_screen_buffer.c
+++ b/src/gallium/drivers/r300/r300_screen_buffer.c
@@ -42,7 +42,7 @@ void r300_upload_index_buffer(struct r300_context *r300,
     *index_buffer = NULL;
 
     u_upload_data(r300->uploader,
-                  0, count * index_size,
+                  0, count * index_size, 4,
                   ptr + (*start * index_size),
                   &index_offset,
                   index_buffer);
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index da472f4d7f4..741e263e7ed 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -52,7 +52,6 @@ enum r300_rs_col_write_type {
 
 static void r300_draw_emit_attrib(struct r300_context* r300,
                                   enum attrib_emit emit,
-                                  enum interp_mode interp,
                                   int index)
 {
     struct r300_vertex_shader* vs = r300->vs_state.state;
@@ -62,7 +61,7 @@ static void r300_draw_emit_attrib(struct r300_context* r300,
     output = draw_find_shader_output(r300->draw,
                                      info->output_semantic_name[index],
                                      info->output_semantic_index[index]);
-    draw_emit_vertex_attr(&r300->vertex_info, emit, interp, output);
+    draw_emit_vertex_attr(&r300->vertex_info, emit, output);
 }
 
 static void r300_draw_emit_all_attribs(struct r300_context* r300)
@@ -73,31 +72,27 @@ static void r300_draw_emit_all_attribs(struct r300_context* r300)
 
     /* Position. */
     if (vs_outputs->pos != ATTR_UNUSED) {
-        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
-                              vs_outputs->pos);
+        r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->pos);
     } else {
         assert(0);
     }
 
     /* Point size. */
     if (vs_outputs->psize != ATTR_UNUSED) {
-        r300_draw_emit_attrib(r300, EMIT_1F_PSIZE, INTERP_POS,
-                              vs_outputs->psize);
+        r300_draw_emit_attrib(r300, EMIT_1F_PSIZE, vs_outputs->psize);
     }
 
     /* Colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
         if (vs_outputs->color[i] != ATTR_UNUSED) {
-            r300_draw_emit_attrib(r300, EMIT_4F, INTERP_LINEAR,
-                                  vs_outputs->color[i]);
+            r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->color[i]);
         }
     }
 
     /* Back-face colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
         if (vs_outputs->bcolor[i] != ATTR_UNUSED) {
-            r300_draw_emit_attrib(r300, EMIT_4F, INTERP_LINEAR,
-                                  vs_outputs->bcolor[i]);
+            r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->bcolor[i]);
         }
     }
 
@@ -108,16 +103,14 @@ static void r300_draw_emit_all_attribs(struct r300_context* r300)
     for (i = 0; i < ATTR_GENERIC_COUNT && gen_count < 8; i++) {
         if (vs_outputs->generic[i] != ATTR_UNUSED &&
             !(r300->sprite_coord_enable & (1 << i))) {
-            r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
-                                  vs_outputs->generic[i]);
+            r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->generic[i]);
             gen_count++;
         }
     }
 
     /* Fog coordinates. */
     if (gen_count < 8 && vs_outputs->fog != ATTR_UNUSED) {
-        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
-                              vs_outputs->fog);
+        r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->fog);
         gen_count++;
     }
 
@@ -125,8 +118,7 @@ static void r300_draw_emit_all_attribs(struct r300_context* r300)
     if (r300_fs(r300)->shader->inputs.wpos != ATTR_UNUSED && gen_count < 8) {
         DBG(r300, DBG_SWTCL, "draw_emit_attrib: WPOS, index: %i\n",
             vs_outputs->wpos);
-        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
-                              vs_outputs->wpos);
+        r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->wpos);
     }
 }
 
diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index d83eb17c280..20945ece155 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -600,7 +600,7 @@ static void evergreen_launch_grid(
                            ctx->screen->has_compressed_msaa_texturing);
                 bc->type = TGSI_PROCESSOR_COMPUTE;
                 bc->isa = ctx->isa;
-                r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);
+                r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump, &ctx->b.debug);
 
                 if (dump && !sb_disasm) {
                         r600_bytecode_disasm(bc);
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 1aee7dd2da8..9dfb84965cf 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -1956,7 +1956,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx,
 
 		if (!gs_ring_buffer) {
 			radeon_set_context_reg_flag(cs, reg_alu_constbuf_size + buffer_index * 4,
-						    ALIGN_DIVUP(cb->buffer_size, 256), pkt_flags);
+						    DIV_ROUND_UP(cb->buffer_size, 256), pkt_flags);
 			radeon_set_context_reg_flag(cs, reg_alu_const_cache + buffer_index * 4, va >> 8,
 						    pkt_flags);
 		}
diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
index 1cc30317ba5..8b91372f3ae 100644
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ b/src/gallium/drivers/r600/r600_llvm.c
@@ -726,7 +726,7 @@ static void tex_fetch_args(
 		 * That operand should be passed as a float value in the args array
 		 * right after the coord vector. After packing it's not used anymore,
 		 * that's why arg_count is not increased */
-		coords[4] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+		coords[4] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
 	}
 
 	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
@@ -915,14 +915,17 @@ unsigned r600_llvm_compile(
 	enum radeon_family family,
 	struct r600_bytecode *bc,
 	boolean *use_kill,
-	unsigned dump)
+	unsigned dump,
+	struct pipe_debug_callback *debug)
 {
 	unsigned r;
 	struct radeon_shader_binary binary;
 	const char * gpu_family = r600_get_llvm_processor_name(family);
 
 	memset(&binary, 0, sizeof(struct radeon_shader_binary));
-	r = radeon_llvm_compile(mod, &binary, gpu_family, dump, dump, NULL);
+	if (dump)
+		LLVMDumpModule(mod);
+	r = radeon_llvm_compile(mod, &binary, gpu_family, NULL, debug);
 
 	r = r600_create_shader(bc, &binary, use_kill);
 
diff --git a/src/gallium/drivers/r600/r600_llvm.h b/src/gallium/drivers/r600/r600_llvm.h
index 9b5304d9fcb..f570b739fbe 100644
--- a/src/gallium/drivers/r600/r600_llvm.h
+++ b/src/gallium/drivers/r600/r600_llvm.h
@@ -7,6 +7,7 @@
 #include "radeon/radeon_llvm.h"
 #include <llvm-c/Core.h>
 
+struct pipe_debug_callback;
 struct r600_bytecode;
 struct r600_shader_ctx;
 struct radeon_llvm_context;
@@ -22,7 +23,8 @@ unsigned r600_llvm_compile(
 	enum radeon_family family,
 	struct r600_bytecode *bc,
 	boolean *use_kill,
-	unsigned dump);
+	unsigned dump,
+	struct pipe_debug_callback *debug);
 
 unsigned r600_create_shader(struct r600_bytecode *bc,
 		const struct radeon_shader_binary *binary,
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 17006f70601..e61d9286542 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -348,6 +348,13 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
 	case PIPE_CAP_SHAREABLE_SHADERS:
 	case PIPE_CAP_CLEAR_TEXTURE:
+	case PIPE_CAP_DRAW_PARAMETERS:
+	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+	case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
@@ -522,6 +529,7 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
 	case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
 		return 0;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		/* due to a bug in the shader compiler, some loops hang
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 31f2a729494..0e4dd16525b 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -946,7 +946,6 @@ static inline uint32_t S_FIXED(float value, uint32_t frac_bits)
 {
 	return value * (1 << frac_bits);
 }
-#define ALIGN_DIVUP(x, y) (((x) + (y) - 1) / (y))
 
 /* 12.4 fixed-point */
 static inline unsigned r600_pack_float_12p4(float x)
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index d411b0be50e..df40f94bdcf 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -162,7 +162,8 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_pipe_shader_selector *sel = shader->selector;
 	int r;
-	bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
+	bool dump = r600_can_dump_shader(&rctx->screen->b,
+					 tgsi_get_processor_type(sel->tokens));
 	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
 	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
 	unsigned export_shader;
@@ -394,7 +395,7 @@ static int tgsi_last_instruction(unsigned writemask)
 static int tgsi_is_supported(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
-	int j;
+	unsigned j;
 
 	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
 		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
@@ -1166,7 +1167,7 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_off
 */
 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
 {
-	int i;
+	unsigned i;
 	int num_baryc;
 	struct tgsi_parse_context parse;
 
@@ -1585,7 +1586,7 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
-	int i;
+	unsigned i;
 
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 		struct tgsi_full_src_register *src = &inst->Src[i];
@@ -1854,7 +1855,7 @@ static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_re
 static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
-	int i;
+	unsigned i;
 
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 		struct tgsi_full_src_register *src = &inst->Src[i];
@@ -2784,7 +2785,7 @@ static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
 
 static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
 {
-	int i;
+	unsigned i;
 	int stride, outer_comps, inner_comps;
 	int tessinner_idx = -1, tessouter_idx = -1;
 	int r;
@@ -3238,7 +3239,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	if (use_llvm) {
 		struct radeon_llvm_context radeon_llvm_ctx;
 		LLVMModuleRef mod;
-		bool dump = r600_can_dump_shader(&rscreen->b, tokens);
+		bool dump = r600_can_dump_shader(&rscreen->b,
+						 tgsi_get_processor_type(tokens));
 		boolean use_kill = false;
 
 		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
@@ -3259,7 +3261,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
 		ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;
 
-		if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) {
+		if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill,
+				      dump, &rctx->b.debug)) {
 			radeon_llvm_dispose(&radeon_llvm_ctx);
 			use_llvm = 0;
 			fprintf(stderr, "R600 LLVM backend failed to compile "
@@ -4424,7 +4427,7 @@ static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 			alu.op = ctx->inst_info->op;
 			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
-				r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));;
+				r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
 			}
 			alu.dst.sel = t1;
 			alu.dst.chan = i;
@@ -4791,7 +4794,7 @@ static int tgsi_lit(struct r600_shader_ctx *ctx)
 	{
 		int chan;
 		int sel;
-		int i;
+		unsigned i;
 
 		if (ctx->bc->chip_class == CAYMAN) {
 			for (i = 0; i < 3; i++) {
@@ -7925,7 +7928,7 @@ static int tgsi_exp(struct r600_shader_ctx *ctx)
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
 	struct r600_bytecode_alu alu;
 	int r;
-	int i;
+	unsigned i;
 
 	/* result.x = 2^floor(src); */
 	if (inst->Dst[0].Register.WriteMask & 1) {
@@ -8054,7 +8057,7 @@ static int tgsi_log(struct r600_shader_ctx *ctx)
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
 	struct r600_bytecode_alu alu;
 	int r;
-	int i;
+	unsigned i;
 
 	/* result.x = floor(log2(|src|)); */
 	if (inst->Dst[0].Register.WriteMask & 1) {
@@ -8781,7 +8784,7 @@ static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
 
 static int tgsi_endloop(struct r600_shader_ctx *ctx)
 {
-	int i;
+	unsigned i;
 
 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
 
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 43b80742cb5..f60e30486a2 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -1768,7 +1768,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx,
 
 		if (!gs_ring_buffer) {
 			radeon_set_context_reg(cs, reg_alu_constbuf_size + buffer_index * 4,
-					       ALIGN_DIVUP(cb->buffer_size, 256));
+					       DIV_ROUND_UP(cb->buffer_size, 256));
 			radeon_set_context_reg(cs, reg_alu_const_cache + buffer_index * 4, offset >> 8);
 		}
 
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index ca589fa7759..c3346f29811 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1106,10 +1106,10 @@ static void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint
 				tmpPtr[i] = util_cpu_to_le32(((uint32_t *)ptr)[i]);
 			}
 
-			u_upload_data(rctx->b.uploader, 0, size, tmpPtr, &cb->buffer_offset, &cb->buffer);
+			u_upload_data(rctx->b.uploader, 0, size, 256, tmpPtr, &cb->buffer_offset, &cb->buffer);
 			free(tmpPtr);
 		} else {
-			u_upload_data(rctx->b.uploader, 0, input->buffer_size, ptr, &cb->buffer_offset, &cb->buffer);
+			u_upload_data(rctx->b.uploader, 0, input->buffer_size, 256, ptr, &cb->buffer_offset, &cb->buffer);
 		}
 		/* account it in gtt */
 		rctx->b.gtt += input->buffer_size;
@@ -1732,7 +1732,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 				}
 			}
 
-			u_upload_alloc(rctx->b.uploader, start, count * 2,
+			u_upload_alloc(rctx->b.uploader, start, count * 2, 256,
 				       &out_offset, &out_buffer, &ptr);
 
 			util_shorten_ubyte_elts_to_userptr(
@@ -1753,7 +1753,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 		if (ib.user_buffer && (R600_BIG_ENDIAN || info.indirect ||
 						 info.instance_count > 1 ||
 						 info.count*ib.index_size > 20)) {
-			u_upload_data(rctx->b.uploader, 0, info.count * ib.index_size,
+			u_upload_data(rctx->b.uploader, 0, info.count * ib.index_size, 256,
 				      ib.user_buffer, &ib.offset, &ib.buffer);
 			ib.user_buffer = NULL;
 		}
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index 18925277d2d..484f5c8d5b7 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -298,7 +298,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 			struct r600_resource *staging = NULL;
 
 			u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
-				       &offset, (struct pipe_resource**)&staging, (void**)&data);
+				       256, &offset, (struct pipe_resource**)&staging, (void**)&data);
 
 			if (staging) {
 				data += box->x % R600_MAP_BUFFER_ALIGNMENT;
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 9a5e9878176..52c365e81d0 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -85,7 +85,7 @@ void r600_draw_rectangle(struct blitter_context *blitter,
 	/* Upload vertices. The hw rectangle has only 3 vertices,
 	 * I guess the 4th one is derived from the first 3.
 	 * The vertex specification should match u_blitter's vertex element state. */
-	u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, &offset, &buf, (void**)&vb);
+	u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, 256, &offset, &buf, (void**)&vb);
 	if (!buf)
 		return;
 
@@ -227,6 +227,17 @@ static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
 	return PIPE_UNKNOWN_CONTEXT_RESET;
 }
 
+static void r600_set_debug_callback(struct pipe_context *ctx,
+				    const struct pipe_debug_callback *cb)
+{
+	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+
+	if (cb)
+		rctx->debug = *cb;
+	else
+		memset(&rctx->debug, 0, sizeof(rctx->debug));
+}
+
 bool r600_common_context_init(struct r600_common_context *rctx,
 			      struct r600_common_screen *rscreen)
 {
@@ -252,6 +263,7 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	rctx->b.transfer_inline_write = u_default_transfer_inline_write;
         rctx->b.memory_barrier = r600_memory_barrier;
 	rctx->b.flush = r600_flush_from_st;
+	rctx->b.set_debug_callback = r600_set_debug_callback;
 
 	if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 43) {
 		rctx->b.get_device_reset_status = r600_get_reset_status;
@@ -272,9 +284,9 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	if (!rctx->allocator_so_filled_size)
 		return false;
 
-	rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024, 256,
+	rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024,
 					PIPE_BIND_INDEX_BUFFER |
-					PIPE_BIND_CONSTANT_BUFFER);
+					PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM);
 	if (!rctx->uploader)
 		return false;
 
@@ -999,13 +1011,9 @@ void r600_destroy_common_screen(struct r600_common_screen *rscreen)
 }
 
 bool r600_can_dump_shader(struct r600_common_screen *rscreen,
-			  const struct tgsi_token *tokens)
+			  unsigned processor)
 {
-	/* Compute shader don't have tgsi_tokens */
-	if (!tokens)
-		return (rscreen->debug_flags & DBG_CS) != 0;
-
-	switch (tgsi_get_processor_type(tokens)) {
+	switch (processor) {
 	case TGSI_PROCESSOR_VERTEX:
 		return (rscreen->debug_flags & DBG_VS) != 0;
 	case TGSI_PROCESSOR_TESS_CTRL:
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index c3933b1da98..68b50a9fb0f 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -440,6 +440,8 @@ struct r600_common_context {
 	 * the GPU addresses are updated. */
 	struct list_head		texture_buffers;
 
+	struct pipe_debug_callback	debug;
+
 	/* Copy one resource to another using async DMA. */
 	void (*dma_copy)(struct pipe_context *ctx,
 			 struct pipe_resource *dst,
@@ -514,7 +516,7 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 void r600_common_context_cleanup(struct r600_common_context *rctx);
 void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r);
 bool r600_can_dump_shader(struct r600_common_screen *rscreen,
-			  const struct tgsi_token *tokens);
+			  unsigned processor);
 void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst,
 			      unsigned offset, unsigned size, unsigned value,
 			      bool is_framebuffer);
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index 61ed9402122..3d0987624a6 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -23,12 +23,15 @@
  * Authors: Tom Stellard <[email protected]>
  *
  */
+
 #include "radeon_llvm_emit.h"
 #include "radeon_elf_util.h"
 #include "c11/threads.h"
 #include "gallivm/lp_bld_misc.h"
+#include "util/u_debug.h"
 #include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_state.h"
 
 #include <llvm-c/Target.h>
 #include <llvm-c/TargetMachine.h>
@@ -123,16 +126,44 @@ LLVMTargetRef radeon_llvm_get_r600_target(const char *triple)
 	return target;
 }
 
+struct radeon_llvm_diagnostics {
+	struct pipe_debug_callback *debug;
+	unsigned retval;
+};
+
 static void radeonDiagnosticHandler(LLVMDiagnosticInfoRef di, void *context)
 {
-	if (LLVMGetDiagInfoSeverity(di) == LLVMDSError) {
-		unsigned int *diagnosticflag = (unsigned int *)context;
-		char *diaginfo_message = LLVMGetDiagInfoDescription(di);
+	struct radeon_llvm_diagnostics *diag = (struct radeon_llvm_diagnostics *)context;
+	LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
+	char *description = LLVMGetDiagInfoDescription(di);
+	const char *severity_str = NULL;
+
+	switch (severity) {
+	case LLVMDSError:
+		severity_str = "error";
+		break;
+	case LLVMDSWarning:
+		severity_str = "warning";
+		break;
+	case LLVMDSRemark:
+		severity_str = "remark";
+		break;
+	case LLVMDSNote:
+		severity_str = "note";
+		break;
+	default:
+		severity_str = "unknown";
+	}
+
+	pipe_debug_message(diag->debug, SHADER_INFO,
+			   "LLVM diagnostic (%s): %s", severity_str, description);
 
-		*diagnosticflag = 1;
-		fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", diaginfo_message);
-		LLVMDisposeMessage(diaginfo_message);
+	if (severity == LLVMDSError) {
+		diag->retval = 1;
+		fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", description);
 	}
+
+	LLVMDisposeMessage(description);
 }
 
 /**
@@ -141,22 +172,25 @@ static void radeonDiagnosticHandler(LLVMDiagnosticInfoRef di, void *context)
  * @returns 0 for success, 1 for failure
  */
 unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary,
-			     const char *gpu_family, bool dump_ir, bool dump_asm,
-			     LLVMTargetMachineRef tm)
+			     const char *gpu_family,
+			     LLVMTargetMachineRef tm,
+			     struct pipe_debug_callback *debug)
 {
-
+	struct radeon_llvm_diagnostics diag;
 	char cpu[CPU_STRING_LEN];
 	char fs[FS_STRING_LEN];
 	char *err;
 	bool dispose_tm = false;
 	LLVMContextRef llvm_ctx;
-	unsigned rval = 0;
 	LLVMMemoryBufferRef out_buffer;
 	unsigned buffer_size;
 	const char *buffer_data;
 	char triple[TRIPLE_STRING_LEN];
 	LLVMBool mem_err;
 
+	diag.debug = debug;
+	diag.retval = 0;
+
 	if (!tm) {
 		strncpy(triple, "r600--", TRIPLE_STRING_LEN);
 		LLVMTargetRef target = radeon_llvm_get_r600_target(triple);
@@ -165,20 +199,17 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 		}
 		strncpy(cpu, gpu_family, CPU_STRING_LEN);
 		memset(fs, 0, sizeof(fs));
-		if (dump_asm)
-			strncpy(fs, "+DumpCode", FS_STRING_LEN);
+		strncpy(fs, "+DumpCode", FS_STRING_LEN);
 		tm = LLVMCreateTargetMachine(target, triple, cpu, fs,
 				  LLVMCodeGenLevelDefault, LLVMRelocDefault,
 						  LLVMCodeModelDefault);
 		dispose_tm = true;
 	}
-	if (dump_ir)
-		LLVMDumpModule(M);
+
 	/* Setup Diagnostic Handler*/
 	llvm_ctx = LLVMGetModuleContext(M);
 
-	LLVMContextSetDiagnosticHandler(llvm_ctx, radeonDiagnosticHandler, &rval);
-	rval = 0;
+	LLVMContextSetDiagnosticHandler(llvm_ctx, radeonDiagnosticHandler, &diag);
 
 	/* Compile IR*/
 	mem_err = LLVMTargetMachineEmitToMemoryBuffer(tm, M, LLVMObjectFile, &err,
@@ -187,15 +218,13 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 	/* Process Errors/Warnings */
 	if (mem_err) {
 		fprintf(stderr, "%s: %s", __FUNCTION__, err);
+		pipe_debug_message(debug, SHADER_INFO,
+				   "LLVM emit error: %s", err);
 		FREE(err);
-		rval = 1;
+		diag.retval = 1;
 		goto out;
 	}
 
-	if (0 != rval) {
-		fprintf(stderr, "%s: Processing Diag Flag\n", __FUNCTION__);
-	}
-
 	/* Extract Shader Code*/
 	buffer_size = LLVMGetBufferSize(out_buffer);
 	buffer_data = LLVMGetBufferStart(out_buffer);
@@ -209,5 +238,7 @@ out:
 	if (dispose_tm) {
 		LLVMDisposeTargetMachine(tm);
 	}
-	return rval;
+	if (diag.retval != 0)
+		pipe_debug_message(debug, SHADER_INFO, "LLVM compile failed");
+	return diag.retval;
 }
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.h b/src/gallium/drivers/radeon/radeon_llvm_emit.h
index e20aed94c6b..45f05a9e0e1 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.h
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.h
@@ -31,6 +31,7 @@
 #include <llvm-c/TargetMachine.h>
 #include <stdbool.h>
 
+struct pipe_debug_callback;
 struct radeon_shader_binary;
 
 void radeon_llvm_shader_type(LLVMValueRef F, unsigned type);
@@ -38,7 +39,8 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned type);
 LLVMTargetRef radeon_llvm_get_r600_target(const char *triple);
 
 unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary,
-			     const char *gpu_family, bool dump_ir, bool dump_asm,
-			     LLVMTargetMachineRef tm);
+			     const char *gpu_family,
+			     LLVMTargetMachineRef tm,
+			     struct pipe_debug_callback *debug);
 
 #endif /* RADEON_LLVM_EMIT_H */
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index 2de237b4716..105a1b2a878 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -196,7 +196,7 @@ static void cik_sdma_copy_tile(struct si_context *ctx,
 			(tile_split << 11) | (mt << 8) | (array_mode << 3) |
 			lbpe;
 		cs->buf[cs->cdw++] = y << 16; /* | x */
-		cs->buf[cs->cdw++] = 0; /* z */;
+		cs->buf[cs->cdw++] = 0; /* z */
 		cs->buf[cs->cdw++] = addr & 0xfffffffc;
 		cs->buf[cs->cdw++] = addr >> 32;
 		cs->buf[cs->cdw++] = (pitch / bpe) - 1;
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 47a74eea0e0..5a08cbfb198 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -67,9 +67,9 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog
 				program->shader.binary.global_symbol_offsets[i];
 		unsigned scratch_bytes_needed;
 
-		si_shader_binary_read_config(sctx->screen,
-						&program->shader, offset);
-		scratch_bytes_needed = program->shader.scratch_bytes_per_wave;
+		si_shader_binary_read_config(&program->shader.binary,
+					     &program->shader.config, offset);
+		scratch_bytes_needed = program->shader.config.scratch_bytes_per_wave;
 		scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed);
 	}
 
@@ -87,7 +87,7 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog
 	 * to the maximum bytes needed, so it can compute the stride
 	 * correctly.
 	 */
-	program->shader.scratch_bytes_per_wave = scratch_bytes;
+	program->shader.config.scratch_bytes_per_wave = scratch_bytes;
 
 	/* Patch the shader with the scratch buffer address. */
 	si_shader_apply_scratch_relocs(sctx,
@@ -122,8 +122,12 @@ static void *si_create_compute_state(
 	        for (i = 0; i < program->num_kernels; i++) {
 		        LLVMModuleRef mod = radeon_llvm_get_kernel_module(program->llvm_ctx, i,
                                                         code, header->num_bytes);
-			si_compile_llvm(sctx->screen, &program->kernels[i], sctx->tm,
-					mod);
+			si_compile_llvm(sctx->screen, &program->kernels[i].binary,
+					&program->kernels[i].config, sctx->tm,
+					mod, &sctx->b.debug, TGSI_PROCESSOR_COMPUTE);
+			si_shader_dump(sctx->screen, &program->kernels[i],
+				       &sctx->b.debug, TGSI_PROCESSOR_COMPUTE);
+			si_shader_binary_upload(sctx->screen, &program->kernels[i]);
 			LLVMDisposeModule(mod);
 		}
 	}
@@ -136,7 +140,11 @@ static void *si_create_compute_state(
 	 * the shader code to the GPU.
 	 */
 	init_scratch_buffer(sctx, program);
-	si_shader_binary_read(sctx->screen, &program->shader);
+	si_shader_binary_read_config(&program->shader.binary,
+				     &program->shader.config, 0);
+	si_shader_dump(sctx->screen, &program->shader, &sctx->b.debug,
+		       TGSI_PROCESSOR_COMPUTE);
+	si_shader_binary_upload(sctx->screen, &program->shader);
 
 #endif
 	program->input_buffer =	si_resource_create_custom(sctx->b.b.screen,
@@ -259,7 +267,7 @@ static void si_launch_grid(
 
 #if HAVE_LLVM >= 0x0306
 	/* Read the config information */
-	si_shader_binary_read_config(sctx->screen, shader, pc);
+	si_shader_binary_read_config(&shader->binary, &shader->config, pc);
 #endif
 
 	/* Upload the kernel arguments */
@@ -280,12 +288,12 @@ static void si_launch_grid(
 
 	memcpy(kernel_args + (num_work_size_bytes / 4), input, program->input_size);
 
-	if (shader->scratch_bytes_per_wave > 0) {
+	if (shader->config.scratch_bytes_per_wave > 0) {
 
 		COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; "
 		            "Total Scratch: %u bytes\n", num_waves_for_scratch,
-			    shader->scratch_bytes_per_wave,
-			    shader->scratch_bytes_per_wave *
+			    shader->config.scratch_bytes_per_wave,
+			    shader->config.scratch_bytes_per_wave *
 			    num_waves_for_scratch);
 
 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
@@ -312,7 +320,7 @@ static void si_launch_grid(
 	si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 8, scratch_buffer_va);
 	si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 12,
 		S_008F04_BASE_ADDRESS_HI(scratch_buffer_va >> 32)
-		|  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64));
+		|  S_008F04_STRIDE(shader->config.scratch_bytes_per_wave / 64));
 
 	si_pm4_set_reg(pm4, R_00B810_COMPUTE_START_X, 0);
 	si_pm4_set_reg(pm4, R_00B814_COMPUTE_START_Y, 0);
@@ -360,9 +368,9 @@ static void si_launch_grid(
 	si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
 	si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);
 
-	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->rsrc1);
+	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->config.rsrc1);
 
-	lds_blocks = shader->lds_size;
+	lds_blocks = shader->config.lds_size;
 	/* XXX: We are over allocating LDS.  For SI, the shader reports LDS in
 	 * blocks of 256 bytes, so if there are 4 bytes lds allocated in
 	 * the shader and 4 bytes allocated by the state tracker, then
@@ -376,10 +384,10 @@ static void si_launch_grid(
 
 	assert(lds_blocks <= 0xFF);
 
-	shader->rsrc2 &= C_00B84C_LDS_SIZE;
-	shader->rsrc2 |=  S_00B84C_LDS_SIZE(lds_blocks);
+	shader->config.rsrc2 &= C_00B84C_LDS_SIZE;
+	shader->config.rsrc2 |=  S_00B84C_LDS_SIZE(lds_blocks);
 
-	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->rsrc2);
+	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->config.rsrc2);
 	si_pm4_set_reg(pm4, R_00B854_COMPUTE_RESOURCE_LIMITS, 0);
 
 	si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0,
@@ -401,7 +409,7 @@ static void si_launch_grid(
 		 * COMPUTE_PGM_RSRC2.SCRATCH_EN is enabled.
 		 */
 		S_00B860_WAVES(num_waves_for_scratch)
-		| S_00B860_WAVESIZE(shader->scratch_bytes_per_wave >> 10))
+		| S_00B860_WAVESIZE(shader->config.scratch_bytes_per_wave >> 10))
 		;
 
 	si_pm4_cmd_begin(pm4, PKT3_DISPATCH_DIRECT);
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index b3719dea252..d157a9ffb00 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -109,7 +109,7 @@ static bool si_upload_descriptors(struct si_context *sctx,
 	if (!desc->list_dirty)
 		return true;
 
-	u_upload_alloc(sctx->b.uploader, 0, list_size,
+	u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
 		       &desc->buffer_offset,
 		       (struct pipe_resource**)&desc->buffer, &ptr);
 	if (!desc->buffer)
@@ -391,7 +391,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 	 * directly through a staging buffer and don't go through
 	 * the fine-grained upload path.
 	 */
-	u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
+	u_upload_alloc(sctx->b.uploader, 0, count * 16, 256, &desc->buffer_offset,
 		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
 	if (!desc->buffer)
 		return false;
@@ -465,7 +465,7 @@ void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuf
 {
 	void *tmp;
 
-	u_upload_alloc(sctx->b.uploader, 0, size, const_offset,
+	u_upload_alloc(sctx->b.uploader, 0, size, 256, const_offset,
 		       (struct pipe_resource**)rbuffer, &tmp);
 	if (rbuffer)
 		util_memcpy_cpu_to_le32(tmp, ptr, size);
@@ -1011,19 +1011,19 @@ void si_init_all_descriptors(struct si_context *sctx)
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
 		si_init_buffer_resources(&sctx->const_buffers[i],
-					 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST,
+					 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST_BUFFERS,
 					 RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
 		si_init_buffer_resources(&sctx->rw_buffers[i],
 					 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
 					 RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT);
 
 		si_init_descriptors(&sctx->samplers[i].views.desc,
-				    SI_SGPR_RESOURCE, 8, SI_NUM_SAMPLER_VIEWS);
+				    SI_SGPR_SAMPLER_VIEWS, 8, SI_NUM_SAMPLER_VIEWS);
 		si_init_descriptors(&sctx->samplers[i].states.desc,
-				    SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES);
+				    SI_SGPR_SAMPLER_STATES, 4, SI_NUM_SAMPLER_STATES);
 	}
 
-	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
+	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
 			    4, SI_NUM_VERTEX_BUFFERS);
 
 	/* Set pipe_context functions. */
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index ac13407e2a1..c2ca94339ac 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -340,6 +340,13 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
 	case PIPE_CAP_VERTEXID_NOBASE:
 	case PIPE_CAP_CLEAR_TEXTURE:
+	case PIPE_CAP_DRAW_PARAMETERS:
+	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+	case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
@@ -512,6 +519,8 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 		return 1;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
+	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+		return 0;
 	}
 	return 0;
 }
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 0e98784d51b..1db3e484915 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -82,11 +82,11 @@ struct si_shader_context
 	int param_es2gs_offset;
 	LLVMTargetMachineRef tm;
 	LLVMValueRef const_md;
-	LLVMValueRef const_resource[SI_NUM_CONST_BUFFERS];
+	LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
 	LLVMValueRef lds;
 	LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
-	LLVMValueRef resources[SI_NUM_SAMPLER_VIEWS];
-	LLVMValueRef samplers[SI_NUM_SAMPLER_STATES];
+	LLVMValueRef sampler_views[SI_NUM_SAMPLER_VIEWS];
+	LLVMValueRef sampler_states[SI_NUM_SAMPLER_STATES];
 	LLVMValueRef so_buffers[4];
 	LLVMValueRef esgs_ring;
 	LLVMValueRef gsvs_ring[4];
@@ -394,7 +394,7 @@ static void declare_input_vs(
 	LLVMValueRef input;
 
 	/* Load the T list */
-	t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFER);
+	t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);
 
 	t_offset = lp_build_const_int32(gallivm, input_index);
 
@@ -1065,7 +1065,7 @@ static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld,
 	struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
 	struct gallivm_state *gallivm = &radeon_bld->gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
-	LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+	LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 	LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF);
 	LLVMValueRef resource = build_indexed_load_const(si_shader_ctx, desc, buf_index);
 
@@ -1233,13 +1233,13 @@ static LLVMValueRef fetch_constant(
 	}
 
 	if (reg->Register.Dimension && reg->Dimension.Indirect) {
-		LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+		LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 		LLVMValueRef index;
 		index = get_indirect_index(si_shader_ctx, &reg->DimIndirect,
 						   reg->Dimension.Index);
 		bufp = build_indexed_load_const(si_shader_ctx, ptr, index);
 	} else
-		bufp = si_shader_ctx->const_resource[buf];
+		bufp = si_shader_ctx->const_buffers[buf];
 
 	addr = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
 	addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
@@ -1260,7 +1260,7 @@ static LLVMValueRef fetch_constant(
 		addr2 = lp_build_add(&bld_base->uint_bld, addr2,
 				     lp_build_const_int32(base->gallivm, idx * 4));
 
-		result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_resource[buf],
+		result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_buffers[buf],
 				   addr2, bld_base->base.elem_type);
 
 		result = radeon_llvm_emit_fetch_double(bld_base,
@@ -1302,18 +1302,8 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 	if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT) {
 		int cbuf = target - V_008DFC_SQ_EXP_MRT;
 
-		if (cbuf >= 0 && cbuf < 8) {
+		if (cbuf >= 0 && cbuf < 8)
 			compressed = (si_shader_ctx->shader->key.ps.export_16bpc >> cbuf) & 0x1;
-
-			if (compressed)
-				si_shader_ctx->shader->spi_shader_col_format |=
-					V_028714_SPI_SHADER_FP16_ABGR << (4 * cbuf);
-			else
-				si_shader_ctx->shader->spi_shader_col_format |=
-					V_028714_SPI_SHADER_32_ABGR << (4 * cbuf);
-
-			si_shader_ctx->shader->cb_shader_mask |= 0xf << (4 * cbuf);
-		}
 	}
 
 	/* Set COMPR flag */
@@ -1333,34 +1323,19 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 						    LLVMInt32TypeInContext(base->gallivm->context),
 						    pack_args, 2,
 						    LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
-			args[chan + 7] = args[chan + 5] =
+			args[chan + 5] =
 				LLVMBuildBitCast(base->gallivm->builder,
 						 packed,
 						 LLVMFloatTypeInContext(base->gallivm->context),
 						 "");
+			args[chan + 7] = base->undef;
 		}
 	} else
 		memcpy(&args[5], values, sizeof(values[0]) * 4);
 }
 
-/* Load from output pointers and initialize arguments for the shader export intrinsic */
-static void si_llvm_init_export_args_load(struct lp_build_tgsi_context *bld_base,
-					  LLVMValueRef *out_ptr,
-					  unsigned target,
-					  LLVMValueRef *args)
-{
-	struct gallivm_state *gallivm = bld_base->base.gallivm;
-	LLVMValueRef values[4];
-	int i;
-
-	for (i = 0; i < 4; i++)
-		values[i] = LLVMBuildLoad(gallivm->builder, out_ptr[i], "");
-
-	si_llvm_init_export_args(bld_base, values, target, args);
-}
-
 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
-			  LLVMValueRef alpha_ptr)
+			  LLVMValueRef alpha)
 {
 	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
@@ -1372,8 +1347,7 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
 		LLVMValueRef alpha_pass =
 			lp_build_cmp(&bld_base->base,
 				     si_shader_ctx->shader->key.ps.alpha_func,
-				     LLVMBuildLoad(gallivm->builder, alpha_ptr, ""),
-				     alpha_ref);
+				     alpha, alpha_ref);
 		LLVMValueRef arg =
 			lp_build_select(&bld_base->base,
 					alpha_pass,
@@ -1390,16 +1364,14 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
 				LLVMVoidTypeInContext(gallivm->context),
 				NULL, 0, 0);
 	}
-
-	si_shader_ctx->shader->db_shader_control |= S_02880C_KILL_ENABLE(1);
 }
 
-static void si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
-					  LLVMValueRef alpha_ptr)
+static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
+						  LLVMValueRef alpha)
 {
 	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
-	LLVMValueRef coverage, alpha;
+	LLVMValueRef coverage;
 
 	/* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
 	coverage = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
@@ -1417,9 +1389,7 @@ static void si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base
 				 lp_build_const_float(gallivm,
 					1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
 
-	alpha = LLVMBuildLoad(gallivm->builder, alpha_ptr, "");
-	alpha = LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
-	LLVMBuildStore(gallivm->builder, alpha, alpha_ptr);
+	return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
 }
 
 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base,
@@ -1432,7 +1402,7 @@ static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base,
 	unsigned chan;
 	unsigned const_chan;
 	LLVMValueRef base_elt;
-	LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+	LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 	LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm, SI_DRIVER_STATE_CONST_BUF);
 	LLVMValueRef const_resource = build_indexed_load_const(si_shader_ctx, ptr, constbuf_index);
 
@@ -2112,197 +2082,193 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
 	FREE(outputs);
 }
 
-static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
+static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
+			   LLVMValueRef depth, LLVMValueRef stencil,
+			   LLVMValueRef samplemask)
 {
-	struct si_shader_context * si_shader_ctx = si_shader_context(bld_base);
-	struct si_shader * shader = si_shader_ctx->shader;
-	struct lp_build_context * base = &bld_base->base;
-	struct lp_build_context * uint = &bld_base->uint_bld;
-	struct tgsi_shader_info *info = &shader->selector->info;
-	LLVMBuilderRef builder = base->gallivm->builder;
+	struct si_screen *sscreen = si_shader_context(bld_base)->screen;
+	struct lp_build_context *base = &bld_base->base;
+	struct lp_build_context *uint = &bld_base->uint_bld;
 	LLVMValueRef args[9];
-	LLVMValueRef last_args[9] = { 0 };
-	int depth_index = -1, stencil_index = -1, samplemask_index = -1;
-	int i;
+	unsigned mask = 0;
 
-	for (i = 0; i < info->num_outputs; i++) {
-		unsigned semantic_name = info->output_semantic_name[i];
-		unsigned semantic_index = info->output_semantic_index[i];
-		unsigned target;
-		LLVMValueRef alpha_ptr;
+	assert(depth || stencil || samplemask);
 
-		/* Select the correct target */
-		switch (semantic_name) {
-		case TGSI_SEMANTIC_POSITION:
-			depth_index = i;
-			continue;
-		case TGSI_SEMANTIC_STENCIL:
-			stencil_index = i;
-			continue;
-		case TGSI_SEMANTIC_SAMPLEMASK:
-			samplemask_index = i;
-			continue;
-		case TGSI_SEMANTIC_COLOR:
-			target = V_008DFC_SQ_EXP_MRT + semantic_index;
-			alpha_ptr = si_shader_ctx->radeon_bld.soa.outputs[i][3];
+	args[1] = uint->one; /* whether the EXEC mask is valid */
+	args[2] = uint->one; /* DONE bit */
 
-			if (si_shader_ctx->shader->key.ps.clamp_color) {
-				for (int j = 0; j < 4; j++) {
-					LLVMValueRef ptr = si_shader_ctx->radeon_bld.soa.outputs[i][j];
-					LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
+	/* Specify the target we are exporting */
+	args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
 
-					result = radeon_llvm_saturate(bld_base, result);
-					LLVMBuildStore(builder, result, ptr);
-				}
-			}
+	args[4] = uint->zero; /* COMP flag */
+	args[5] = base->undef; /* R, depth */
+	args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
+	args[7] = base->undef; /* B, sample mask */
+	args[8] = base->undef; /* A, alpha to mask */
 
-			if (si_shader_ctx->shader->key.ps.alpha_to_one)
-				LLVMBuildStore(base->gallivm->builder,
-					       base->one, alpha_ptr);
+	if (depth) {
+		args[5] = depth;
+		mask |= 0x1;
+	}
 
-			if (semantic_index == 0 &&
-			    si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)
-				si_alpha_test(bld_base, alpha_ptr);
+	if (stencil) {
+		args[6] = stencil;
+		mask |= 0x2;
+	}
 
-			if (si_shader_ctx->shader->key.ps.poly_line_smoothing)
-				si_scale_alpha_by_sample_mask(bld_base, alpha_ptr);
+	if (samplemask) {
+		args[7] = samplemask;
+		mask |= 0x4;
+	}
 
-			break;
-		default:
-			target = 0;
-			fprintf(stderr,
-				"Warning: SI unhandled fs output type:%d\n",
-				semantic_name);
-		}
+	/* SI (except OLAND) has a bug that it only looks
+	 * at the X writemask component. */
+	if (sscreen->b.chip_class == SI &&
+	    sscreen->b.family != CHIP_OLAND)
+		mask |= 0x1;
 
-		si_llvm_init_export_args_load(bld_base,
-					      si_shader_ctx->radeon_bld.soa.outputs[i],
-					      target, args);
-
-		if (semantic_name == TGSI_SEMANTIC_COLOR) {
-			/* If there is an export instruction waiting to be emitted, do so now. */
-			if (last_args[0]) {
-				lp_build_intrinsic(base->gallivm->builder,
-						   "llvm.SI.export",
-						   LLVMVoidTypeInContext(base->gallivm->context),
-						   last_args, 9, 0);
-			}
+	/* Specify which components to enable */
+	args[0] = lp_build_const_int32(base->gallivm, mask);
 
-			/* This instruction will be emitted at the end of the shader. */
-			memcpy(last_args, args, sizeof(args));
-
-			/* Handle FS_COLOR0_WRITES_ALL_CBUFS. */
-			if (shader->selector->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
-			    semantic_index == 0 &&
-			    si_shader_ctx->shader->key.ps.last_cbuf > 0) {
-				for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) {
-					si_llvm_init_export_args_load(bld_base,
-								      si_shader_ctx->radeon_bld.soa.outputs[i],
-								      V_008DFC_SQ_EXP_MRT + c, args);
-					lp_build_intrinsic(base->gallivm->builder,
-							   "llvm.SI.export",
-							   LLVMVoidTypeInContext(base->gallivm->context),
-							   args, 9, 0);
-				}
-			}
-		} else {
-			lp_build_intrinsic(base->gallivm->builder,
-					   "llvm.SI.export",
+	lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+			   LLVMVoidTypeInContext(base->gallivm->context),
+			   args, 9, 0);
+}
+
+static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
+				LLVMValueRef *color, unsigned index,
+				bool is_last)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct lp_build_context *base = &bld_base->base;
+	LLVMValueRef args[9];
+	int i;
+
+	/* Clamp color */
+	if (si_shader_ctx->shader->key.ps.clamp_color)
+		for (i = 0; i < 4; i++)
+			color[i] = radeon_llvm_saturate(bld_base, color[i]);
+
+	/* Alpha to one */
+	if (si_shader_ctx->shader->key.ps.alpha_to_one)
+		color[3] = base->one;
+
+	/* Alpha test */
+	if (index == 0 &&
+	    si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)
+		si_alpha_test(bld_base, color[3]);
+
+	/* Line & polygon smoothing */
+	if (si_shader_ctx->shader->key.ps.poly_line_smoothing)
+		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3]);
+
+	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
+	if (index == 0 &&
+	    si_shader_ctx->shader->key.ps.last_cbuf > 0) {
+		for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) {
+			si_llvm_init_export_args(bld_base, color,
+						 V_008DFC_SQ_EXP_MRT + c, args);
+			lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
 					   LLVMVoidTypeInContext(base->gallivm->context),
 					   args, 9, 0);
 		}
 	}
 
-	if (depth_index >= 0 || stencil_index >= 0 || samplemask_index >= 0) {
-		LLVMValueRef out_ptr;
-		unsigned mask = 0;
-
-		/* Specify the target we are exporting */
-		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
-
-		args[5] = base->zero; /* R, depth */
-		args[6] = base->zero; /* G, stencil test value[0:7], stencil op value[8:15] */
-		args[7] = base->zero; /* B, sample mask */
-		args[8] = base->zero; /* A, alpha to mask */
-
-		if (depth_index >= 0) {
-			out_ptr = si_shader_ctx->radeon_bld.soa.outputs[depth_index][2];
-			args[5] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
-			mask |= 0x1;
-			si_shader_ctx->shader->db_shader_control |= S_02880C_Z_EXPORT_ENABLE(1);
-		}
-
-		if (stencil_index >= 0) {
-			out_ptr = si_shader_ctx->radeon_bld.soa.outputs[stencil_index][1];
-			args[6] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
-			mask |= 0x2;
-			si_shader_ctx->shader->db_shader_control |=
-				S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(1);
-		}
-
-		if (samplemask_index >= 0) {
-			out_ptr = si_shader_ctx->radeon_bld.soa.outputs[samplemask_index][0];
-			args[7] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
-			mask |= 0x4;
-			si_shader_ctx->shader->db_shader_control |= S_02880C_MASK_EXPORT_ENABLE(1);
-		}
-
-		/* SI (except OLAND) has a bug that it only looks
-		 * at the X writemask component. */
-		if (si_shader_ctx->screen->b.chip_class == SI &&
-		    si_shader_ctx->screen->b.family != CHIP_OLAND)
-			mask |= 0x1;
+	/* Export */
+	si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
+				 args);
+	if (is_last) {
+		args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
+		args[2] = bld_base->uint_bld.one; /* DONE bit */
+	}
+	lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+			   LLVMVoidTypeInContext(base->gallivm->context),
+			   args, 9, 0);
+}
 
-		if (samplemask_index >= 0)
-			si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_ABGR;
-		else if (stencil_index >= 0)
-			si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_GR;
-		else
-			si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_R;
+static void si_export_null(struct lp_build_tgsi_context *bld_base)
+{
+	struct lp_build_context *base = &bld_base->base;
+	struct lp_build_context *uint = &bld_base->uint_bld;
+	LLVMValueRef args[9];
 
-		/* Specify which components to enable */
-		args[0] = lp_build_const_int32(base->gallivm, mask);
+	args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
+	args[1] = uint->one; /* whether the EXEC mask is valid */
+	args[2] = uint->one; /* DONE bit */
+	args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
+	args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
+	args[5] = uint->undef; /* R */
+	args[6] = uint->undef; /* G */
+	args[7] = uint->undef; /* B */
+	args[8] = uint->undef; /* A */
+
+	lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+			   LLVMVoidTypeInContext(base->gallivm->context),
+			   args, 9, 0);
+}
 
-		args[1] =
-		args[2] =
-		args[4] = uint->zero;
+static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
+{
+	struct si_shader_context * si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader * shader = si_shader_ctx->shader;
+	struct lp_build_context * base = &bld_base->base;
+	struct tgsi_shader_info *info = &shader->selector->info;
+	LLVMBuilderRef builder = base->gallivm->builder;
+	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+	int last_color_export = -1;
+	int i;
 
-		if (last_args[0])
-			lp_build_intrinsic(base->gallivm->builder,
-					   "llvm.SI.export",
-					   LLVMVoidTypeInContext(base->gallivm->context),
-					   args, 9, 0);
-		else
-			memcpy(last_args, args, sizeof(args));
+	/* If there are no outputs, add a dummy export. */
+	if (!info->num_outputs) {
+		si_export_null(bld_base);
+		return;
 	}
 
-	if (!last_args[0]) {
-		/* Specify which components to enable */
-		last_args[0] = lp_build_const_int32(base->gallivm, 0x0);
+	/* Determine the last export. If MRTZ is present, it's always last.
+	 * Otherwise, find the last color export.
+	 */
+	if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask)
+		for (i = 0; i < info->num_outputs; i++)
+			if (info->output_semantic_name[i] == TGSI_SEMANTIC_COLOR)
+				last_color_export = i;
 
-		/* Specify the target we are exporting */
-		last_args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRT);
+	for (i = 0; i < info->num_outputs; i++) {
+		unsigned semantic_name = info->output_semantic_name[i];
+		unsigned semantic_index = info->output_semantic_index[i];
+		unsigned j;
+		LLVMValueRef color[4] = {};
 
-		/* Set COMPR flag to zero to export data as 32-bit */
-		last_args[4] = uint->zero;
+		/* Select the correct target */
+		switch (semantic_name) {
+		case TGSI_SEMANTIC_POSITION:
+			depth = LLVMBuildLoad(builder,
+					      si_shader_ctx->radeon_bld.soa.outputs[i][2], "");
+			break;
+		case TGSI_SEMANTIC_STENCIL:
+			stencil = LLVMBuildLoad(builder,
+						si_shader_ctx->radeon_bld.soa.outputs[i][1], "");
+			break;
+		case TGSI_SEMANTIC_SAMPLEMASK:
+			samplemask = LLVMBuildLoad(builder,
+						   si_shader_ctx->radeon_bld.soa.outputs[i][0], "");
+			break;
+		case TGSI_SEMANTIC_COLOR:
+			for (j = 0; j < 4; j++)
+				color[j] = LLVMBuildLoad(builder,
+							 si_shader_ctx->radeon_bld.soa.outputs[i][j], "");
 
-		/* dummy bits */
-		last_args[5]= uint->zero;
-		last_args[6]= uint->zero;
-		last_args[7]= uint->zero;
-		last_args[8]= uint->zero;
+			si_export_mrt_color(bld_base, color, semantic_index,
+					    last_color_export == i);
+			break;
+		default:
+			fprintf(stderr,
+				"Warning: SI unhandled fs output type:%d\n",
+				semantic_name);
+		}
 	}
 
-	/* Specify whether the EXEC mask represents the valid mask */
-	last_args[1] = uint->one;
-
-	/* Specify that this is the last export */
-	last_args[2] = lp_build_const_int32(base->gallivm, 1);
-
-	lp_build_intrinsic(base->gallivm->builder,
-			   "llvm.SI.export",
-			   LLVMVoidTypeInContext(base->gallivm->context),
-			   last_args, 9, 0);
+	if (depth || stencil || samplemask)
+		si_export_mrt_z(bld_base, depth, stencil, samplemask);
 }
 
 static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
@@ -2390,10 +2356,10 @@ static void tex_fetch_ptrs(
 
 		ind_index = get_indirect_index(si_shader_ctx, &reg->Indirect, reg->Register.Index);
 
-		*res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
+		*res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS);
 		*res_ptr = build_indexed_load_const(si_shader_ctx, *res_ptr, ind_index);
 
-		*samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER);
+		*samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_STATES);
 		*samp_ptr = build_indexed_load_const(si_shader_ctx, *samp_ptr, ind_index);
 
 		if (target == TGSI_TEXTURE_2D_MSAA ||
@@ -2401,13 +2367,13 @@ static void tex_fetch_ptrs(
 			ind_index = LLVMBuildAdd(gallivm->builder, ind_index,
 						 lp_build_const_int32(gallivm,
 								      SI_FMASK_TEX_OFFSET), "");
-			*fmask_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
+			*fmask_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS);
 			*fmask_ptr = build_indexed_load_const(si_shader_ctx, *fmask_ptr, ind_index);
 		}
 	} else {
-		*res_ptr = si_shader_ctx->resources[sampler_index];
-		*samp_ptr = si_shader_ctx->samplers[sampler_index];
-		*fmask_ptr = si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index];
+		*res_ptr = si_shader_ctx->sampler_views[sampler_index];
+		*samp_ptr = si_shader_ctx->sampler_states[sampler_index];
+		*fmask_ptr = si_shader_ctx->sampler_views[SI_FMASK_TEX_OFFSET + sampler_index];
 	}
 }
 
@@ -2487,7 +2453,7 @@ static void tex_fetch_args(
 		emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
 		emit_data->args[0] = res;
 		emit_data->args[1] = bld_base->uint_bld.zero;
-		emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
+		emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
 		emit_data->arg_count = 3;
 		return;
 	}
@@ -2536,12 +2502,12 @@ static void tex_fetch_args(
 	if (opcode == TGSI_OPCODE_TXB)
 		address[count++] = coords[3];
 	if (opcode == TGSI_OPCODE_TXB2)
-		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
 
 	/* Pack depth comparison value */
 	if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
 		if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
-			address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+			address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
 		} else {
 			assert(ref_pos >= 0);
 			address[count++] = coords[ref_pos];
@@ -2612,7 +2578,7 @@ static void tex_fetch_args(
 	if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
 		address[count++] = coords[3];
 	else if (opcode == TGSI_OPCODE_TXL2)
-		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
 
 	if (count > 16) {
 		assert(!"Cannot handle more than 16 texture address parameters");
@@ -3105,10 +3071,10 @@ static void interp_fetch_args(
 		/* offset is in second src, first two channels */
 		emit_data->args[0] = lp_build_emit_fetch(bld_base,
 							 emit_data->inst, 1,
-							 0);
+							 TGSI_CHAN_X);
 		emit_data->args[1] = lp_build_emit_fetch(bld_base,
 							 emit_data->inst, 1,
-							 1);
+							 TGSI_CHAN_Y);
 		emit_data->arg_count = 2;
 	} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
 		LLVMValueRef sample_position;
@@ -3119,7 +3085,7 @@ static void interp_fetch_args(
 		 * and place into first two channels.
 		 */
 		sample_id = lp_build_emit_fetch(bld_base,
-						emit_data->inst, 1, 0);
+						emit_data->inst, 1, TGSI_CHAN_X);
 		sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
 					     LLVMInt32TypeInContext(gallivm->context),
 					     "");
@@ -3432,15 +3398,15 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 	v16i8 = LLVMVectorType(i8, 16);
 
 	params[SI_PARAM_RW_BUFFERS] = const_array(v16i8, SI_NUM_RW_BUFFERS);
-	params[SI_PARAM_CONST] = const_array(v16i8, SI_NUM_CONST_BUFFERS);
-	params[SI_PARAM_SAMPLER] = const_array(v4i32, SI_NUM_SAMPLER_STATES);
-	params[SI_PARAM_RESOURCE] = const_array(v8i32, SI_NUM_SAMPLER_VIEWS);
-	last_array_pointer = SI_PARAM_RESOURCE;
+	params[SI_PARAM_CONST_BUFFERS] = const_array(v16i8, SI_NUM_CONST_BUFFERS);
+	params[SI_PARAM_SAMPLER_STATES] = const_array(v4i32, SI_NUM_SAMPLER_STATES);
+	params[SI_PARAM_SAMPLER_VIEWS] = const_array(v8i32, SI_NUM_SAMPLER_VIEWS);
+	last_array_pointer = SI_PARAM_SAMPLER_VIEWS;
 
 	switch (si_shader_ctx->type) {
 	case TGSI_PROCESSOR_VERTEX:
-		params[SI_PARAM_VERTEX_BUFFER] = const_array(v16i8, SI_NUM_VERTEX_BUFFERS);
-		last_array_pointer = SI_PARAM_VERTEX_BUFFER;
+		params[SI_PARAM_VERTEX_BUFFERS] = const_array(v16i8, SI_NUM_VERTEX_BUFFERS);
+		last_array_pointer = SI_PARAM_VERTEX_BUFFERS;
 		params[SI_PARAM_BASE_VERTEX] = i32;
 		params[SI_PARAM_START_INSTANCE] = i32;
 		num_params = SI_PARAM_START_INSTANCE+1;
@@ -3452,8 +3418,8 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 			num_params = SI_PARAM_LS_OUT_LAYOUT+1;
 		} else {
 			if (shader->is_gs_copy_shader) {
-				last_array_pointer = SI_PARAM_CONST;
-				num_params = SI_PARAM_CONST+1;
+				last_array_pointer = SI_PARAM_CONST_BUFFERS;
+				num_params = SI_PARAM_CONST_BUFFERS+1;
 			} else {
 				params[SI_PARAM_VS_STATE_BITS] = i32;
 				num_params = SI_PARAM_VS_STATE_BITS+1;
@@ -3610,7 +3576,7 @@ static void preload_constants(struct si_shader_context *si_shader_ctx)
 	struct gallivm_state * gallivm = bld_base->base.gallivm;
 	const struct tgsi_shader_info * info = bld_base->info;
 	unsigned buf;
-	LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+	LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 
 	for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
 		unsigned i, num_const = info->const_file_max[buf] + 1;
@@ -3622,14 +3588,14 @@ static void preload_constants(struct si_shader_context *si_shader_ctx)
 		si_shader_ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));
 
 		/* Load the resource descriptor */
-		si_shader_ctx->const_resource[buf] =
+		si_shader_ctx->const_buffers[buf] =
 			build_indexed_load_const(si_shader_ctx, ptr, lp_build_const_int32(gallivm, buf));
 
 		/* Load the constants, we rely on the code sinking to do the rest */
 		for (i = 0; i < num_const * 4; ++i) {
 			si_shader_ctx->constants[buf][i] =
 				buffer_load_const(gallivm->builder,
-					si_shader_ctx->const_resource[buf],
+					si_shader_ctx->const_buffers[buf],
 					lp_build_const_int32(gallivm, i * 4),
 					bld_base->base.elem_type);
 		}
@@ -3650,23 +3616,23 @@ static void preload_samplers(struct si_shader_context *si_shader_ctx)
 	if (num_samplers == 0)
 		return;
 
-	res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
-	samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER);
+	res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS);
+	samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_STATES);
 
 	/* Load the resources and samplers, we rely on the code sinking to do the rest */
 	for (i = 0; i < num_samplers; ++i) {
 		/* Resource */
 		offset = lp_build_const_int32(gallivm, i);
-		si_shader_ctx->resources[i] = build_indexed_load_const(si_shader_ctx, res_ptr, offset);
+		si_shader_ctx->sampler_views[i] = build_indexed_load_const(si_shader_ctx, res_ptr, offset);
 
 		/* Sampler */
 		offset = lp_build_const_int32(gallivm, i);
-		si_shader_ctx->samplers[i] = build_indexed_load_const(si_shader_ctx, samp_ptr, offset);
+		si_shader_ctx->sampler_states[i] = build_indexed_load_const(si_shader_ctx, samp_ptr, offset);
 
 		/* FMASK resource */
 		if (info->is_msaa_sampler[i]) {
 			offset = lp_build_const_int32(gallivm, SI_FMASK_TEX_OFFSET + i);
-			si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + i] =
+			si_shader_ctx->sampler_views[SI_FMASK_TEX_OFFSET + i] =
 				build_indexed_load_const(si_shader_ctx, res_ptr, offset);
 		}
 	}
@@ -3741,20 +3707,19 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
 	}
 }
 
-void si_shader_binary_read_config(const struct si_screen *sscreen,
-				struct si_shader *shader,
-				unsigned symbol_offset)
+void si_shader_binary_read_config(struct radeon_shader_binary *binary,
+				  struct si_shader_config *conf,
+				  unsigned symbol_offset)
 {
 	unsigned i;
 	const unsigned char *config =
-		radeon_shader_binary_config_start(&shader->binary,
-						symbol_offset);
+		radeon_shader_binary_config_start(binary, symbol_offset);
 
 	/* XXX: We may be able to emit some of these values directly rather than
 	 * extracting fields to be emitted later.
 	 */
 
-	for (i = 0; i < shader->binary.config_size_per_symbol; i+= 8) {
+	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
 		unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
 		unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
 		switch (reg) {
@@ -3762,25 +3727,25 @@ void si_shader_binary_read_config(const struct si_screen *sscreen,
 		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
 		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
 		case R_00B848_COMPUTE_PGM_RSRC1:
-			shader->num_sgprs = MAX2(shader->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
-			shader->num_vgprs = MAX2(shader->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
-			shader->float_mode =  G_00B028_FLOAT_MODE(value);
-			shader->rsrc1 = value;
+			conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
+			conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
+			conf->float_mode =  G_00B028_FLOAT_MODE(value);
+			conf->rsrc1 = value;
 			break;
 		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
-			shader->lds_size = MAX2(shader->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
+			conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
 			break;
 		case R_00B84C_COMPUTE_PGM_RSRC2:
-			shader->lds_size = MAX2(shader->lds_size, G_00B84C_LDS_SIZE(value));
-			shader->rsrc2 = value;
+			conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
+			conf->rsrc2 = value;
 			break;
 		case R_0286CC_SPI_PS_INPUT_ENA:
-			shader->spi_ps_input_ena = value;
+			conf->spi_ps_input_ena = value;
 			break;
 		case R_0286E8_SPI_TMPRING_SIZE:
 		case R_00B860_COMPUTE_TMPRING_SIZE:
 			/* WAVESIZE is in units of 256 dwords. */
-			shader->scratch_bytes_per_wave =
+			conf->scratch_bytes_per_wave =
 				G_00B860_WAVESIZE(value) * 256 * 4 * 1;
 			break;
 		default:
@@ -3799,7 +3764,7 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx,
 	uint32_t scratch_rsrc_dword0 = scratch_va;
 	uint32_t scratch_rsrc_dword1 =
 		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
-		|  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
+		|  S_008F04_STRIDE(shader->config.scratch_bytes_per_wave / 64);
 
 	for (i = 0 ; i < shader->binary.reloc_count; i++) {
 		const struct radeon_shader_reloc *reloc =
@@ -3840,80 +3805,124 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
 	return 0;
 }
 
-int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader)
+static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
+				       struct pipe_debug_callback *debug)
 {
-	const struct radeon_shader_binary *binary = &shader->binary;
-	unsigned i;
-	int r;
-	bool dump  = r600_can_dump_shader(&sscreen->b,
-		shader->selector ? shader->selector->tokens : NULL);
+	char *line, *p;
+	unsigned i, count;
+
+	if (binary->disasm_string) {
+		fprintf(stderr, "\nShader Disassembly:\n\n");
+		fprintf(stderr, "%s\n", binary->disasm_string);
+
+		if (debug && debug->debug_message) {
+			/* Very long debug messages are cut off, so send the
+			 * disassembly one line at a time. This causes more
+			 * overhead, but on the plus side it simplifies
+			 * parsing of resulting logs.
+			 */
+			pipe_debug_message(debug, SHADER_INFO,
+					   "Shader Disassembly Begin");
 
-	si_shader_binary_read_config(sscreen, shader, 0);
-	r = si_shader_binary_upload(sscreen, shader);
-	if (r)
-		return r;
-
-	if (dump) {
-		if (!(sscreen->b.debug_flags & DBG_NO_ASM)) {
-			if (binary->disasm_string) {
-				fprintf(stderr, "\nShader Disassembly:\n\n");
-				fprintf(stderr, "%s\n", binary->disasm_string);
-			} else {
-				fprintf(stderr, "SI CODE:\n");
-				for (i = 0; i < binary->code_size; i+=4 ) {
-					fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3],
-					binary->code[i + 2], binary->code[i + 1],
-					binary->code[i]);
+			line = binary->disasm_string;
+			while (*line) {
+				p = strchrnul(line, '\n');
+				count = p - line;
+
+				if (count) {
+					pipe_debug_message(debug, SHADER_INFO,
+							   "%.*s", count, line);
 				}
+
+				if (!*p)
+					break;
+				line = p + 1;
 			}
+
+			pipe_debug_message(debug, SHADER_INFO,
+					   "Shader Disassembly End");
+		}
+	} else {
+		fprintf(stderr, "SI CODE:\n");
+		for (i = 0; i < binary->code_size; i += 4) {
+			fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i,
+				binary->code[i + 3], binary->code[i + 2],
+				binary->code[i + 1], binary->code[i]);
 		}
+	}
+}
 
+static void si_shader_dump_stats(struct si_screen *sscreen,
+			         struct si_shader_config *conf,
+				 unsigned code_size,
+			         struct pipe_debug_callback *debug,
+			         unsigned processor)
+{
+	if (r600_can_dump_shader(&sscreen->b, processor)) {
 		fprintf(stderr, "*** SHADER STATS ***\n"
 			"SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n"
 			"Scratch: %d bytes per wave\n********************\n",
-			shader->num_sgprs, shader->num_vgprs, binary->code_size,
-			shader->lds_size, shader->scratch_bytes_per_wave);
+			conf->num_sgprs, conf->num_vgprs, code_size,
+			conf->lds_size, conf->scratch_bytes_per_wave);
 	}
-	return 0;
+
+	pipe_debug_message(debug, SHADER_INFO,
+			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d",
+			   conf->num_sgprs, conf->num_vgprs, code_size,
+			   conf->lds_size, conf->scratch_bytes_per_wave);
+}
+
+void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
+		    struct pipe_debug_callback *debug, unsigned processor)
+{
+	if (r600_can_dump_shader(&sscreen->b, processor))
+		if (!(sscreen->b.debug_flags & DBG_NO_ASM))
+			si_shader_dump_disassembly(&shader->binary, debug);
+
+	si_shader_dump_stats(sscreen, &shader->config,
+			     shader->binary.code_size, debug, processor);
 }
 
-int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
-		    LLVMTargetMachineRef tm, LLVMModuleRef mod)
+int si_compile_llvm(struct si_screen *sscreen,
+		    struct radeon_shader_binary *binary,
+		    struct si_shader_config *conf,
+		    LLVMTargetMachineRef tm,
+		    LLVMModuleRef mod,
+		    struct pipe_debug_callback *debug,
+		    unsigned processor)
 {
 	int r = 0;
-	bool dump_asm = r600_can_dump_shader(&sscreen->b,
-				shader->selector ? shader->selector->tokens : NULL);
-	bool dump_ir = dump_asm && !(sscreen->b.debug_flags & DBG_NO_IR);
 	unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
 
-	if (dump_ir || dump_asm)
+	if (r600_can_dump_shader(&sscreen->b, processor)) {
 		fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
 
-	if (!si_replace_shader(count, &shader->binary)) {
-		r = radeon_llvm_compile(mod, &shader->binary,
-			r600_get_llvm_processor_name(sscreen->b.family), dump_ir, dump_asm, tm);
+		if (!(sscreen->b.debug_flags & DBG_NO_IR))
+			LLVMDumpModule(mod);
+	}
+
+	if (!si_replace_shader(count, binary)) {
+		r = radeon_llvm_compile(mod, binary,
+			r600_get_llvm_processor_name(sscreen->b.family), tm,
+			debug);
 		if (r)
 			return r;
 	}
 
-	r = si_shader_binary_read(sscreen, shader);
+	si_shader_binary_read_config(binary, conf, 0);
 
-	FREE(shader->binary.config);
-	FREE(shader->binary.rodata);
-	FREE(shader->binary.global_symbol_offsets);
-	if (shader->scratch_bytes_per_wave == 0) {
-		FREE(shader->binary.code);
-		FREE(shader->binary.relocs);
-		memset(&shader->binary, 0,
-		       offsetof(struct radeon_shader_binary, disasm_string));
-	}
+	FREE(binary->config);
+	FREE(binary->global_symbol_offsets);
+	binary->config = NULL;
+	binary->global_symbol_offsets = NULL;
 	return r;
 }
 
 /* Generate code for the hardware VS shader stage to go with a geometry shader */
 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 				      struct si_shader_context *si_shader_ctx,
-				      struct si_shader *gs, bool dump)
+				      struct si_shader *gs, bool dump,
+				      struct pipe_debug_callback *debug)
 {
 	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
 	struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
@@ -3979,8 +3988,15 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 	if (dump)
 		fprintf(stderr, "Copy Vertex Shader for Geometry Shader:\n\n");
 
-	r = si_compile_llvm(sscreen, si_shader_ctx->shader,
-			    si_shader_ctx->tm, bld_base->base.gallivm->module);
+	r = si_compile_llvm(sscreen, &si_shader_ctx->shader->binary,
+			    &si_shader_ctx->shader->config, si_shader_ctx->tm,
+			    bld_base->base.gallivm->module,
+			    debug, TGSI_PROCESSOR_GEOMETRY);
+	if (!r) {
+		si_shader_dump(sscreen, si_shader_ctx->shader, debug,
+			       TGSI_PROCESSOR_GEOMETRY);
+		r = si_shader_binary_upload(sscreen, si_shader_ctx->shader);
+	}
 
 	radeon_llvm_dispose(&si_shader_ctx->radeon_bld);
 
@@ -4034,7 +4050,8 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 }
 
 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
-		     struct si_shader *shader)
+		     struct si_shader *shader,
+		     struct pipe_debug_callback *debug)
 {
 	struct si_shader_selector *sel = shader->selector;
 	struct tgsi_token *tokens = sel->tokens;
@@ -4045,11 +4062,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	int r = 0;
 	bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT &&
 			    shader->key.ps.poly_stipple;
-	bool dump = r600_can_dump_shader(&sscreen->b, sel->tokens);
+	bool dump = r600_can_dump_shader(&sscreen->b, sel->info.processor);
 
 	if (poly_stipple) {
 		tokens = util_pstipple_create_fragment_shader(tokens, NULL,
-						SI_POLY_STIPPLE_SAMPLER);
+						SI_POLY_STIPPLE_SAMPLER,
+						TGSI_FILE_INPUT);
 		tgsi_scan_shader(tokens, &stipple_shader_info);
 	}
 
@@ -4070,9 +4088,6 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	if (sel->type != PIPE_SHADER_COMPUTE)
 		shader->dx10_clamp_mode = true;
 
-	if (sel->info.uses_kill)
-		shader->db_shader_control |= S_02880C_KILL_ENABLE(1);
-
 	shader->uses_instanceid = sel->info.uses_instanceid;
 	bld_base->info = poly_stipple ? &stipple_shader_info : &sel->info;
 	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
@@ -4147,17 +4162,6 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	case TGSI_PROCESSOR_FRAGMENT:
 		si_shader_ctx.radeon_bld.load_input = declare_input_fs;
 		bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
-
-		switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) {
-		case TGSI_FS_DEPTH_LAYOUT_GREATER:
-			shader->db_shader_control |=
-				S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
-			break;
-		case TGSI_FS_DEPTH_LAYOUT_LESS:
-			shader->db_shader_control |=
-				S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
-			break;
-		}
 		break;
 	default:
 		assert(!"Unsupported shader type");
@@ -4188,12 +4192,21 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	radeon_llvm_finalize_module(&si_shader_ctx.radeon_bld);
 
 	mod = bld_base->base.gallivm->module;
-	r = si_compile_llvm(sscreen, shader, tm, mod);
+	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
+			    mod, debug, si_shader_ctx.type);
 	if (r) {
 		fprintf(stderr, "LLVM failed to compile shader\n");
 		goto out;
 	}
 
+	si_shader_dump(sscreen, shader, debug, si_shader_ctx.type);
+
+	r = si_shader_binary_upload(sscreen, shader);
+	if (r) {
+		fprintf(stderr, "LLVM failed to upload shader\n");
+		goto out;
+	}
+
 	radeon_llvm_dispose(&si_shader_ctx.radeon_bld);
 
 	if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) {
@@ -4202,7 +4215,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		shader->gs_copy_shader->key = shader->key;
 		si_shader_ctx.shader = shader->gs_copy_shader;
 		if ((r = si_generate_gs_copy_shader(sscreen, &si_shader_ctx,
-						    shader, dump))) {
+						    shader, dump, debug))) {
 			free(shader->gs_copy_shader);
 			shader->gs_copy_shader = NULL;
 			goto out;
@@ -4217,6 +4230,14 @@ out:
 	return r;
 }
 
+void si_shader_destroy_binary(struct radeon_shader_binary *binary)
+{
+	FREE(binary->code);
+	FREE(binary->rodata);
+	FREE(binary->relocs);
+	FREE(binary->disasm_string);
+}
+
 void si_shader_destroy(struct si_shader *shader)
 {
 	if (shader->gs_copy_shader) {
@@ -4228,8 +4249,5 @@ void si_shader_destroy(struct si_shader *shader)
 		r600_resource_reference(&shader->scratch_bo, NULL);
 
 	r600_resource_reference(&shader->bo, NULL);
-
-	FREE(shader->binary.code);
-	FREE(shader->binary.relocs);
-	FREE(shader->binary.disasm_string);
+	si_shader_destroy_binary(&shader->binary);
 }
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index b0c8680ecb3..1635358d505 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -76,10 +76,10 @@ struct radeon_shader_binary;
 struct radeon_shader_reloc;
 
 #define SI_SGPR_RW_BUFFERS	0  /* rings (& stream-out, VS only) */
-#define SI_SGPR_CONST		2
-#define SI_SGPR_SAMPLER		4
-#define SI_SGPR_RESOURCE	6
-#define SI_SGPR_VERTEX_BUFFER	8  /* VS only */
+#define SI_SGPR_CONST_BUFFERS	2
+#define SI_SGPR_SAMPLER_STATES	4
+#define SI_SGPR_SAMPLER_VIEWS	6
+#define SI_SGPR_VERTEX_BUFFERS	8  /* VS only */
 #define SI_SGPR_BASE_VERTEX	10 /* VS only */
 #define SI_SGPR_START_INSTANCE	11 /* VS only */
 #define SI_SGPR_VS_STATE_BITS	12 /* VS(VS) only */
@@ -101,12 +101,12 @@ struct radeon_shader_reloc;
 
 /* LLVM function parameter indices */
 #define SI_PARAM_RW_BUFFERS	0
-#define SI_PARAM_CONST		1
-#define SI_PARAM_SAMPLER	2
-#define SI_PARAM_RESOURCE	3
+#define SI_PARAM_CONST_BUFFERS	1
+#define SI_PARAM_SAMPLER_STATES	2
+#define SI_PARAM_SAMPLER_VIEWS	3
 
 /* VS only parameters */
-#define SI_PARAM_VERTEX_BUFFER	4
+#define SI_PARAM_VERTEX_BUFFERS	4
 #define SI_PARAM_BASE_VERTEX	5
 #define SI_PARAM_START_INSTANCE	6
 /* [0] = clamp vertex color */
@@ -201,6 +201,7 @@ struct si_shader_selector {
 	bool		forces_persample_interp_for_persp;
 	bool		forces_persample_interp_for_linear;
 
+	/* GS parameters. */
 	unsigned	esgs_itemsize;
 	unsigned	gs_input_verts_per_prim;
 	unsigned	gs_output_prim;
@@ -210,6 +211,9 @@ struct si_shader_selector {
 	unsigned	gsvs_vertex_size;
 	unsigned	max_gsvs_emit_size;
 
+	/* PS parameters. */
+	unsigned	db_shader_control;
+
 	/* masks of "get_unique_index" bits */
 	uint64_t	outputs_written;
 	uint32_t	patch_outputs_written;
@@ -258,6 +262,17 @@ union si_shader_key {
 	} tes; /* tessellation evaluation shader */
 };
 
+struct si_shader_config {
+	unsigned			num_sgprs;
+	unsigned			num_vgprs;
+	unsigned			lds_size;
+	unsigned			spi_ps_input_ena;
+	unsigned			float_mode;
+	unsigned			scratch_bytes_per_wave;
+	unsigned			rsrc1;
+	unsigned			rsrc2;
+};
+
 struct si_shader {
 	struct si_shader_selector	*selector;
 	struct si_shader		*next_variant;
@@ -266,18 +281,9 @@ struct si_shader {
 	struct si_pm4_state		*pm4;
 	struct r600_resource		*bo;
 	struct r600_resource		*scratch_bo;
-	struct radeon_shader_binary	binary;
-	unsigned			num_sgprs;
-	unsigned			num_vgprs;
-	unsigned			lds_size;
-	unsigned			spi_ps_input_ena;
-	unsigned			float_mode;
-	unsigned			scratch_bytes_per_wave;
-	unsigned			spi_shader_col_format;
-	unsigned			spi_shader_z_format;
-	unsigned			db_shader_control;
-	unsigned			cb_shader_mask;
 	union si_shader_key		key;
+	struct radeon_shader_binary	binary;
+	struct si_shader_config		config;
 
 	unsigned		nparam;
 	unsigned		vs_output_param_offset[PIPE_MAX_SHADER_OUTPUTS];
@@ -288,9 +294,6 @@ struct si_shader {
 	unsigned		nr_param_exports;
 	bool			is_gs_copy_shader;
 	bool			dx10_clamp_mode; /* convert NaNs to 0 */
-
-	unsigned		rsrc1;
-	unsigned		rsrc2;
 };
 
 static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
@@ -327,19 +330,27 @@ static inline bool si_vs_exports_prim_id(struct si_shader *shader)
 
 /* radeonsi_shader.c */
 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
-		     struct si_shader *shader);
+		     struct si_shader *shader,
+		     struct pipe_debug_callback *debug);
 void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f);
-int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
-		    LLVMTargetMachineRef tm, LLVMModuleRef mod);
+int si_compile_llvm(struct si_screen *sscreen,
+		    struct radeon_shader_binary *binary,
+		    struct si_shader_config *conf,
+		    LLVMTargetMachineRef tm,
+		    LLVMModuleRef mod,
+		    struct pipe_debug_callback *debug,
+		    unsigned processor);
 void si_shader_destroy(struct si_shader *shader);
+void si_shader_destroy_binary(struct radeon_shader_binary *binary);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
-int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader);
+void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
+		    struct pipe_debug_callback *debug, unsigned processor);
 void si_shader_apply_scratch_relocs(struct si_context *sctx,
 			struct si_shader *shader,
 			uint64_t scratch_va);
-void si_shader_binary_read_config(const struct si_screen *sscreen,
-				struct si_shader *shader,
-				unsigned symbol_offset);
+void si_shader_binary_read_config(struct radeon_shader_binary *binary,
+				  struct si_shader_config *conf,
+				  unsigned symbol_offset);
 
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index e5500111f43..91ccd073267 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -163,7 +163,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
 
 	lds_size = output_patch0_offset + output_patch_size * *num_patches;
-	ls_rsrc2 = ls->current->rsrc2;
+	ls_rsrc2 = ls->current->config.rsrc2;
 
 	if (sctx->b.chip_class >= CIK) {
 		assert(lds_size <= 65536);
@@ -178,7 +178,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
 		radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
 	radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
-	radeon_emit(cs, ls->current->rsrc1);
+	radeon_emit(cs, ls->current->config.rsrc1);
 	radeon_emit(cs, ls_rsrc2);
 
 	/* Compute userdata SGPRs. */
@@ -818,7 +818,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 			si_get_draw_start_count(sctx, info, &start, &count);
 			start_offset = start * ib.index_size;
 
-			u_upload_alloc(sctx->b.uploader, start_offset, count * 2,
+			u_upload_alloc(sctx->b.uploader, start_offset, count * 2, 256,
 				       &out_offset, &out_buffer, &ptr);
 			if (!out_buffer) {
 				pipe_resource_reference(&ib.buffer, NULL);
@@ -842,7 +842,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 			start_offset = start * ib.index_size;
 
 			u_upload_data(sctx->b.uploader, start_offset, count * ib.index_size,
-				      (char*)ib.user_buffer + start_offset,
+				      256, (char*)ib.user_buffer + start_offset,
 				      &ib.offset, &ib.buffer);
 			if (!ib.buffer)
 				return;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 8700590435f..64adf699604 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -111,7 +111,7 @@ static void si_shader_ls(struct si_shader *shader)
 	vgpr_comp_cnt = shader->uses_instanceid ? 3 : 1;
 
 	num_user_sgprs = SI_LS_NUM_USER_SGPR;
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	if (num_user_sgprs > num_sgprs) {
 		/* Last 2 reserved SGPRs are used for VCC */
 		num_sgprs = num_user_sgprs + 2;
@@ -121,12 +121,12 @@ static void si_shader_ls(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
 	si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40);
 
-	shader->rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) |
+	shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) |
 			   S_00B528_SGPRS((num_sgprs - 1) / 8) |
 		           S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt) |
 			   S_00B528_DX10_CLAMP(shader->dx10_clamp_mode);
-	shader->rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
-			   S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0);
+	shader->config.rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
+			   S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
 }
 
 static void si_shader_hs(struct si_shader *shader)
@@ -143,7 +143,7 @@ static void si_shader_hs(struct si_shader *shader)
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
 
 	num_user_sgprs = SI_TCS_NUM_USER_SGPR;
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	/* One SGPR after user SGPRs is pre-loaded with tessellation factor
 	 * buffer offset. */
 	if ((num_user_sgprs + 1) > num_sgprs) {
@@ -155,12 +155,12 @@ static void si_shader_hs(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
 	si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 40);
 	si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
-		       S_00B428_VGPRS((shader->num_vgprs - 1) / 4) |
+		       S_00B428_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B428_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B428_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
 		       S_00B42C_USER_SGPR(num_user_sgprs) |
-		       S_00B42C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+		       S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_es(struct si_shader *shader)
@@ -187,7 +187,7 @@ static void si_shader_es(struct si_shader *shader)
 	} else
 		unreachable("invalid shader selector type");
 
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	/* One SGPR after user SGPRs is pre-loaded with es2gs_offset */
 	if ((num_user_sgprs + 1) > num_sgprs) {
 		/* Last 2 reserved SGPRs are used for VCC */
@@ -200,13 +200,13 @@ static void si_shader_es(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
 	si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
 	si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
-		       S_00B328_VGPRS((shader->num_vgprs - 1) / 4) |
+		       S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B328_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
 		       S_00B328_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
 		       S_00B32C_USER_SGPR(num_user_sgprs) |
-		       S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+		       S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 
 	if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
 		si_set_tesseval_regs(shader, pm4);
@@ -272,7 +272,7 @@ static void si_shader_gs(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40);
 
 	num_user_sgprs = SI_GS_NUM_USER_SGPR;
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	/* Two SGPRs after user SGPRs are pre-loaded with gs2vs_offset, gs_wave_id */
 	if ((num_user_sgprs + 2) > num_sgprs) {
 		/* Last 2 reserved SGPRs are used for VCC */
@@ -281,12 +281,12 @@ static void si_shader_gs(struct si_shader *shader)
 	assert(num_sgprs <= 104);
 
 	si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
-		       S_00B228_VGPRS((shader->num_vgprs - 1) / 4) |
+		       S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B228_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B228_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
 		       S_00B22C_USER_SGPR(num_user_sgprs) |
-		       S_00B22C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+		       S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_vs(struct si_shader *shader)
@@ -329,7 +329,7 @@ static void si_shader_vs(struct si_shader *shader)
 	} else
 		unreachable("invalid shader selector type");
 
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	if (num_user_sgprs > num_sgprs) {
 		/* Last 2 reserved SGPRs are used for VCC */
 		num_sgprs = num_user_sgprs + 2;
@@ -356,7 +356,7 @@ static void si_shader_vs(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
 	si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, va >> 40);
 	si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS,
-		       S_00B128_VGPRS((shader->num_vgprs - 1) / 4) |
+		       S_00B128_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B128_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) |
 		       S_00B128_DX10_CLAMP(shader->dx10_clamp_mode));
@@ -367,7 +367,7 @@ static void si_shader_vs(struct si_shader *shader)
 		       S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
 		       S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
 		       S_00B12C_SO_EN(!!shader->selector->so.num_outputs) |
-		       S_00B12C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+		       S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 	if (window_space)
 		si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL,
 			       S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1));
@@ -387,6 +387,8 @@ static void si_shader_ps(struct si_shader *shader)
 	struct tgsi_shader_info *info = &shader->selector->info;
 	struct si_pm4_state *pm4;
 	unsigned i, spi_ps_in_control;
+	unsigned spi_shader_col_format = 0, cb_shader_mask = 0;
+	unsigned colors_written, export_16bpc;
 	unsigned num_sgprs, num_user_sgprs;
 	unsigned spi_baryc_cntl = 0;
 	uint64_t va;
@@ -422,19 +424,43 @@ static void si_shader_ps(struct si_shader *shader)
 		}
 	}
 
-	has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->spi_ps_input_ena) ||
-		       G_0286CC_LINEAR_CENTROID_ENA(shader->spi_ps_input_ena);
+	/* Find out what SPI_SHADER_COL_FORMAT and CB_SHADER_MASK should be. */
+	colors_written = info->colors_written;
+	export_16bpc = shader->key.ps.export_16bpc;
+
+	if (info->colors_written == 0x1 &&
+	    info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) {
+		colors_written |= (1 << (shader->key.ps.last_cbuf + 1)) - 1;
+	}
+
+	while (colors_written) {
+		i = u_bit_scan(&colors_written);
+		if (export_16bpc & (1 << i))
+			spi_shader_col_format |= V_028714_SPI_SHADER_FP16_ABGR << (4 * i);
+		else
+			spi_shader_col_format |= V_028714_SPI_SHADER_32_ABGR << (4 * i);
+		cb_shader_mask |= 0xf << (4 * i);
+	}
+
+	/* Set interpolation controls. */
+	has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena) ||
+		       G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena);
 
 	spi_ps_in_control = S_0286D8_NUM_INTERP(shader->nparam) |
 			    S_0286D8_BC_OPTIMIZE_DISABLE(has_centroid);
 
+	/* Set registers. */
 	si_pm4_set_reg(pm4, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl);
 	si_pm4_set_reg(pm4, R_0286D8_SPI_PS_IN_CONTROL, spi_ps_in_control);
 
-	si_pm4_set_reg(pm4, R_028710_SPI_SHADER_Z_FORMAT, shader->spi_shader_z_format);
-	si_pm4_set_reg(pm4, R_028714_SPI_SHADER_COL_FORMAT,
-		       shader->spi_shader_col_format);
-	si_pm4_set_reg(pm4, R_02823C_CB_SHADER_MASK, shader->cb_shader_mask);
+	si_pm4_set_reg(pm4, R_028710_SPI_SHADER_Z_FORMAT,
+		       info->writes_samplemask ? V_028710_SPI_SHADER_32_ABGR :
+		       info->writes_stencil ? V_028710_SPI_SHADER_32_GR :
+		       info->writes_z ? V_028710_SPI_SHADER_32_R :
+		       V_028710_SPI_SHADER_ZERO);
+
+	si_pm4_set_reg(pm4, R_028714_SPI_SHADER_COL_FORMAT, spi_shader_col_format);
+	si_pm4_set_reg(pm4, R_02823C_CB_SHADER_MASK, cb_shader_mask);
 
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
@@ -442,7 +468,7 @@ static void si_shader_ps(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, va >> 40);
 
 	num_user_sgprs = SI_PS_NUM_USER_SGPR;
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	/* One SGPR after user SGPRs is pre-loaded with {prim_mask, lds_offset} */
 	if ((num_user_sgprs + 1) > num_sgprs) {
 		/* Last 2 reserved SGPRs are used for VCC */
@@ -451,13 +477,13 @@ static void si_shader_ps(struct si_shader *shader)
 	assert(num_sgprs <= 104);
 
 	si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS,
-		       S_00B028_VGPRS((shader->num_vgprs - 1) / 4) |
+		       S_00B028_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B028_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B028_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
-		       S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) |
+		       S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) |
 		       S_00B02C_USER_SGPR(num_user_sgprs) |
-		       S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+		       S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_init_pm4_state(struct si_shader *shader)
@@ -496,6 +522,16 @@ static void si_shader_init_pm4_state(struct si_shader *shader)
 	}
 }
 
+static unsigned si_get_alpha_test_func(struct si_context *sctx)
+{
+	/* Alpha-test should be disabled if colorbuffer 0 is integer. */
+	if (sctx->queued.named.dsa &&
+	    !sctx->framebuffer.cb0_is_integer)
+		return sctx->queued.named.dsa->alpha_func;
+
+	return PIPE_FUNC_ALWAYS;
+}
+
 /* Compute the key for the hw shader variant */
 static inline void si_shader_selector_key(struct pipe_context *ctx,
 					  struct si_shader_selector *sel,
@@ -537,8 +573,10 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 	case PIPE_SHADER_FRAGMENT: {
 		struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 
-		if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS])
+		if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
+		    sel->info.colors_written == 0x1)
 			key->ps.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
+
 		key->ps.export_16bpc = sctx->framebuffer.export_16bpc;
 
 		if (rs) {
@@ -562,11 +600,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 			key->ps.clamp_color = rs->clamp_fragment_color;
 		}
 
-		key->ps.alpha_func = PIPE_FUNC_ALWAYS;
-		/* Alpha-test should be disabled if colorbuffer 0 is integer. */
-		if (sctx->queued.named.dsa &&
-		    !sctx->framebuffer.cb0_is_integer)
-			key->ps.alpha_func = sctx->queued.named.dsa->alpha_func;
+		key->ps.alpha_func = si_get_alpha_test_func(sctx);
 		break;
 	}
 	default:
@@ -616,7 +650,7 @@ static int si_shader_select(struct pipe_context *ctx,
 	shader->selector = sel;
 	shader->key = key;
 
-	r = si_shader_create(sctx->screen, sctx->tm, shader);
+	r = si_shader_create(sctx->screen, sctx->tm, shader, &sctx->b.debug);
 	if (unlikely(r)) {
 		R600_ERR("Failed to build shader variant (type=%u) %d\n",
 			 sel->type, r);
@@ -731,6 +765,25 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 		break;
 	}
 
+	/* DB_SHADER_CONTROL */
+	sel->db_shader_control =
+		S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) |
+		S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(sel->info.writes_stencil) |
+		S_02880C_MASK_EXPORT_ENABLE(sel->info.writes_samplemask) |
+		S_02880C_KILL_ENABLE(sel->info.uses_kill);
+
+	switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) {
+	case TGSI_FS_DEPTH_LAYOUT_GREATER:
+		sel->db_shader_control |=
+			S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
+		break;
+	case TGSI_FS_DEPTH_LAYOUT_LESS:
+		sel->db_shader_control |=
+			S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
+		break;
+	}
+
+	/* Pre-compilation. */
 	if (sscreen->b.debug_flags & DBG_PRECOMPILE) {
 		struct si_shader_ctx_state state = {sel};
 
@@ -987,7 +1040,7 @@ static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom
 	if (!ps)
 		return;
 
-	input_ena = ps->spi_ps_input_ena;
+	input_ena = ps->config.spi_ps_input_ena;
 
 	/* we need to enable at least one of them, otherwise we hang the GPU */
 	assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) ||
@@ -1216,7 +1269,7 @@ static int si_update_scratch_buffer(struct si_context *sctx,
 		return 0;
 
 	/* This shader doesn't need a scratch buffer */
-	if (shader->scratch_bytes_per_wave == 0)
+	if (shader->config.scratch_bytes_per_wave == 0)
 		return 0;
 
 	/* This shader is already configured to use the current
@@ -1248,7 +1301,7 @@ static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx)
 
 static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader)
 {
-	return shader ? shader->scratch_bytes_per_wave : 0;
+	return shader ? shader->config.scratch_bytes_per_wave : 0;
 }
 
 static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)
@@ -1549,6 +1602,10 @@ bool si_update_shaders(struct si_context *sctx)
 	si_update_vgt_shader_config(sctx);
 
 	if (sctx->ps_shader.cso) {
+		unsigned db_shader_control =
+			sctx->ps_shader.cso->db_shader_control |
+			S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS);
+
 		r = si_shader_select(ctx, &sctx->ps_shader);
 		if (r)
 			return false;
@@ -1568,8 +1625,8 @@ bool si_update_shaders(struct si_context *sctx)
 			si_mark_atom_dirty(sctx, &sctx->spi_ps_input);
 		}
 
-		if (sctx->ps_db_shader_control != sctx->ps_shader.current->db_shader_control) {
-			sctx->ps_db_shader_control = sctx->ps_shader.current->db_shader_control;
+		if (sctx->ps_db_shader_control != db_shader_control) {
+			sctx->ps_db_shader_control = db_shader_control;
 			si_mark_atom_dirty(sctx, &sctx->db_render_state);
 		}
 
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 8e5e24217a5..d5c4aaae638 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -37,6 +37,7 @@
 #include "draw/draw_vertex.h"
 
 #include "sp_quad_pipe.h"
+#include "sp_setup.h"
 
 
 /** Do polygon stipple in the draw module? */
@@ -117,17 +118,17 @@ struct softpipe_context {
    unsigned const_buffer_size[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS];
 
    /** Vertex format */
+   struct sp_setup_info setup_info;
    struct vertex_info vertex_info;
-   struct vertex_info vertex_info_vbuf;
 
    /** Which vertex shader output slot contains point size */
-   int psize_slot;
+   int8_t psize_slot;
 
    /** Which vertex shader output slot contains viewport index */
-   int viewport_index_slot;
+   int8_t viewport_index_slot;
 
    /** Which vertex shader output slot contains layer */
-   int layer_slot;
+   int8_t layer_slot;
 
    /** The reduced version of the primitive supplied by the state tracker */
    unsigned reduced_api_prim;
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index f8a3eacdb37..95d1ac1514f 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -161,7 +161,7 @@ sp_vbuf_draw_elements(struct vbuf_render *vbr, const ushort *indices, uint nr)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
    struct softpipe_context *softpipe = cvbr->softpipe;
-   const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
+   const unsigned stride = softpipe->vertex_info.size * sizeof(float);
    const void *vertex_buffer = cvbr->vertex_buffer;
    struct setup_context *setup = cvbr->setup;
    const boolean flatshade_first = softpipe->rasterizer->flatshade_first;
@@ -358,7 +358,7 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
    struct softpipe_context *softpipe = cvbr->softpipe;
    struct setup_context *setup = cvbr->setup;
-   const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
+   const unsigned stride = softpipe->vertex_info.size * sizeof(float);
    const void *vertex_buffer =
       (void *) get_vert(cvbr->vertex_buffer, start, stride);
    const boolean flatshade_first = softpipe->rasterizer->flatshade_first;
diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c
index 76105b4c0ec..c28d28d5f5d 100644
--- a/src/gallium/drivers/softpipe/sp_query.c
+++ b/src/gallium/drivers/softpipe/sp_query.c
@@ -223,7 +223,7 @@ softpipe_get_query_result(struct pipe_context *pipe,
       break;
    case PIPE_QUERY_PIPELINE_STATISTICS:
       memcpy(vresult, &sq->stats,
-             sizeof(struct pipe_query_data_pipeline_statistics));;
+             sizeof(struct pipe_query_data_pipeline_statistics));
       break;
    case PIPE_QUERY_GPU_FINISHED:
       vresult->b = TRUE;
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 9939720e259..29e392b94e8 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -251,6 +251,13 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index ac2d97825ce..ffe49260b9a 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -38,7 +38,6 @@
 #include "sp_setup.h"
 #include "sp_state.h"
 #include "draw/draw_context.h"
-#include "draw/draw_vertex.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
@@ -599,10 +598,12 @@ setup_tri_coefficients(struct setup_context *setup)
 {
    struct softpipe_context *softpipe = setup->softpipe;
    const struct tgsi_shader_info *fsInfo = &setup->softpipe->fs_variant->info;
-   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
+   const struct sp_setup_info *sinfo = &softpipe->setup_info;
    uint fragSlot;
    float v[3];
 
+   assert(sinfo->valid);
+
    /* z and w are done by linear interpolation:
     */
    v[0] = setup->vmin[0][2];
@@ -618,15 +619,16 @@ setup_tri_coefficients(struct setup_context *setup)
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < fsInfo->num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+      const uint vertSlot = sinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->attrib[fragSlot].interp_mode) {
-      case INTERP_CONSTANT:
-         for (j = 0; j < TGSI_NUM_CHANNELS; j++)
+      switch (sinfo->attrib[fragSlot].interp) {
+      case SP_INTERP_CONSTANT:
+         for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         }
          break;
-      case INTERP_LINEAR:
+      case SP_INTERP_LINEAR:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             tri_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
                                        setup->vmid[vertSlot][j],
@@ -636,7 +638,7 @@ setup_tri_coefficients(struct setup_context *setup)
             tri_linear_coeff(setup, &setup->coef[fragSlot], j, v);
          }
          break;
-      case INTERP_PERSPECTIVE:
+      case SP_INTERP_PERSPECTIVE:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             tri_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
                                        setup->vmid[vertSlot][j],
@@ -646,7 +648,7 @@ setup_tri_coefficients(struct setup_context *setup)
             tri_persp_coeff(setup, &setup->coef[fragSlot], j, v);
          }
          break;
-      case INTERP_POS:
+      case SP_INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
          break;
       default:
@@ -966,11 +968,13 @@ setup_line_coefficients(struct setup_context *setup,
 {
    struct softpipe_context *softpipe = setup->softpipe;
    const struct tgsi_shader_info *fsInfo = &setup->softpipe->fs_variant->info;
-   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
+   const struct sp_setup_info *sinfo = &softpipe->setup_info;
    uint fragSlot;
    float area;
    float v[2];
 
+   assert(sinfo->valid);
+
    /* use setup->vmin, vmax to point to vertices */
    if (softpipe->rasterizer->flatshade_first)
       setup->vprovoke = v0;
@@ -1001,15 +1005,15 @@ setup_line_coefficients(struct setup_context *setup,
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < fsInfo->num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+      const uint vertSlot = sinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->attrib[fragSlot].interp_mode) {
-      case INTERP_CONSTANT:
+      switch (sinfo->attrib[fragSlot].interp) {
+      case SP_INTERP_CONSTANT:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++)
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
          break;
-      case INTERP_LINEAR:
+      case SP_INTERP_LINEAR:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             line_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
                                         setup->vmax[vertSlot][j],
@@ -1018,7 +1022,7 @@ setup_line_coefficients(struct setup_context *setup,
             line_linear_coeff(setup, &setup->coef[fragSlot], j, v);
          }
          break;
-      case INTERP_PERSPECTIVE:
+      case SP_INTERP_PERSPECTIVE:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             line_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
                                         setup->vmax[vertSlot][j],
@@ -1027,7 +1031,7 @@ setup_line_coefficients(struct setup_context *setup,
             line_persp_coeff(setup, &setup->coef[fragSlot], j, v);
          }
          break;
-      case INTERP_POS:
+      case SP_INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
          break;
       default:
@@ -1236,7 +1240,7 @@ sp_setup_point(struct setup_context *setup,
    const boolean round = (boolean) setup->softpipe->rasterizer->point_smooth;
    const float x = v0[0][0];  /* Note: data[0] is always position */
    const float y = v0[0][1];
-   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
+   const struct sp_setup_info *sinfo = &softpipe->setup_info;
    uint fragSlot;
    uint layer = 0;
    unsigned viewport_index = 0;
@@ -1245,6 +1249,8 @@ sp_setup_point(struct setup_context *setup,
    print_vertex(setup, v0);
 #endif
 
+   assert(sinfo->valid);
+
    if (setup->softpipe->no_rast || setup->softpipe->rasterizer->rasterizer_discard)
       return;
 
@@ -1285,22 +1291,22 @@ sp_setup_point(struct setup_context *setup,
    const_coeff(setup, &setup->posCoef, 0, 3);
 
    for (fragSlot = 0; fragSlot < fsInfo->num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+      const uint vertSlot = sinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->attrib[fragSlot].interp_mode) {
-      case INTERP_CONSTANT:
+      switch (sinfo->attrib[fragSlot].interp) {
+      case SP_INTERP_CONSTANT:
          /* fall-through */
-      case INTERP_LINEAR:
+      case SP_INTERP_LINEAR:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++)
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
          break;
-      case INTERP_PERSPECTIVE:
+      case SP_INTERP_PERSPECTIVE:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++)
             point_persp_coeff(setup, setup->vprovoke,
                               &setup->coef[fragSlot], vertSlot, j);
          break;
-      case INTERP_POS:
+      case SP_INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
          break;
       default:
diff --git a/src/gallium/drivers/softpipe/sp_setup.h b/src/gallium/drivers/softpipe/sp_setup.h
index 191494acbb8..a54dc5dad0c 100644
--- a/src/gallium/drivers/softpipe/sp_setup.h
+++ b/src/gallium/drivers/softpipe/sp_setup.h
@@ -30,11 +30,30 @@
 struct setup_context;
 struct softpipe_context;
 
+/**
+ * Attribute interpolation mode
+ */
+enum sp_interp_mode {
+   SP_INTERP_POS,       /**< special case for frag position */
+   SP_INTERP_CONSTANT,
+   SP_INTERP_LINEAR,
+   SP_INTERP_PERSPECTIVE
+};
+
+
+struct sp_setup_info {
+   unsigned valid;
+   struct {
+      unsigned interp:8;      /**< SP_INTERP_X */
+      int src_index:8;
+   } attrib[PIPE_MAX_SHADER_OUTPUTS];
+};
+
 void 
-sp_setup_tri( struct setup_context *setup,
-	   const float (*v0)[4],
-	   const float (*v1)[4],
-	   const float (*v2)[4] );
+sp_setup_tri(struct setup_context *setup,
+             const float (*v0)[4],
+             const float (*v1)[4],
+             const float (*v2)[4]);
 
 void
 sp_setup_line(struct setup_context *setup,
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index c35534c931d..16a2897f526 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -175,9 +175,6 @@ softpipe_unmap_texture_surfaces(struct softpipe_context *sp);
 
 
 struct vertex_info *
-softpipe_get_vertex_info(struct softpipe_context *softpipe);
-
-struct vertex_info *
 softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe);
 
 
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index 7e998af1325..d4d03f1be50 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -48,7 +48,7 @@
 static void
 invalidate_vertex_layout(struct softpipe_context *softpipe)
 {
-   softpipe->vertex_info.num_attribs =  0;
+   softpipe->setup_info.valid =  0;
 }
 
 
@@ -57,50 +57,64 @@ invalidate_vertex_layout(struct softpipe_context *softpipe)
  * (simple float[][4]) used by the 'draw' module into vertices for
  * rasterization.
  *
- * This function validates the vertex layout and returns a pointer to a
- * vertex_info object.
+ * This function validates the vertex layout.
  */
-struct vertex_info *
-softpipe_get_vertex_info(struct softpipe_context *softpipe)
+static void
+softpipe_compute_vertex_info(struct softpipe_context *softpipe)
 {
-   struct vertex_info *vinfo = &softpipe->vertex_info;
-   int vs_index;
+   struct sp_setup_info *sinfo = &softpipe->setup_info;
 
-   if (vinfo->num_attribs == 0) {
-      /* compute vertex layout now */
+   if (sinfo->valid == 0) {
       const struct tgsi_shader_info *fsInfo = &softpipe->fs_variant->info;
-      struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf;
-      const uint num = draw_num_shader_outputs(softpipe->draw);
+      struct vertex_info *vinfo = &softpipe->vertex_info;
       uint i;
-
-      /* Tell draw_vbuf to simply emit the whole post-xform vertex
-       * as-is.  No longer any need to try and emit draw vertex_header
-       * info.
+      int vs_index;
+      /*
+       * This doesn't quite work right (wrt face injection, prim id,
+       * wide points) - hit a couple assertions, misrenderings plus
+       * memory corruption. Albeit could fix (the former two) by calling
+       * this "more often" (rasterizer changes etc.). (The latter would
+       * need to be included in draw_prepare_shader_outputs, but it looks
+       * like that would potentially allocate quite some unused additional
+       * vertex outputs.)
+       * draw_prepare_shader_outputs(softpipe->draw);
        */
-      vinfo_vbuf->num_attribs = 0;
-      for (i = 0; i < num; i++) {
-	 draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
-      }
-      draw_compute_vertex_size(vinfo_vbuf);
 
       /*
-       * Loop over fragment shader inputs, searching for the matching output
-       * from the vertex shader.
+       * Those can't actually be 0 (because pos is always at 0).
+       * But use ints anyway to avoid confusion (in vs outputs, they
+       * can very well be at pos 0).
        */
+      softpipe->viewport_index_slot = -1;
+      softpipe->layer_slot = -1;
+      softpipe->psize_slot = -1;
+
       vinfo->num_attribs = 0;
+
+      /*
+       * Put position always first (setup needs it there).
+       */
+      vs_index = draw_find_shader_output(softpipe->draw,
+                                         TGSI_SEMANTIC_POSITION, 0);
+
+      draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+
+      /*
+       * Match FS inputs against VS outputs, emitting the necessary
+       * attributes.
+       */
       for (i = 0; i < fsInfo->num_inputs; i++) {
-         int src;
-         enum interp_mode interp = INTERP_LINEAR;
+         enum sp_interp_mode interp = SP_INTERP_LINEAR;
 
          switch (fsInfo->input_interpolate[i]) {
          case TGSI_INTERPOLATE_CONSTANT:
-            interp = INTERP_CONSTANT;
+            interp = SP_INTERP_CONSTANT;
             break;
          case TGSI_INTERPOLATE_LINEAR:
-            interp = INTERP_LINEAR;
+            interp = SP_INTERP_LINEAR;
             break;
          case TGSI_INTERPOLATE_PERSPECTIVE:
-            interp = INTERP_PERSPECTIVE;
+            interp = SP_INTERP_PERSPECTIVE;
             break;
          case TGSI_INTERPOLATE_COLOR:
             assert(fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR);
@@ -111,88 +125,121 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
 
          switch (fsInfo->input_semantic_name[i]) {
          case TGSI_SEMANTIC_POSITION:
-            interp = INTERP_POS;
+            interp = SP_INTERP_POS;
             break;
 
          case TGSI_SEMANTIC_COLOR:
             if (fsInfo->input_interpolate[i] == TGSI_INTERPOLATE_COLOR) {
                if (softpipe->rasterizer->flatshade)
-                  interp = INTERP_CONSTANT;
+                  interp = SP_INTERP_CONSTANT;
                else
-                  interp = INTERP_PERSPECTIVE;
+                  interp = SP_INTERP_PERSPECTIVE;
             }
             break;
          }
 
-         /* this includes texcoords and varying vars */
-         src = draw_find_shader_output(softpipe->draw,
-                                       fsInfo->input_semantic_name[i],
-                                       fsInfo->input_semantic_index[i]);
-	 if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR && src == -1)
-	   /* try and find a bcolor */
-	   src = draw_find_shader_output(softpipe->draw,
-					 TGSI_SEMANTIC_BCOLOR, fsInfo->input_semantic_index[i]);
+         /*
+          * Search for each input in current vs output:
+          */
+         vs_index = draw_find_shader_output(softpipe->draw,
+                                            fsInfo->input_semantic_name[i],
+                                            fsInfo->input_semantic_index[i]);
+
+         if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR &&
+             vs_index == -1) {
+            /*
+             * try and find a bcolor.
+             * Note that if there's both front and back color, draw will
+             * have copied back to front color already.
+             */
+            vs_index = draw_find_shader_output(softpipe->draw,
+                                               TGSI_SEMANTIC_BCOLOR,
+                                               fsInfo->input_semantic_index[i]);
+         }
 
-         draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
+         sinfo->attrib[i].interp = interp;
+         /* extremely pointless index map */
+         sinfo->attrib[i].src_index = i + 1;
+         /*
+          * For vp index and layer, if the fs requires them but the vs doesn't
+          * provide them, draw (vbuf) will give us the required 0 (slot -1).
+          * (This means in this case we'll also use those slots in setup, which
+          * isn't necessary but they'll contain the correct (0) value.)
+          */
+         if (fsInfo->input_semantic_name[i] ==
+                    TGSI_SEMANTIC_VIEWPORT_INDEX) {
+            softpipe->viewport_index_slot = (int)vinfo->num_attribs;
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+         } else if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_LAYER) {
+            softpipe->layer_slot = (int)vinfo->num_attribs;
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+            /*
+             * Note that we'd actually want to skip position (as we won't use
+             * the attribute in the fs) but can't. The reason is that we don't
+             * actually have a input/output map for setup (even though it looks
+             * like we do...). Could adjust for this though even without a map.
+             */
+         } else {
+            /*
+             * Note that we'd actually want to skip position (as we won't use
+             * the attribute in the fs) but can't. The reason is that we don't
+             * actually have a input/output map for setup (even though it looks
+             * like we do...). Could adjust for this though even without a map.
+             */
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+         }
       }
 
-      /* Figure out if we need pointsize as well. */
+      /* Figure out if we need pointsize as well.
+       */
       vs_index = draw_find_shader_output(softpipe->draw,
                                          TGSI_SEMANTIC_PSIZE, 0);
 
       if (vs_index >= 0) {
-         softpipe->psize_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         softpipe->psize_slot = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
 
-      /* Figure out if we need viewport index */
-      vs_index = draw_find_shader_output(softpipe->draw,
-                                         TGSI_SEMANTIC_VIEWPORT_INDEX,
-                                         0);
-      if (vs_index >= 0) {
-         softpipe->viewport_index_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
-      } else {
-         softpipe->viewport_index_slot = 0;
+      /* Figure out if we need viewport index (if it wasn't already in fs input) */
+      if (softpipe->viewport_index_slot < 0) {
+         vs_index = draw_find_shader_output(softpipe->draw,
+                                            TGSI_SEMANTIC_VIEWPORT_INDEX,
+                                            0);
+         if (vs_index >= 0) {
+            softpipe->viewport_index_slot =(int)vinfo->num_attribs;
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+         }
       }
 
-      /* Figure out if we need layer */
-      vs_index = draw_find_shader_output(softpipe->draw,
-                                         TGSI_SEMANTIC_LAYER,
-                                         0);
-      if (vs_index >= 0) {
-         softpipe->layer_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
-      } else {
-         softpipe->layer_slot = 0;
+      /* Figure out if we need layer (if it wasn't already in fs input) */
+      if (softpipe->layer_slot < 0) {
+         vs_index = draw_find_shader_output(softpipe->draw,
+                                            TGSI_SEMANTIC_LAYER,
+                                            0);
+         if (vs_index >= 0) {
+            softpipe->layer_slot = (int)vinfo->num_attribs;
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+         }
       }
 
       draw_compute_vertex_size(vinfo);
+      softpipe->setup_info.valid = 1;
    }
-
-   return vinfo;
+   return;
 }
 
 
 /**
  * Called from vbuf module.
  *
- * Note that there's actually two different vertex layouts in softpipe.
- *
- * The normal one is computed in softpipe_get_vertex_info() above and is
- * used by the point/line/tri "setup" code.
- *
- * The other one (this one) is only used by the vbuf module (which is
- * not normally used by default but used in testing).  For the vbuf module,
- * we basically want to pass-through the draw module's vertex layout as-is.
- * When the softpipe vbuf code begins drawing, the normal vertex layout
- * will come into play again.
+ * This will trigger validation of the vertex layout (and also compute
+ * the required information for setup).
  */
 struct vertex_info *
 softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe)
 {
-   (void) softpipe_get_vertex_info(softpipe);
-   return &softpipe->vertex_info_vbuf;
+   softpipe_compute_vertex_info(softpipe);
+   return &softpipe->vertex_info;
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_state_shader.c b/src/gallium/drivers/softpipe/sp_state_shader.c
index dce0404de5b..f0d66a53ec6 100644
--- a/src/gallium/drivers/softpipe/sp_state_shader.c
+++ b/src/gallium/drivers/softpipe/sp_state_shader.c
@@ -64,7 +64,8 @@ create_fs_variant(struct softpipe_context *softpipe,
          /* get new shader that implements polygon stippling */
          var->tokens = 
             util_pstipple_create_fragment_shader(curfs->tokens,
-                                                 &var->stipple_sampler_unit, 0);
+                                                 &var->stipple_sampler_unit, 0,
+                                                 TGSI_FILE_INPUT);
       }
       else
 #endif
diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c
index 10442cb46e7..e45b3e72aeb 100644
--- a/src/gallium/drivers/svga/svga_cmd.c
+++ b/src/gallium/drivers/svga/svga_cmd.c
@@ -337,7 +337,7 @@ SVGA3D_DefineSurface2D(struct svga_winsys_context *swc,    // IN
    mipSizes[0].height = height;
    mipSizes[0].depth = 1;
 
-   swc->commit(swc);;
+   swc->commit(swc);
 
    return PIPE_OK;
 }
@@ -372,7 +372,7 @@ SVGA3D_DestroySurface(struct svga_winsys_context *swc,
    
    swc->surface_relocation(swc, &cmd->sid, NULL, sid,
                            SVGA_RELOC_WRITE | SVGA_RELOC_INTERNAL);
-   swc->commit(swc);;
+   swc->commit(swc);
 
    return PIPE_OK;
 }
@@ -473,6 +473,7 @@ SVGA3D_SurfaceDMA(struct svga_winsys_context *swc,
    pSuffix->flags = flags;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -543,6 +544,7 @@ SVGA3D_BufferDMA(struct svga_winsys_context *swc,
    pSuffix->flags = flags;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1016,7 +1018,7 @@ SVGA3D_BeginDrawPrimitives(struct svga_winsys_context *swc,
    *decls = declArray;
    *ranges = rangeArray;
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1720,6 +1722,7 @@ SVGA3D_UpdateGBImage(struct svga_winsys_context *swc,
    cmd->box = *box;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1746,6 +1749,7 @@ SVGA3D_UpdateGBSurface(struct svga_winsys_context *swc,
                            SVGA_RELOC_WRITE | SVGA_RELOC_INTERNAL);
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1775,6 +1779,7 @@ SVGA3D_ReadbackGBImage(struct svga_winsys_context *swc,
    cmd->image.mipmap = mipLevel;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1801,6 +1806,7 @@ SVGA3D_ReadbackGBSurface(struct svga_winsys_context *swc,
                            SVGA_RELOC_READ | SVGA_RELOC_INTERNAL);
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1829,6 +1835,7 @@ SVGA3D_ReadbackGBImagePartial(struct svga_winsys_context *swc,
    cmd->invertBox = invertBox;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
diff --git a/src/gallium/drivers/svga/svga_cmd_vgpu10.c b/src/gallium/drivers/svga/svga_cmd_vgpu10.c
index 5c121089f91..4cd9d5b9d1e 100644
--- a/src/gallium/drivers/svga/svga_cmd_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_cmd_vgpu10.c
@@ -535,7 +535,7 @@ SVGA3D_vgpu10_Draw(struct svga_winsys_context *swc,
 
    SVGA3D_COPY_BASIC_2(vertexCount, startVertexLocation);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -551,7 +551,7 @@ SVGA3D_vgpu10_DrawIndexed(struct svga_winsys_context *swc,
    SVGA3D_COPY_BASIC_3(indexCount, startIndexLocation,
                        baseVertexLocation);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -568,7 +568,7 @@ SVGA3D_vgpu10_DrawInstanced(struct svga_winsys_context *swc,
    SVGA3D_COPY_BASIC_4(vertexCountPerInstance, instanceCount,
                        startVertexLocation, startInstanceLocation);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -588,7 +588,7 @@ SVGA3D_vgpu10_DrawIndexedInstanced(struct svga_winsys_context *swc,
                        startInstanceLocation);
 
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -598,7 +598,7 @@ SVGA3D_vgpu10_DrawAuto(struct svga_winsys_context *swc)
 {
    SVGA3D_CREATE_COMMAND(DrawAuto, DRAW_AUTO);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c
index d407785ddd9..b10eb45e548 100644
--- a/src/gallium/drivers/svga/svga_context.c
+++ b/src/gallium/drivers/svga/svga_context.c
@@ -46,7 +46,6 @@
 #include "svga_winsys.h"
 
 #define CONST0_UPLOAD_DEFAULT_SIZE 65536
-#define CONST0_UPLOAD_ALIGNMENT 256
 
 DEBUG_GET_ONCE_BOOL_OPTION(no_swtnl, "SVGA_NO_SWTNL", FALSE)
 DEBUG_GET_ONCE_BOOL_OPTION(force_swtnl, "SVGA_FORCE_SWTNL", FALSE);
@@ -220,8 +219,8 @@ struct pipe_context *svga_context_create(struct pipe_screen *screen,
 
    svga->const0_upload = u_upload_create(&svga->pipe,
                                          CONST0_UPLOAD_DEFAULT_SIZE,
-                                         CONST0_UPLOAD_ALIGNMENT,
-                                         PIPE_BIND_CONSTANT_BUFFER);
+                                         PIPE_BIND_CONSTANT_BUFFER,
+                                         PIPE_USAGE_STREAM);
    if (!svga->const0_upload)
       goto cleanup;
 
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index 78e346a92b9..e4f29b8497e 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -74,6 +74,8 @@
  */
 #define SVGA_MAX_CONST_BUF_SIZE (4096 * 4 * sizeof(int))
 
+#define CONST0_UPLOAD_ALIGNMENT 256
+
 struct draw_vertex_shader;
 struct draw_fragment_shader;
 struct svga_shader_variant;
@@ -312,7 +314,7 @@ struct svga_hw_view_state
    struct svga_sampler_view *v;
    unsigned min_lod;
    unsigned max_lod;
-   int dirty;
+   boolean dirty;
 };
 
 /* Updated by calling svga_update_state( SVGA_STATE_HW_DRAW )
@@ -343,6 +345,11 @@ struct svga_hw_draw_state
    SVGA3dElementLayoutId layout_id;
    SVGA3dPrimitiveType topology;
 
+   /** Vertex buffer state */
+   SVGA3dVertexBuffer vbuffers[PIPE_MAX_ATTRIBS];
+   struct svga_winsys_surface *vbuffer_handles[PIPE_MAX_ATTRIBS];
+   unsigned num_vbuffers;
+
    struct svga_winsys_surface *ib;  /**< index buffer for drawing */
    SVGA3dSurfaceFormat ib_format;
    unsigned ib_offset;
diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c
index 2d3631d6f9c..80526ed4d15 100644
--- a/src/gallium/drivers/svga/svga_draw.c
+++ b/src/gallium/drivers/svga/svga_draw.c
@@ -517,11 +517,27 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
          buffers[i].offset = hwtnl->cmd.vbufs[i].buffer_offset;
       }
       if (vbuf_count > 0) {
-         ret = SVGA3D_vgpu10_SetVertexBuffers(svga->swc, vbuf_count,
-                                              0,    /* startBuffer */
-                                              buffers, vb_handle);
-         if (ret != PIPE_OK)
-            return ret;
+         /* If we haven't yet emitted a drawing command or if any
+          * vertex buffer state is changing, issue that state now.
+          */
+         if (((hwtnl->cmd.swc->hints & SVGA_HINT_FLAG_CAN_PRE_FLUSH) == 0) ||
+             vbuf_count != svga->state.hw_draw.num_vbuffers ||
+             memcmp(buffers, svga->state.hw_draw.vbuffers,
+                    vbuf_count * sizeof(buffers[0])) ||
+             memcmp(vb_handle, svga->state.hw_draw.vbuffer_handles,
+                    vbuf_count * sizeof(vb_handle[0]))) {
+            ret = SVGA3D_vgpu10_SetVertexBuffers(svga->swc, vbuf_count,
+                                                 0,    /* startBuffer */
+                                                 buffers, vb_handle);
+            if (ret != PIPE_OK)
+               return ret;
+
+            svga->state.hw_draw.num_vbuffers = vbuf_count;
+            memcpy(svga->state.hw_draw.vbuffers, buffers,
+                   vbuf_count * sizeof(buffers[0]));
+            memcpy(svga->state.hw_draw.vbuffer_handles, vb_handle,
+                   vbuf_count * sizeof(vb_handle[0]));
+         }
       }
    }
 
diff --git a/src/gallium/drivers/svga/svga_pipe_rasterizer.c b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
index fa1744fc33e..8e0db539574 100644
--- a/src/gallium/drivers/svga/svga_pipe_rasterizer.c
+++ b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
@@ -368,13 +368,16 @@ static void svga_bind_rasterizer_state( struct pipe_context *pipe,
    struct svga_context *svga = svga_context(pipe);
    struct svga_rasterizer_state *raster = (struct svga_rasterizer_state *)state;
 
+   if (!raster ||
+       !svga->curr.rast ||
+       raster->templ.poly_stipple_enable !=
+       svga->curr.rast->templ.poly_stipple_enable) {
+      svga->dirty |= SVGA_NEW_STIPPLE;
+   }
+
    svga->curr.rast = raster;
 
    svga->dirty |= SVGA_NEW_RAST;
-
-   if (raster && raster->templ.poly_stipple_enable) {
-      svga->dirty |= SVGA_NEW_STIPPLE;
-   }
 }
 
 static void
diff --git a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c
index 95241176510..3e778f0a087 100644
--- a/src/gallium/drivers/svga/svga_pipe_sampler.c
+++ b/src/gallium/drivers/svga/svga_pipe_sampler.c
@@ -287,6 +287,7 @@ svga_bind_sampler_states(struct pipe_context *pipe,
 {
    struct svga_context *svga = svga_context(pipe);
    unsigned i;
+   boolean any_change = FALSE;
 
    assert(shader < PIPE_SHADER_TYPES);
    assert(start + num <= PIPE_MAX_SAMPLERS);
@@ -295,8 +296,15 @@ svga_bind_sampler_states(struct pipe_context *pipe,
    if (!svga_have_vgpu10(svga) && shader != PIPE_SHADER_FRAGMENT)
       return;
 
-   for (i = 0; i < num; i++)
+   for (i = 0; i < num; i++) {
+      if (svga->curr.sampler[shader][start + i] != samplers[i])
+         any_change = TRUE;
       svga->curr.sampler[shader][start + i] = samplers[i];
+   }
+
+   if (!any_change) {
+      return;
+   }
 
    /* find highest non-null sampler[] entry */
    {
@@ -405,6 +413,7 @@ svga_set_sampler_views(struct pipe_context *pipe,
    unsigned flag_1d = 0;
    unsigned flag_srgb = 0;
    uint i;
+   boolean any_change = FALSE;
 
    assert(shader < PIPE_SHADER_TYPES);
    assert(start + num <= Elements(svga->curr.sampler_views[shader]));
@@ -422,6 +431,7 @@ svga_set_sampler_views(struct pipe_context *pipe,
          pipe_sampler_view_release(pipe, &svga->curr.sampler_views[shader][start + i]);
          pipe_sampler_view_reference(&svga->curr.sampler_views[shader][start + i],
                                      views[i]);
+         any_change = TRUE;
       }
 
       if (!views[i])
@@ -434,6 +444,10 @@ svga_set_sampler_views(struct pipe_context *pipe,
          flag_1d |= 1 << (start + i);
    }
 
+   if (!any_change) {
+      return;
+   }
+
    /* find highest non-null sampler_views[] entry */
    {
       unsigned j = MAX2(svga->curr.num_sampler_views[shader], start + num);
diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.c b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
index 8c5cff5abc1..7f7ceab0aa5 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer_upload.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
@@ -221,7 +221,7 @@ svga_buffer_upload_gb_command(struct svga_context *svga,
    struct svga_3d_update_gb_image *whole_update_cmd = NULL;
    uint32 numBoxes = sbuf->map.num_ranges;
    struct pipe_resource *dummy;
-   unsigned int i;
+   unsigned i;
 
    assert(numBoxes);
    assert(sbuf->dma.updates == NULL);
@@ -308,6 +308,7 @@ svga_buffer_upload_gb_command(struct svga_context *svga,
    pipe_resource_reference(&dummy, &sbuf->b.b);
    SVGA_FIFOCommitAll(swc);
 
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    sbuf->dma.flags.discard = FALSE;
 
    return PIPE_OK;
@@ -381,6 +382,7 @@ svga_buffer_upload_command(struct svga_context *svga,
 
    SVGA_FIFOCommitAll(swc);
 
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    sbuf->dma.flags.discard = FALSE;
 
    return PIPE_OK;
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index fca501bc47d..0f41e4ea254 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -342,6 +342,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
    case PIPE_CAP_SAMPLER_VIEW_TARGET:
@@ -349,6 +351,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_POLYGON_OFFSET_CLAMP:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
    case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
       return 64;
@@ -384,6 +388,9 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 0;
    }
 
@@ -457,6 +464,7 @@ vgpu9_get_shader_param(struct pipe_screen *screen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+      case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
@@ -515,6 +523,7 @@ vgpu9_get_shader_param(struct pipe_screen *screen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+      case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
@@ -606,6 +615,7 @@ vgpu10_get_shader_param(struct pipe_screen *screen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
diff --git a/src/gallium/drivers/svga/svga_state_constants.c b/src/gallium/drivers/svga/svga_state_constants.c
index 2cf41134bd6..8ab1693088a 100644
--- a/src/gallium/drivers/svga/svga_state_constants.c
+++ b/src/gallium/drivers/svga/svga_state_constants.c
@@ -613,7 +613,8 @@ emit_constbuf_vgpu10(struct svga_context *svga, unsigned shader)
     */
    new_buf_size = align(new_buf_size, 16);
 
-   u_upload_alloc(svga->const0_upload, 0, new_buf_size, &offset,
+   u_upload_alloc(svga->const0_upload, 0, new_buf_size,
+                  CONST0_UPLOAD_ALIGNMENT, &offset,
                   &dst_buffer, &dst_map);
    if (!dst_map) {
       if (src_map)
diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c
index e392778c2fb..bac91669be1 100644
--- a/src/gallium/drivers/svga/svga_state_fs.c
+++ b/src/gallium/drivers/svga/svga_state_fs.c
@@ -452,6 +452,7 @@ struct svga_tracked_state svga_hw_fs =
     SVGA_NEW_TEXTURE_BINDING |
     SVGA_NEW_NEED_SWTNL |
     SVGA_NEW_RAST |
+    SVGA_NEW_STIPPLE |
     SVGA_NEW_REDUCED_PRIMITIVE |
     SVGA_NEW_SAMPLER |
     SVGA_NEW_FRAME_BUFFER |
diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c
index 24574c1bf85..a103dab25fe 100644
--- a/src/gallium/drivers/svga/svga_state_vs.c
+++ b/src/gallium/drivers/svga/svga_state_vs.c
@@ -173,8 +173,11 @@ make_vs_key(struct svga_context *svga, struct svga_compile_key *key)
       return;
    }
 
+   /* SVGA_NEW_PRESCALE */
    key->vs.need_prescale = svga->state.hw_clear.prescale.enabled &&
                            (svga->curr.gs == NULL);
+
+   /* SVGA_NEW_RAST */
    key->vs.allow_psiz = svga->curr.rast->templ.point_size_per_vertex;
 
    /* SVGA_NEW_FS */
diff --git a/src/gallium/drivers/svga/svga_swtnl_state.c b/src/gallium/drivers/svga/svga_swtnl_state.c
index 79dc0bf580c..4d21f4f0e60 100644
--- a/src/gallium/drivers/svga/svga_swtnl_state.c
+++ b/src/gallium/drivers/svga/svga_swtnl_state.c
@@ -220,8 +220,6 @@ svga_swtnl_update_vdecl( struct svga_context *svga )
    struct draw_context *draw = svga->swtnl.draw;
    struct vertex_info *vinfo = &svga_render->vertex_info;
    SVGA3dVertexDecl vdecl[PIPE_MAX_ATTRIBS];
-   const enum interp_mode colorInterp =
-      svga->curr.rast->templ.flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
    struct svga_fragment_shader *fs = svga->curr.fs;
    int offset = 0;
    int nr_decls = 0;
@@ -236,7 +234,7 @@ svga_swtnl_update_vdecl( struct svga_context *svga )
 
    /* always add position */
    src = draw_find_shader_output(draw, TGSI_SEMANTIC_POSITION, 0);
-   draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR, src);
+   draw_emit_vertex_attr(vinfo, EMIT_4F, src);
    vinfo->attrib[0].emit = EMIT_4F;
    vdecl[0].array.offset = offset;
    vdecl[0].identity.method = SVGA3D_DECLMETHOD_DEFAULT;
@@ -257,14 +255,14 @@ svga_swtnl_update_vdecl( struct svga_context *svga )
 
       switch (sem_name) {
       case TGSI_SEMANTIC_COLOR:
-         draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, src);
          vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_COLOR;
          vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT4;
          offset += 16;
          nr_decls++;
          break;
       case TGSI_SEMANTIC_GENERIC:
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, src);
          vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_TEXCOORD;
          vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT4;
          vdecl[nr_decls].identity.usageIndex =
@@ -273,7 +271,7 @@ svga_swtnl_update_vdecl( struct svga_context *svga )
          nr_decls++;
          break;
       case TGSI_SEMANTIC_FOG:
-         draw_emit_vertex_attr(vinfo, EMIT_1F, INTERP_PERSPECTIVE, src);
+         draw_emit_vertex_attr(vinfo, EMIT_1F, src);
          vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_TEXCOORD;
          vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT1;
          assert(vdecl[nr_decls].identity.usageIndex == 0);
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index dbb90f7654e..489e68f88e8 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -166,7 +166,7 @@ scalar(struct src_register src, unsigned comp)
 static boolean
 svga_arl_needs_adjustment( const struct svga_shader_emitter *emit )
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < emit->num_arl_consts; ++i) {
       if (emit->arl_consts[i].arl_num == emit->current_arl)
@@ -179,7 +179,7 @@ svga_arl_needs_adjustment( const struct svga_shader_emitter *emit )
 static int
 svga_arl_adjustment( const struct svga_shader_emitter *emit )
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < emit->num_arl_consts; ++i) {
       if (emit->arl_consts[i].arl_num == emit->current_arl)
@@ -1175,7 +1175,7 @@ emit_div(struct svga_shader_emitter *emit,
    const struct src_register src1 =
       translate_src_register(emit, &insn->Src[1] );
    SVGA3dShaderDestToken temp = get_temp( emit );
-   int i;
+   unsigned i;
 
    /* For each enabled element, perform a RCP instruction.  Note that
     * RCP is scalar in SVGA3D:
@@ -1822,7 +1822,7 @@ emit_tex_swizzle(struct svga_shader_emitter *emit,
    const unsigned swizzleIn[4] = {swizzle_x, swizzle_y, swizzle_z, swizzle_w};
    unsigned srcSwizzle[4];
    unsigned srcWritemask = 0x0, zeroWritemask = 0x0, oneWritemask = 0x0;
-   int i;
+   unsigned i;
 
    /* build writemasks and srcSwizzle terms */
    for (i = 0; i < 4; i++) {
@@ -3371,7 +3371,7 @@ emit_light_twoside(struct svga_shader_emitter *emit)
    struct src_register back[2];
    SVGA3dShaderDestToken color[2];
    int count = emit->internal_color_count;
-   int i;
+   unsigned i;
    SVGA3dShaderInstToken if_token;
 
    if (count == 0)
@@ -3698,7 +3698,7 @@ static boolean
 pre_parse_add_indirect( struct svga_shader_emitter *emit,
                         int num, int current_arl)
 {
-   int i;
+   unsigned i;
    assert(num < 0);
 
    for (i = 0; i < emit->num_arl_consts; ++i) {
@@ -3844,7 +3844,8 @@ svga_shader_emit_instructions(struct svga_shader_emitter *emit,
    if (emit->unit == PIPE_SHADER_FRAGMENT && emit->key.fs.pstipple) {
       unsigned unit;
 
-      new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0);
+      new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0,
+                                                        TGSI_FILE_INPUT);
 
       if (new_tokens) {
          /* Setup texture state for stipple */
diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
index c979f4a8a56..1223e446055 100644
--- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
@@ -2298,11 +2298,13 @@ emit_vgpu10_declaration(struct svga_shader_emitter_v10 *emit,
       emit->num_samplers = MAX2(emit->num_samplers, decl->Range.Last + 1);
       return TRUE;
 
+#if 0
    case TGSI_FILE_RESOURCE:
       /*opcode0.opcodeType = VGPU10_OPCODE_DCL_RESOURCE;*/
       /* XXX more, VGPU10_RETURN_TYPE_FLOAT */
       assert(!"TGSI_FILE_RESOURCE not handled yet");
       return FALSE;
+#endif
 
    case TGSI_FILE_ADDRESS:
       emit->num_address_regs = MAX2(emit->num_address_regs,
@@ -6170,6 +6172,11 @@ emit_vertex_attrib_instructions(struct svga_shader_emitter_v10 *emit)
 
       while (adjust_mask) {
          unsigned index = u_bit_scan(&adjust_mask);
+
+         /* skip the instruction if this vertex attribute is not being used */
+         if (emit->info.input_usage_mask[index] == 0)
+            continue;
+
          unsigned tmp = emit->vs.adjusted_input[index];
          struct tgsi_full_src_register input_src =
             make_src_reg(TGSI_FILE_INPUT, index);
@@ -6604,7 +6611,8 @@ transform_fs_pstipple(struct svga_shader_emitter_v10 *emit,
       tgsi_dump(tokens,0);
    }
 
-   new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0);
+   new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0,
+                                                     TGSI_FILE_INPUT);
 
    emit->fs.pstipple_sampler_unit = unit;
 
diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h
index 3129e46ed06..562c6690fc1 100644
--- a/src/gallium/drivers/svga/svga_winsys.h
+++ b/src/gallium/drivers/svga/svga_winsys.h
@@ -85,7 +85,7 @@ struct winsys_handle;
 #define SVGA_QUERY_FLAG_SET        (1 << 0)
 #define SVGA_QUERY_FLAG_REF        (1 << 1)
 
-#define SVGA_HINT_FLAG_DRAW_EMITTED (1 << 0)
+#define SVGA_HINT_FLAG_CAN_PRE_FLUSH (1 << 0)  /* Can preemptively flush */
 
 /** Opaque surface handle */
 struct svga_winsys_surface;
diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index 312b006f96e..a0888f23265 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -254,8 +254,9 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
         if (!vc4->primconvert)
                 goto fail;
 
-        vc4->uploader = u_upload_create(pctx, 16 * 1024, 4,
-                                        PIPE_BIND_INDEX_BUFFER);
+        vc4->uploader = u_upload_create(pctx, 16 * 1024,
+                                        PIPE_BIND_INDEX_BUFFER,
+                                        PIPE_USAGE_STREAM);
 
         vc4_debug |= saved_shaderdb_flag;
 
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index c00855698b8..9b0b540d3fc 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -319,7 +319,7 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                         if (vc4->indexbuf.user_buffer) {
                                 prsc = NULL;
                                 u_upload_data(vc4->uploader, 0,
-                                              info->count * index_size,
+                                              info->count * index_size, 4,
                                               vc4->indexbuf.user_buffer,
                                               &offset, &prsc);
                         } else {
diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
index aea2b9dbe87..b8ce377ff6b 100644
--- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c
+++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
@@ -145,43 +145,6 @@ qir_opt_algebraic(struct vc4_compile *c)
 
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 switch (inst->op) {
-                case QOP_SEL_X_Y_ZS:
-                case QOP_SEL_X_Y_ZC:
-                case QOP_SEL_X_Y_NS:
-                case QOP_SEL_X_Y_NC:
-                case QOP_SEL_X_Y_CS:
-                case QOP_SEL_X_Y_CC:
-                        if (is_zero(c, inst->src[1])) {
-                                /* Replace references to a 0 uniform value
-                                 * with the SEL_X_0 equivalent.
-                                 */
-                                dump_from(c, inst);
-                                inst->op -= (QOP_SEL_X_Y_ZS - QOP_SEL_X_0_ZS);
-                                inst->src[1] = c->undef;
-                                progress = true;
-                                dump_to(c, inst);
-                                break;
-                        }
-
-                        if (is_zero(c, inst->src[0])) {
-                                /* Replace references to a 0 uniform value
-                                 * with the SEL_X_0 equivalent, flipping the
-                                 * condition being evaluated since the operand
-                                 * order is flipped.
-                                 */
-                                dump_from(c, inst);
-                                inst->op -= QOP_SEL_X_Y_ZS;
-                                inst->op ^= 1;
-                                inst->op += QOP_SEL_X_0_ZS;
-                                inst->src[0] = inst->src[1];
-                                inst->src[1] = c->undef;
-                                progress = true;
-                                dump_to(c, inst);
-                                break;
-                        }
-
-                        break;
-
                 case QOP_FMIN:
                         if (is_1f(c, inst->src[1]) &&
                             inst->src[0].pack >= QPU_UNPACK_8D_REP &&
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index da0d21111a0..3e402d048ba 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -89,7 +89,7 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
                 range->dst_offset = c->next_ubo_dst_offset;
                 c->next_ubo_dst_offset += range->size;
                 c->num_ubo_ranges++;
-        };
+        }
 
         offset -= range->src_offset;
 
@@ -204,27 +204,6 @@ ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr,
         return r;
 };
 
-static struct qreg
-get_swizzled_channel(struct vc4_compile *c,
-                     struct qreg *srcs, int swiz)
-{
-        switch (swiz) {
-        default:
-        case UTIL_FORMAT_SWIZZLE_NONE:
-                fprintf(stderr, "warning: unknown swizzle\n");
-                /* FALLTHROUGH */
-        case UTIL_FORMAT_SWIZZLE_0:
-                return qir_uniform_f(c, 0.0);
-        case UTIL_FORMAT_SWIZZLE_1:
-                return qir_uniform_f(c, 1.0);
-        case UTIL_FORMAT_SWIZZLE_X:
-        case UTIL_FORMAT_SWIZZLE_Y:
-        case UTIL_FORMAT_SWIZZLE_Z:
-        case UTIL_FORMAT_SWIZZLE_W:
-                return srcs[swiz];
-        }
-}
-
 static inline struct qreg
 qir_SAT(struct vc4_compile *c, struct qreg val)
 {
@@ -275,7 +254,7 @@ qir_srgb_decode(struct vc4_compile *c, struct qreg srgb)
                                    qir_uniform_f(c, 2.4));
 
         qir_SF(c, qir_FSUB(c, srgb, qir_uniform_f(c, 0.04045)));
-        return qir_SEL_X_Y_NS(c, low, high);
+        return qir_SEL(c, QPU_COND_NS, low, high);
 }
 
 static struct qreg
@@ -338,30 +317,20 @@ ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
         struct qreg tex = qir_TEX_RESULT(c);
         c->num_texture_samples++;
 
-        struct qreg texture_output[4];
+        struct qreg *dest = ntq_get_dest(c, &instr->dest);
         enum pipe_format format = c->key->tex[unit].format;
         if (util_format_is_depth_or_stencil(format)) {
                 struct qreg scaled = ntq_scale_depth_texture(c, tex);
                 for (int i = 0; i < 4; i++)
-                        texture_output[i] = scaled;
+                        dest[i] = scaled;
         } else {
-                struct qreg tex_result_unpacked[4];
                 for (int i = 0; i < 4; i++)
-                        tex_result_unpacked[i] = qir_UNPACK_8_F(c, tex, i);
-
-                const uint8_t *format_swiz =
-                        vc4_get_format_swizzle(c->key->tex[unit].format);
-                for (int i = 0; i < 4; i++) {
-                        texture_output[i] =
-                                get_swizzled_channel(c, tex_result_unpacked,
-                                                     format_swiz[i]);
-                }
+                        dest[i] = qir_UNPACK_8_F(c, tex, i);
         }
 
-        struct qreg *dest = ntq_get_dest(c, &instr->dest);
         for (int i = 0; i < 4; i++) {
-                dest[i] = get_swizzled_channel(c, texture_output,
-                                               c->key->tex[unit].swizzle[i]);
+                if (c->tex_srgb_decode[unit] & (1 << i))
+                        dest[i] = qir_srgb_decode(c, dest[i]);
         }
 }
 
@@ -470,12 +439,13 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
 
         enum pipe_format format = c->key->tex[unit].format;
 
-        struct qreg unpacked[4];
+        struct qreg *dest = ntq_get_dest(c, &instr->dest);
         if (util_format_is_depth_or_stencil(format)) {
                 struct qreg normalized = ntq_scale_depth_texture(c, tex);
                 struct qreg depth_output;
 
-                struct qreg one = qir_uniform_f(c, 1.0f);
+                struct qreg u0 = qir_uniform_f(c, 0.0f);
+                struct qreg u1 = qir_uniform_f(c, 1.0f);
                 if (c->key->tex[unit].compare_mode) {
                         if (has_proj)
                                 compare = qir_FMUL(c, compare, proj);
@@ -485,31 +455,31 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
                                 depth_output = qir_uniform_f(c, 0.0f);
                                 break;
                         case PIPE_FUNC_ALWAYS:
-                                depth_output = one;
+                                depth_output = u1;
                                 break;
                         case PIPE_FUNC_EQUAL:
                                 qir_SF(c, qir_FSUB(c, compare, normalized));
-                                depth_output = qir_SEL_X_0_ZS(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_ZS, u1, u0);
                                 break;
                         case PIPE_FUNC_NOTEQUAL:
                                 qir_SF(c, qir_FSUB(c, compare, normalized));
-                                depth_output = qir_SEL_X_0_ZC(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_ZC, u1, u0);
                                 break;
                         case PIPE_FUNC_GREATER:
                                 qir_SF(c, qir_FSUB(c, compare, normalized));
-                                depth_output = qir_SEL_X_0_NC(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
                                 break;
                         case PIPE_FUNC_GEQUAL:
                                 qir_SF(c, qir_FSUB(c, normalized, compare));
-                                depth_output = qir_SEL_X_0_NS(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
                                 break;
                         case PIPE_FUNC_LESS:
                                 qir_SF(c, qir_FSUB(c, compare, normalized));
-                                depth_output = qir_SEL_X_0_NS(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
                                 break;
                         case PIPE_FUNC_LEQUAL:
                                 qir_SF(c, qir_FSUB(c, normalized, compare));
-                                depth_output = qir_SEL_X_0_NC(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
                                 break;
                         }
                 } else {
@@ -517,29 +487,15 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
                 }
 
                 for (int i = 0; i < 4; i++)
-                        unpacked[i] = depth_output;
+                        dest[i] = depth_output;
         } else {
                 for (int i = 0; i < 4; i++)
-                        unpacked[i] = qir_UNPACK_8_F(c, tex, i);
-        }
-
-        const uint8_t *format_swiz = vc4_get_format_swizzle(format);
-        struct qreg texture_output[4];
-        for (int i = 0; i < 4; i++) {
-                texture_output[i] = get_swizzled_channel(c, unpacked,
-                                                         format_swiz[i]);
-        }
-
-        if (util_format_is_srgb(format)) {
-                for (int i = 0; i < 3; i++)
-                        texture_output[i] = qir_srgb_decode(c,
-                                                            texture_output[i]);
+                        dest[i] = qir_UNPACK_8_F(c, tex, i);
         }
 
-        struct qreg *dest = ntq_get_dest(c, &instr->dest);
         for (int i = 0; i < 4; i++) {
-                dest[i] = get_swizzled_channel(c, texture_output,
-                                               c->key->tex[unit].swizzle[i]);
+                if (c->tex_srgb_decode[unit] & (1 << i))
+                        dest[i] = qir_srgb_decode(c, dest[i]);
         }
 }
 
@@ -553,9 +509,8 @@ ntq_ffract(struct vc4_compile *c, struct qreg src)
         struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
         struct qreg diff = qir_FSUB(c, src, trunc);
         qir_SF(c, diff);
-        return qir_SEL_X_Y_NS(c,
-                              qir_FADD(c, diff, qir_uniform_f(c, 1.0)),
-                              diff);
+        return qir_SEL(c, QPU_COND_NS,
+                       qir_FADD(c, diff, qir_uniform_f(c, 1.0)), diff);
 }
 
 /**
@@ -572,9 +527,8 @@ ntq_ffloor(struct vc4_compile *c, struct qreg src)
          */
         qir_SF(c, qir_FSUB(c, src, trunc));
 
-        return qir_SEL_X_Y_NS(c,
-                              qir_FSUB(c, trunc, qir_uniform_f(c, 1.0)),
-                              trunc);
+        return qir_SEL(c, QPU_COND_NS,
+                       qir_FSUB(c, trunc, qir_uniform_f(c, 1.0)), trunc);
 }
 
 /**
@@ -591,9 +545,8 @@ ntq_fceil(struct vc4_compile *c, struct qreg src)
          */
         qir_SF(c, qir_FSUB(c, trunc, src));
 
-        return qir_SEL_X_Y_NS(c,
-                              qir_FADD(c, trunc, qir_uniform_f(c, 1.0)),
-                              trunc);
+        return qir_SEL(c, QPU_COND_NS,
+                       qir_FADD(c, trunc, qir_uniform_f(c, 1.0)), trunc);
 }
 
 static struct qreg
@@ -668,10 +621,13 @@ ntq_fcos(struct vc4_compile *c, struct qreg src)
 static struct qreg
 ntq_fsign(struct vc4_compile *c, struct qreg src)
 {
+        struct qreg t = qir_get_temp(c);
+
         qir_SF(c, src);
-        return qir_SEL_X_Y_NC(c,
-                              qir_SEL_X_0_ZC(c, qir_uniform_f(c, 1.0)),
-                              qir_uniform_f(c, -1.0));
+        qir_MOV_dest(c, t, qir_uniform_f(c, 0.0));
+        qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC;
+        qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS;
+        return t;
 }
 
 static void
@@ -888,6 +844,100 @@ ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
         return qir_UNPACK_8_I(c, base, offset_bit / 8);
 }
 
+/**
+ * If compare_instr is a valid comparison instruction, emits the
+ * compare_instr's comparison and returns the sel_instr's return value based
+ * on the compare_instr's result.
+ */
+static bool
+ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
+                    nir_alu_instr *compare_instr,
+                    nir_alu_instr *sel_instr)
+{
+        enum qpu_cond cond;
+
+        switch (compare_instr->op) {
+        case nir_op_feq:
+        case nir_op_ieq:
+        case nir_op_seq:
+                cond = QPU_COND_ZS;
+                break;
+        case nir_op_fne:
+        case nir_op_ine:
+        case nir_op_sne:
+                cond = QPU_COND_ZC;
+                break;
+        case nir_op_fge:
+        case nir_op_ige:
+        case nir_op_uge:
+        case nir_op_sge:
+                cond = QPU_COND_NC;
+                break;
+        case nir_op_flt:
+        case nir_op_ilt:
+        case nir_op_slt:
+                cond = QPU_COND_NS;
+                break;
+        default:
+                return false;
+        }
+
+        struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
+        struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);
+
+        if (nir_op_infos[compare_instr->op].input_types[0] == nir_type_float)
+                qir_SF(c, qir_FSUB(c, src0, src1));
+        else
+                qir_SF(c, qir_SUB(c, src0, src1));
+
+        switch (sel_instr->op) {
+        case nir_op_seq:
+        case nir_op_sne:
+        case nir_op_sge:
+        case nir_op_slt:
+                *dest = qir_SEL(c, cond,
+                                qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0));
+                break;
+
+        case nir_op_bcsel:
+                *dest = qir_SEL(c, cond,
+                                ntq_get_alu_src(c, sel_instr, 1),
+                                ntq_get_alu_src(c, sel_instr, 2));
+                break;
+
+        default:
+                *dest = qir_SEL(c, cond,
+                                qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0));
+                break;
+        }
+
+        return true;
+}
+
+/**
+ * Attempts to fold a comparison generating a boolean result into the
+ * condition code for selecting between two values, instead of comparing the
+ * boolean result against 0 to generate the condition code.
+ */
+static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr,
+                                  struct qreg *src)
+{
+        if (!instr->src[0].src.is_ssa)
+                goto out;
+        nir_alu_instr *compare =
+                nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+        if (!compare)
+                goto out;
+
+        struct qreg dest;
+        if (ntq_emit_comparison(c, &dest, compare, instr))
+                return dest;
+
+out:
+        qir_SF(c, src[0]);
+        return qir_SEL(c, QPU_COND_NS, src[1], src[2]);
+}
+
 static void
 ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
 {
@@ -974,7 +1024,9 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
         case nir_op_i2b:
         case nir_op_f2b:
                 qir_SF(c, src[0]);
-                *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0));
+                *dest = qir_SEL(c, QPU_COND_ZC,
+                                qir_uniform_ui(c, ~0),
+                                qir_uniform_ui(c, 0));
                 break;
 
         case nir_op_iadd:
@@ -1016,65 +1068,29 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
                 break;
 
         case nir_op_seq:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZS(c, qir_uniform_f(c, 1.0));
-                break;
         case nir_op_sne:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZC(c, qir_uniform_f(c, 1.0));
-                break;
         case nir_op_sge:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NC(c, qir_uniform_f(c, 1.0));
-                break;
         case nir_op_slt:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NS(c, qir_uniform_f(c, 1.0));
-                break;
         case nir_op_feq:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZS(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_fne:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_fge:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_flt:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NS(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_ieq:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZS(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_ine:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_ige:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_uge:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_CC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_ilt:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NS(c, qir_uniform_ui(c, ~0));
+                if (!ntq_emit_comparison(c, dest, instr, instr)) {
+                        fprintf(stderr, "Bad comparison instruction\n");
+                }
                 break;
 
         case nir_op_bcsel:
-                qir_SF(c, src[0]);
-                *dest = qir_SEL_X_Y_NS(c, src[1], src[2]);
+                *dest = ntq_emit_bcsel(c, instr, src);
                 break;
         case nir_op_fcsel:
                 qir_SF(c, src[0]);
-                *dest = qir_SEL_X_Y_ZC(c, src[1], src[2]);
+                *dest = qir_SEL(c, QPU_COND_ZC, src[1], src[2]);
                 break;
 
         case nir_op_frcp:
@@ -1789,6 +1805,56 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
         if (stage == QSTAGE_FRAG)
                 vc4_nir_lower_blend(c);
 
+        struct nir_lower_tex_options tex_options = {
+                /* We would need to implement txs, but we don't want the
+                 * int/float conversions
+                 */
+                .lower_rect = false,
+
+                /* We want to use this, but we don't want to newton-raphson
+                 * its rcp.
+                 */
+                .lower_txp = false,
+
+                /* Apply swizzles to all samplers. */
+                .swizzle_result = ~0,
+        };
+
+        /* Lower the format swizzle and ARB_texture_swizzle-style swizzle.
+         * The format swizzling applies before sRGB decode, and
+         * ARB_texture_swizzle is the last thing before returning the sample.
+         */
+        for (int i = 0; i < ARRAY_SIZE(key->tex); i++) {
+                enum pipe_format format = c->key->tex[i].format;
+
+                if (!format)
+                        continue;
+
+                const uint8_t *format_swizzle = vc4_get_format_swizzle(format);
+
+                for (int j = 0; j < 4; j++) {
+                        uint8_t arb_swiz = c->key->tex[i].swizzle[j];
+
+                        if (arb_swiz <= 3) {
+                                tex_options.swizzles[i][j] =
+                                        format_swizzle[arb_swiz];
+                        } else {
+                                tex_options.swizzles[i][j] = arb_swiz;
+                        }
+
+                        /* If ARB_texture_swizzle is reading from the R, G, or
+                         * B channels of an sRGB texture, then we need to
+                         * apply sRGB decode to this channel at sample time.
+                         */
+                        if (arb_swiz < 3 && util_format_is_srgb(format)) {
+                                c->tex_srgb_decode[i] |= (1 << j);
+                        }
+
+                }
+        }
+
+        nir_lower_tex(c->s, &tex_options);
+
         if (c->fs_key && c->fs_key->light_twoside)
                 nir_lower_two_sided_color(c->s);
 
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index c6916c48e7e..efbb69b71a7 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -65,19 +65,6 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_XOR] = { "xor", 1, 2 },
         [QOP_NOT] = { "not", 1, 1 },
 
-        [QOP_SEL_X_0_NS] = { "fsel_x_0_ns", 1, 1, false, true },
-        [QOP_SEL_X_0_NC] = { "fsel_x_0_nc", 1, 1, false, true },
-        [QOP_SEL_X_0_ZS] = { "fsel_x_0_zs", 1, 1, false, true },
-        [QOP_SEL_X_0_ZC] = { "fsel_x_0_zc", 1, 1, false, true },
-        [QOP_SEL_X_0_CS] = { "fsel_x_0_cs", 1, 1, false, true },
-        [QOP_SEL_X_0_CC] = { "fsel_x_0_cc", 1, 1, false, true },
-        [QOP_SEL_X_Y_NS] = { "fsel_x_y_ns", 1, 2, false, true },
-        [QOP_SEL_X_Y_NC] = { "fsel_x_y_nc", 1, 2, false, true },
-        [QOP_SEL_X_Y_ZS] = { "fsel_x_y_zs", 1, 2, false, true },
-        [QOP_SEL_X_Y_ZC] = { "fsel_x_y_zc", 1, 2, false, true },
-        [QOP_SEL_X_Y_CS] = { "fsel_x_y_cs", 1, 2, false, true },
-        [QOP_SEL_X_Y_CC] = { "fsel_x_y_cc", 1, 2, false, true },
-
         [QOP_RCP] = { "rcp", 1, 1, false, true },
         [QOP_RSQ] = { "rsq", 1, 1, false, true },
         [QOP_EXP2] = { "exp2", 1, 2, false, true },
@@ -219,23 +206,8 @@ qir_is_tex(struct qinst *inst)
 bool
 qir_depends_on_flags(struct qinst *inst)
 {
-        switch (inst->op) {
-        case QOP_SEL_X_0_NS:
-        case QOP_SEL_X_0_NC:
-        case QOP_SEL_X_0_ZS:
-        case QOP_SEL_X_0_ZC:
-        case QOP_SEL_X_0_CS:
-        case QOP_SEL_X_0_CC:
-        case QOP_SEL_X_Y_NS:
-        case QOP_SEL_X_Y_NC:
-        case QOP_SEL_X_Y_ZS:
-        case QOP_SEL_X_Y_ZC:
-        case QOP_SEL_X_Y_CS:
-        case QOP_SEL_X_Y_CC:
-                return true;
-        default:
-                return false;
-        }
+        return (inst->cond != QPU_COND_ALWAYS &&
+                inst->cond != QPU_COND_NEVER);
 }
 
 bool
@@ -292,8 +264,19 @@ qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write)
 void
 qir_dump_inst(struct vc4_compile *c, struct qinst *inst)
 {
-        fprintf(stderr, "%s%s ",
+        static const char *conditions[] = {
+                [QPU_COND_ALWAYS] = "",
+                [QPU_COND_NEVER] = ".never",
+                [QPU_COND_ZS] = ".zs",
+                [QPU_COND_ZC] = ".zc",
+                [QPU_COND_NS] = ".ns",
+                [QPU_COND_NC] = ".nc",
+                [QPU_COND_CS] = ".cs",
+                [QPU_COND_CC] = ".cc",
+        };
+        fprintf(stderr, "%s%s%s ",
                 qir_get_op_name(inst->op),
+                conditions[inst->cond],
                 inst->sf ? ".sf" : "");
 
         qir_print_reg(c, inst->dst, true);
@@ -352,6 +335,7 @@ qir_inst(enum qop op, struct qreg dst, struct qreg src0, struct qreg src1)
         inst->src = calloc(2, sizeof(inst->src[0]));
         inst->src[0] = src0;
         inst->src[1] = src1;
+        inst->cond = QPU_COND_ALWAYS;
 
         return inst;
 }
@@ -503,9 +487,9 @@ qir_SF(struct vc4_compile *c, struct qreg src)
         if (!list_empty(&c->instructions))
                 last_inst = (struct qinst *)c->instructions.prev;
 
-        if (!last_inst ||
-            last_inst->dst.file != src.file ||
-            last_inst->dst.index != src.index ||
+        if (src.file != QFILE_TEMP ||
+            !c->defs[src.index] ||
+            last_inst != c->defs[src.index] ||
             qir_is_multi_instruction(last_inst)) {
                 src = qir_MOV(c, src);
                 last_inst = (struct qinst *)c->instructions.prev;
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index b0fbb4c1db2..4ab4d35d0ca 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -93,23 +93,6 @@ enum qop {
         QOP_XOR,
         QOP_NOT,
 
-        /* Note: Orderings of these compares must be the same as in
-         * qpu_defines.h.  Selects the src[0] if the ns flag bit is set,
-         * otherwise 0. */
-        QOP_SEL_X_0_ZS,
-        QOP_SEL_X_0_ZC,
-        QOP_SEL_X_0_NS,
-        QOP_SEL_X_0_NC,
-        QOP_SEL_X_0_CS,
-        QOP_SEL_X_0_CC,
-        /* Selects the src[0] if the ns flag bit is set, otherwise src[1]. */
-        QOP_SEL_X_Y_ZS,
-        QOP_SEL_X_Y_ZC,
-        QOP_SEL_X_Y_NS,
-        QOP_SEL_X_Y_NC,
-        QOP_SEL_X_Y_CS,
-        QOP_SEL_X_Y_CC,
-
         QOP_FTOI,
         QOP_ITOF,
         QOP_RCP,
@@ -170,6 +153,7 @@ struct qinst {
         struct qreg dst;
         struct qreg *src;
         bool sf;
+        uint8_t cond;
 };
 
 enum qstage {
@@ -385,6 +369,11 @@ struct vc4_compile {
 
         uint8_t vattr_sizes[8];
 
+        /* Bitfield for whether a given channel of a sampler needs sRGB
+         * decode.
+         */
+        uint8_t tex_srgb_decode[VC4_MAX_TEXTURE_SAMPLERS];
+
         /**
          * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
          *
@@ -463,9 +452,11 @@ void qir_schedule_instructions(struct vc4_compile *c);
 void qir_reorder_uniforms(struct vc4_compile *c);
 
 void qir_emit(struct vc4_compile *c, struct qinst *inst);
-static inline void qir_emit_nodef(struct vc4_compile *c, struct qinst *inst)
+static inline struct qinst *
+qir_emit_nodef(struct vc4_compile *c, struct qinst *inst)
 {
         list_addtail(&inst->link, &c->instructions);
+        return inst;
 }
 
 struct qreg qir_get_temp(struct vc4_compile *c);
@@ -536,11 +527,12 @@ qir_##name(struct vc4_compile *c, struct qreg a)                         \
         qir_emit(c, qir_inst(QOP_##name, t, a, c->undef));               \
         return t;                                                        \
 }                                                                        \
-static inline void                                                       \
+static inline struct qinst *                                             \
 qir_##name##_dest(struct vc4_compile *c, struct qreg dest,               \
                   struct qreg a)                                         \
 {                                                                        \
-        qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, c->undef));      \
+        return qir_emit_nodef(c, qir_inst(QOP_##name, dest, a,           \
+                                          c->undef));                    \
 }
 
 #define QIR_ALU2(name)                                                   \
@@ -592,18 +584,6 @@ QIR_ALU2(V8MAX)
 QIR_ALU2(V8ADDS)
 QIR_ALU2(V8SUBS)
 QIR_ALU2(MUL24)
-QIR_ALU1(SEL_X_0_ZS)
-QIR_ALU1(SEL_X_0_ZC)
-QIR_ALU1(SEL_X_0_NS)
-QIR_ALU1(SEL_X_0_NC)
-QIR_ALU1(SEL_X_0_CS)
-QIR_ALU1(SEL_X_0_CC)
-QIR_ALU2(SEL_X_Y_ZS)
-QIR_ALU2(SEL_X_Y_ZC)
-QIR_ALU2(SEL_X_Y_NS)
-QIR_ALU2(SEL_X_Y_NC)
-QIR_ALU2(SEL_X_Y_CS)
-QIR_ALU2(SEL_X_Y_CC)
 QIR_ALU2(FMIN)
 QIR_ALU2(FMAX)
 QIR_ALU2(FMINABS)
@@ -648,6 +628,17 @@ QIR_NODST_1(TLB_STENCIL_SETUP)
 QIR_NODST_1(MS_MASK)
 
 static inline struct qreg
+qir_SEL(struct vc4_compile *c, uint8_t cond, struct qreg src0, struct qreg src1)
+{
+        struct qreg t = qir_get_temp(c);
+        struct qinst *a = qir_MOV_dest(c, t, src0);
+        struct qinst *b = qir_MOV_dest(c, t, src1);
+        a->cond = cond;
+        b->cond = cond ^ 1;
+        return t;
+}
+
+static inline struct qreg
 qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i)
 {
         struct qreg t = qir_FMOV(c, src);
diff --git a/src/gallium/drivers/vc4/vc4_qir_schedule.c b/src/gallium/drivers/vc4/vc4_qir_schedule.c
index d20815f055e..2f280c54523 100644
--- a/src/gallium/drivers/vc4/vc4_qir_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qir_schedule.c
@@ -250,12 +250,11 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
         else if (inst->dst.file == QFILE_TEMP)
                 add_write_dep(dir, &state->last_temp_write[inst->dst.index], n);
 
+        if (qir_depends_on_flags(inst))
+                add_dep(dir, state->last_sf, n);
+
         if (inst->sf)
                 add_write_dep(dir, &state->last_sf, n);
-
-        if (qir_depends_on_flags(inst)) {
-                add_dep(dir, state->last_sf, n);
-        }
 }
 
 static void
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index cb4e0cfcc3f..b06702afea2 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -64,6 +64,12 @@ set_last_cond_add(struct vc4_compile *c, uint32_t cond)
         *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
 }
 
+static void
+set_last_cond_mul(struct vc4_compile *c, uint32_t cond)
+{
+        *last_inst(c) = qpu_set_cond_mul(*last_inst(c), cond);
+}
+
 /**
  * Some special registers can be read from either file, which lets us resolve
  * raddr conflicts without extra MOVs.
@@ -306,42 +312,9 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         break;
                 }
 
-                switch (qinst->op) {
-                case QOP_SEL_X_0_ZS:
-                case QOP_SEL_X_0_ZC:
-                case QOP_SEL_X_0_NS:
-                case QOP_SEL_X_0_NC:
-                case QOP_SEL_X_0_CS:
-                case QOP_SEL_X_0_CC:
-                        queue(c, qpu_a_MOV(dst, src[0]) | unpack);
-                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
-                                          QPU_COND_ZS);
-
-                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
-                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
-                                              1) + QPU_COND_ZS);
-                        break;
-
-                case QOP_SEL_X_Y_ZS:
-                case QOP_SEL_X_Y_ZC:
-                case QOP_SEL_X_Y_NS:
-                case QOP_SEL_X_Y_NC:
-                case QOP_SEL_X_Y_CS:
-                case QOP_SEL_X_Y_CC:
-                        queue(c, qpu_a_MOV(dst, src[0]));
-                        if (qinst->src[0].pack)
-                                *(last_inst(c)) |= unpack;
-                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
-                                          QPU_COND_ZS);
-
-                        queue(c, qpu_a_MOV(dst, src[1]));
-                        if (qinst->src[1].pack)
-                                *(last_inst(c)) |= unpack;
-                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
-                                              1) + QPU_COND_ZS);
-
-                        break;
+                bool handled_qinst_cond = true;
 
+                switch (qinst->op) {
                 case QOP_RCP:
                 case QOP_RSQ:
                 case QOP_EXP2:
@@ -497,16 +470,22 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                 queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                     dst,
                                                     src[0], src[1]) | unpack);
+                                set_last_cond_mul(c, qinst->cond);
                         } else {
                                 queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                     dst,
                                                     src[0], src[1]) | unpack);
+                                set_last_cond_add(c, qinst->cond);
                         }
+                        handled_qinst_cond = true;
                         set_last_dst_pack(c, qinst);
 
                         break;
                 }
 
+                assert(qinst->cond == QPU_COND_ALWAYS ||
+                       handled_qinst_cond);
+
                 if (qinst->sf) {
                         assert(!qir_is_multi_instruction(qinst));
                         *last_inst(c) |= QPU_SF;
diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index 9e6678a0625..036da329987 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -921,7 +921,7 @@ vc4_get_shadow_index_buffer(struct pipe_context *pctx,
 
         void *data;
         struct pipe_resource *shadow_rsc = NULL;
-        u_upload_alloc(vc4->uploader, 0, count * 2,
+        u_upload_alloc(vc4->uploader, 0, count * 2, 4,
                        shadow_offset, &shadow_rsc, &data);
         uint16_t *dst = data;
 
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 8ddf0865d21..0e289432bbe 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -100,6 +100,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_TWO_SIDED_STENCIL:
         case PIPE_CAP_USER_INDEX_BUFFERS:
         case PIPE_CAP_TEXTURE_MULTISAMPLE:
+        case PIPE_CAP_TEXTURE_SWIZZLE:
                 return 1;
 
                 /* lying for GL 2.0 */
@@ -128,7 +129,6 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
         case PIPE_CAP_CUBE_MAP_ARRAY:
         case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
-        case PIPE_CAP_TEXTURE_SWIZZLE:
         case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
         case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
         case PIPE_CAP_SEAMLESS_CUBE_MAP:
@@ -171,6 +171,8 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_MAX_TEXEL_OFFSET:
         case PIPE_CAP_MAX_VERTEX_STREAMS:
         case PIPE_CAP_DRAW_INDIRECT:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
         case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
         case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
         case PIPE_CAP_SAMPLER_VIEW_TARGET:
@@ -180,15 +182,20 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
         case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
         case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
-	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
-	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
-	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
-	case PIPE_CAP_DEPTH_BOUNDS_TEST:
-	case PIPE_CAP_TGSI_TXQS:
-	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
-	case PIPE_CAP_SHAREABLE_SHADERS:
-	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
-	case PIPE_CAP_CLEAR_TEXTURE:
+        case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+        case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+        case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+        case PIPE_CAP_DEPTH_BOUNDS_TEST:
+        case PIPE_CAP_TGSI_TXQS:
+        case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+        case PIPE_CAP_SHAREABLE_SHADERS:
+        case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+        case PIPE_CAP_CLEAR_TEXTURE:
+        case PIPE_CAP_DRAW_PARAMETERS:
+        case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+        case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+        case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+        case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
                 return 0;
 
                 /* Stream output. */
@@ -345,6 +352,8 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
                 return PIPE_SHADER_IR_TGSI;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
+        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+                return 0;
         default:
                 fprintf(stderr, "unknown shader param %d\n", param);
                 return 0;
diff --git a/src/gallium/drivers/virgl/virgl_context.c b/src/gallium/drivers/virgl/virgl_context.c
index 527f7637cb6..c322503d816 100644
--- a/src/gallium/drivers/virgl/virgl_context.c
+++ b/src/gallium/drivers/virgl/virgl_context.c
@@ -605,7 +605,7 @@ static void virgl_draw_vbo(struct pipe_context *ctx,
            ib.offset = vctx->index_buffer.offset + info.start * ib.index_size;
 
            if (ib.user_buffer) {
-                   u_upload_data(vctx->uploader, 0, info.count * ib.index_size,
+                   u_upload_data(vctx->uploader, 0, info.count * ib.index_size, 256,
                                  ib.user_buffer, &ib.offset, &ib.buffer);
                    ib.user_buffer = NULL;
            }
@@ -948,8 +948,8 @@ struct pipe_context *virgl_context_create(struct pipe_screen *pscreen,
                     16, UTIL_SLAB_SINGLETHREADED);
 
    vctx->primconvert = util_primconvert_create(&vctx->base, rs->caps.caps.v1.prim_mask);
-   vctx->uploader = u_upload_create(&vctx->base, 1024 * 1024, 256,
-                                     PIPE_BIND_INDEX_BUFFER);
+   vctx->uploader = u_upload_create(&vctx->base, 1024 * 1024,
+                                     PIPE_BIND_INDEX_BUFFER, PIPE_USAGE_STREAM);
    if (!vctx->uploader)
            goto fail;
 
diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c
index 26a4f7736e3..e8d82b37c0f 100644
--- a/src/gallium/drivers/virgl/virgl_screen.c
+++ b/src/gallium/drivers/virgl/virgl_screen.c
@@ -201,6 +201,8 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_MAX_VERTEX_STREAMS:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
    case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
@@ -219,6 +221,11 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
    case PIPE_CAP_VENDOR_ID:
       return 0x1af4;
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index cbf0ba617be..dd76fe553e4 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -635,6 +635,13 @@ enum pipe_cap
    PIPE_CAP_SHAREABLE_SHADERS,
    PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS,
    PIPE_CAP_CLEAR_TEXTURE,
+   PIPE_CAP_DRAW_PARAMETERS,
+   PIPE_CAP_TGSI_PACK_HALF_FLOAT,
+   PIPE_CAP_MULTI_DRAW_INDIRECT,
+   PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS,
+   PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL,
+   PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL,
+   PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)
@@ -701,6 +708,7 @@ enum pipe_shader_cap
    PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED,
    PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE,
    PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT,
+   PIPE_SHADER_CAP_MAX_SHADER_BUFFERS,
 };
 
 /**
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index a3137aec8db..f300207d4dd 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -76,8 +76,9 @@ enum tgsi_file_type {
    TGSI_FILE_IMMEDIATE           =7,
    TGSI_FILE_PREDICATE           =8,
    TGSI_FILE_SYSTEM_VALUE        =9,
-   TGSI_FILE_RESOURCE            =10,
+   TGSI_FILE_IMAGE               =10,
    TGSI_FILE_SAMPLER_VIEW        =11,
+   TGSI_FILE_BUFFER              =12,
    TGSI_FILE_COUNT      /**< how many TGSI_FILE_ types */
 };
 
@@ -127,7 +128,8 @@ struct tgsi_declaration
    unsigned Invariant   : 1;  /**< invariant optimization? */
    unsigned Local       : 1;  /**< optimize as subroutine local variable? */
    unsigned Array       : 1;  /**< extra array info? */
-   unsigned Padding     : 6;
+   unsigned Atomic      : 1;  /**< atomic only? for TGSI_FILE_BUFFER */
+   unsigned Padding     : 5;
 };
 
 struct tgsi_declaration_range
@@ -186,7 +188,9 @@ struct tgsi_declaration_interp
 #define TGSI_SEMANTIC_TESSINNER  33 /**< inner tessellation levels */
 #define TGSI_SEMANTIC_VERTICESIN 34 /**< number of input vertices */
 #define TGSI_SEMANTIC_HELPER_INVOCATION 35 /**< current invocation is helper */
-#define TGSI_SEMANTIC_COUNT      36 /**< number of semantic values */
+#define TGSI_SEMANTIC_BASEINSTANCE 36
+#define TGSI_SEMANTIC_DRAWID     37
+#define TGSI_SEMANTIC_COUNT      38 /**< number of semantic values */
 
 struct tgsi_declaration_semantic
 {
@@ -195,11 +199,12 @@ struct tgsi_declaration_semantic
    unsigned Padding        : 8;
 };
 
-struct tgsi_declaration_resource {
+struct tgsi_declaration_image {
    unsigned Resource    : 8; /**< one of TGSI_TEXTURE_ */
    unsigned Raw         : 1;
    unsigned Writable    : 1;
-   unsigned Padding     : 22;
+   unsigned Format      : 10; /**< one of PIPE_FORMAT_ */
+   unsigned Padding     : 12;
 };
 
 enum tgsi_return_type {
@@ -406,6 +411,7 @@ struct tgsi_property_data {
 #define TGSI_OPCODE_ENDSUB              102
 #define TGSI_OPCODE_TXQ_LZ              103 /* TXQ for mipmap level 0 */
 #define TGSI_OPCODE_TXQS                104
+#define TGSI_OPCODE_RESQ                105
                                 /* gap */
 #define TGSI_OPCODE_NOP                 107
 
@@ -567,7 +573,8 @@ struct tgsi_instruction
    unsigned Predicate  : 1;  /* BOOL */
    unsigned Label      : 1;
    unsigned Texture    : 1;
-   unsigned Padding    : 2;
+   unsigned Memory     : 1;
+   unsigned Padding    : 1;
 };
 
 /*
@@ -724,6 +731,19 @@ struct tgsi_dst_register
    unsigned Padding     : 6;
 };
 
+#define TGSI_MEMORY_COHERENT (1 << 0)
+#define TGSI_MEMORY_RESTRICT (1 << 1)
+#define TGSI_MEMORY_VOLATILE (1 << 2)
+
+/**
+ * Specifies the type of memory access to do for the LOAD/STORE instruction.
+ */
+struct tgsi_instruction_memory
+{
+   unsigned Qualifier : 3;  /* TGSI_MEMORY_ */
+   unsigned Padding   : 29;
+};
+
 
 #ifdef __cplusplus
 }
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 6bdf03a8b2b..2e4d2830199 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -587,6 +587,8 @@ struct pipe_draw_info
    unsigned start_instance; /**< first instance id */
    unsigned instance_count; /**< number of instances */
 
+   unsigned drawid; /**< id of this draw in a multidraw */
+
    unsigned vertices_per_patch; /**< the number of vertices per patch */
 
    /**
@@ -618,7 +620,7 @@ struct pipe_draw_info
     */
    struct pipe_stream_output_target *count_from_stream_output;
 
-   /* Indirect parameters resource: If not NULL, most values are taken
+   /* Indirect draw parameters resource: If not NULL, most values are taken
     * from this buffer instead, which is laid out as follows:
     *
     * if indexed is TRUE:
@@ -639,6 +641,15 @@ struct pipe_draw_info
     */
    struct pipe_resource *indirect;
    unsigned indirect_offset; /**< must be 4 byte aligned */
+   unsigned indirect_stride; /**< must be 4 byte aligned */
+   unsigned indirect_count; /**< number of indirect draws */
+
+   /* Indirect draw count resource: If not NULL, contains a 32-bit value which
+    * is to be used as the real indirect_count. In that case indirect_count
+    * becomes the maximum possible value.
+    */
+   struct pipe_resource *indirect_params;
+   unsigned indirect_params_offset; /**< must be 4 byte aligned */
 };
 
 
diff --git a/src/gallium/state_trackers/nine/device9.c b/src/gallium/state_trackers/nine/device9.c
index f14ffea13e1..0be83658928 100644
--- a/src/gallium/state_trackers/nine/device9.c
+++ b/src/gallium/state_trackers/nine/device9.c
@@ -393,14 +393,15 @@ NineDevice9_ctor( struct NineDevice9 *This,
     This->driver_caps.user_cbufs = GET_PCAP(USER_CONSTANT_BUFFERS);
 
     if (!This->driver_caps.user_vbufs)
-        This->vertex_uploader = u_upload_create(This->pipe, 65536, 4, PIPE_BIND_VERTEX_BUFFER);
+        This->vertex_uploader = u_upload_create(This->pipe, 65536,
+                                                PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_STREAM);
     if (!This->driver_caps.user_ibufs)
-        This->index_uploader = u_upload_create(This->pipe, 128 * 1024, 4, PIPE_BIND_INDEX_BUFFER);
+        This->index_uploader = u_upload_create(This->pipe, 128 * 1024,
+                                               PIPE_BIND_INDEX_BUFFER, PIPE_USAGE_STREAM);
     if (!This->driver_caps.user_cbufs) {
-        unsigned alignment = GET_PCAP(CONSTANT_BUFFER_OFFSET_ALIGNMENT);
-
+        This->constbuf_alignment = GET_PCAP(CONSTANT_BUFFER_OFFSET_ALIGNMENT);
         This->constbuf_uploader = u_upload_create(This->pipe, This->vs_const_size,
-                                                  alignment, PIPE_BIND_CONSTANT_BUFFER);
+                                                  PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM);
     }
 
     This->driver_caps.window_space_position_support = GET_PCAP(TGSI_VS_WINDOW_SPACE_POSITION);
@@ -2955,6 +2956,7 @@ NineDevice9_DrawPrimitiveUP( struct NineDevice9 *This,
         u_upload_data(This->vertex_uploader,
                       0,
                       (info.max_index + 1) * VertexStreamZeroStride, /* XXX */
+                      4,
                       vtxbuf.user_buffer,
                       &vtxbuf.buffer_offset,
                       &vtxbuf.buffer);
@@ -3027,6 +3029,7 @@ NineDevice9_DrawIndexedPrimitiveUP( struct NineDevice9 *This,
                       base,
                       (info.max_index -
                        info.min_index + 1) * VertexStreamZeroStride, /* XXX */
+                      4,
                       (const uint8_t *)vbuf.user_buffer + base,
                       &vbuf.buffer_offset,
                       &vbuf.buffer);
@@ -3039,6 +3042,7 @@ NineDevice9_DrawIndexedPrimitiveUP( struct NineDevice9 *This,
         u_upload_data(This->index_uploader,
                       0,
                       info.count * ibuf.index_size,
+                      4,
                       ibuf.user_buffer,
                       &ibuf.offset,
                       &ibuf.buffer);
diff --git a/src/gallium/state_trackers/nine/device9.h b/src/gallium/state_trackers/nine/device9.h
index 98d9c4df06a..cbc1e61f5db 100644
--- a/src/gallium/state_trackers/nine/device9.h
+++ b/src/gallium/state_trackers/nine/device9.h
@@ -128,6 +128,7 @@ struct NineDevice9
     struct u_upload_mgr *vertex_uploader;
     struct u_upload_mgr *index_uploader;
     struct u_upload_mgr *constbuf_uploader;
+    unsigned constbuf_alignment;
 
     struct nine_range_pool range_pool;
 
diff --git a/src/gallium/state_trackers/nine/nine_ff.c b/src/gallium/state_trackers/nine/nine_ff.c
index fe8933be69a..0feaeab7330 100644
--- a/src/gallium/state_trackers/nine/nine_ff.c
+++ b/src/gallium/state_trackers/nine/nine_ff.c
@@ -1391,7 +1391,15 @@ nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
     /* Fog.
      */
     if (key->fog_mode) {
-        struct ureg_src vPos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0, TGSI_INTERPOLATE_LINEAR);
+        struct ureg_src vPos;
+        if (device->screen->get_param(device->screen,
+                                      PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL)) {
+            vPos = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
+        } else {
+            vPos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
+                                      TGSI_INTERPOLATE_LINEAR);
+        }
+
         struct ureg_dst rFog = ureg_writemask(ps.rTmp, TGSI_WRITEMASK_X);
         if (key->fog_mode == D3DFOG_EXP) {
             ureg_MUL(ureg, rFog, _ZZZZ(vPos), _ZZZZ(_CONST(22)));
@@ -1866,6 +1874,7 @@ nine_ff_update(struct NineDevice9 *device)
             u_upload_data(device->constbuf_uploader,
                           0,
                           cb.buffer_size,
+                          device->constbuf_alignment,
                           cb.user_buffer,
                           &cb.buffer_offset,
                           &cb.buffer);
@@ -1888,6 +1897,7 @@ nine_ff_update(struct NineDevice9 *device)
             u_upload_data(device->constbuf_uploader,
                           0,
                           cb.buffer_size,
+                          device->constbuf_alignment,
                           cb.user_buffer,
                           &cb.buffer_offset,
                           &cb.buffer);
diff --git a/src/gallium/state_trackers/nine/nine_shader.c b/src/gallium/state_trackers/nine/nine_shader.c
index 28f27870dc8..ed431738abc 100644
--- a/src/gallium/state_trackers/nine/nine_shader.c
+++ b/src/gallium/state_trackers/nine/nine_shader.c
@@ -462,6 +462,8 @@ struct shader_translator
     boolean lower_preds;
     boolean want_texcoord;
     boolean shift_wpos;
+    boolean wpos_is_sysval;
+    boolean face_is_sysval_integer;
     unsigned texcoord_sn;
 
     struct sm1_instruction insn; /* current instruction */
@@ -945,10 +947,16 @@ tx_src_param(struct shader_translator *tx, const struct sm1_src_param *param)
     case D3DSPR_MISCTYPE:
         switch (param->idx) {
         case D3DSMO_POSITION:
-           if (ureg_src_is_undef(tx->regs.vPos))
-               tx->regs.vPos = ureg_DECL_fs_input(ureg,
-                                                  TGSI_SEMANTIC_POSITION, 0,
-                                                  TGSI_INTERPOLATE_LINEAR);
+           if (ureg_src_is_undef(tx->regs.vPos)) {
+              if (tx->wpos_is_sysval) {
+                  tx->regs.vPos =
+                      ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
+              } else {
+                  tx->regs.vPos =
+                      ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
+                                         TGSI_INTERPOLATE_LINEAR);
+              }
+           }
            if (tx->shift_wpos) {
                /* TODO: do this only once */
                struct ureg_dst wpos = tx_scratch(tx);
@@ -961,9 +969,20 @@ tx_src_param(struct shader_translator *tx, const struct sm1_src_param *param)
            break;
         case D3DSMO_FACE:
            if (ureg_src_is_undef(tx->regs.vFace)) {
-               tx->regs.vFace = ureg_DECL_fs_input(ureg,
-                                                   TGSI_SEMANTIC_FACE, 0,
-                                                   TGSI_INTERPOLATE_CONSTANT);
+               if (tx->face_is_sysval_integer) {
+                   tmp = tx_scratch(tx);
+                   tx->regs.vFace =
+                       ureg_DECL_system_value(ureg, TGSI_SEMANTIC_FACE, 0);
+
+                   /* convert bool to float */
+                   ureg_UCMP(ureg, tmp, ureg_scalar(tx->regs.vFace, TGSI_SWIZZLE_X),
+                             ureg_imm1f(ureg, 1), ureg_imm1f(ureg, -1));
+                   tx->regs.vFace = ureg_src(tmp);
+               } else {
+                   tx->regs.vFace = ureg_DECL_fs_input(ureg,
+                                                       TGSI_SEMANTIC_FACE, 0,
+                                                       TGSI_INTERPOLATE_CONSTANT);
+               }
                tx->regs.vFace = ureg_scalar(tx->regs.vFace, TGSI_SWIZZLE_X);
            }
            src = tx->regs.vFace;
@@ -3259,10 +3278,15 @@ shader_add_ps_fog_stage(struct shader_translator *tx, struct ureg_src src_col)
         return;
     }
 
-    if (tx->info->fog_mode != D3DFOG_NONE)
-        depth = ureg_scalar(ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
-                                              TGSI_INTERPOLATE_LINEAR),
-                                              TGSI_SWIZZLE_Z);
+    if (tx->info->fog_mode != D3DFOG_NONE) {
+        if (tx->wpos_is_sysval) {
+            depth = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
+        } else {
+            depth = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
+                                       TGSI_INTERPOLATE_LINEAR);
+        }
+        depth = ureg_scalar(depth, TGSI_SWIZZLE_Z);
+    }
 
     nine_info_mark_const_f_used(tx->info, 33);
     fog_color = NINE_CONSTANT_SRC(32);
@@ -3344,6 +3368,8 @@ nine_translate_shader(struct NineDevice9 *device, struct nine_shader_info *info)
     tx->shift_wpos = !GET_CAP(TGSI_FS_COORD_PIXEL_CENTER_INTEGER);
     tx->texcoord_sn = tx->want_texcoord ?
         TGSI_SEMANTIC_TEXCOORD : TGSI_SEMANTIC_GENERIC;
+    tx->wpos_is_sysval = GET_CAP(TGSI_FS_POSITION_IS_SYSVAL);
+    tx->face_is_sysval_integer = GET_CAP(TGSI_FS_FACE_IS_INTEGER_SYSVAL);
 
     if (IS_VS) {
         tx->num_constf_allowed = NINE_MAX_CONST_F;
diff --git a/src/gallium/state_trackers/nine/nine_state.c b/src/gallium/state_trackers/nine/nine_state.c
index 558d07a2bd0..aee31622088 100644
--- a/src/gallium/state_trackers/nine/nine_state.c
+++ b/src/gallium/state_trackers/nine/nine_state.c
@@ -260,6 +260,7 @@ prepare_vs_constants_userbuf(struct NineDevice9 *device)
         u_upload_data(device->constbuf_uploader,
                       0,
                       cb.buffer_size,
+                      device->constbuf_alignment,
                       cb.user_buffer,
                       &cb.buffer_offset,
                       &cb.buffer);
@@ -336,6 +337,7 @@ prepare_ps_constants_userbuf(struct NineDevice9 *device)
         u_upload_data(device->constbuf_uploader,
                       0,
                       cb.buffer_size,
+                      device->constbuf_alignment,
                       cb.user_buffer,
                       &cb.buffer_offset,
                       &cb.buffer);
diff --git a/src/gallium/state_trackers/nine/swapchain9.c b/src/gallium/state_trackers/nine/swapchain9.c
index 3f5be26fed7..3b1a7a4493c 100644
--- a/src/gallium/state_trackers/nine/swapchain9.c
+++ b/src/gallium/state_trackers/nine/swapchain9.c
@@ -790,7 +790,7 @@ NineSwapChain9_Present( struct NineSwapChain9 *This,
         case D3DSWAPEFFECT_FLIP:
             UNTESTED(4);
         case D3DSWAPEFFECT_DISCARD:
-            /* rotate the queue */;
+            /* rotate the queue */
             pipe_resource_reference(&res, This->buffers[0]->base.resource);
             for (i = 1; i <= This->params.BackBufferCount; i++) {
                 NineSurface9_SetResourceResize(This->buffers[i - 1],
diff --git a/src/gallium/state_trackers/omx/entrypoint.c b/src/gallium/state_trackers/omx/entrypoint.c
index da9ca104d93..afcbd974e76 100644
--- a/src/gallium/state_trackers/omx/entrypoint.c
+++ b/src/gallium/state_trackers/omx/entrypoint.c
@@ -137,7 +137,7 @@ OMX_ERRORTYPE omx_workaround_Destructor(OMX_COMPONENTTYPE *comp)
    priv->state = OMX_StateInvalid;
    tsem_up(priv->messageSem);
 
-   /* wait for thread to exit */;
+   /* wait for thread to exit */
    pthread_join(priv->messageHandlerThread, NULL);
 
    return omx_base_component_Destructor(comp);
diff --git a/src/gallium/state_trackers/va/picture.c b/src/gallium/state_trackers/va/picture.c
index 7b30bf87d75..da9ca5aa6c9 100644
--- a/src/gallium/state_trackers/va/picture.c
+++ b/src/gallium/state_trackers/va/picture.c
@@ -174,6 +174,14 @@ static void
 handleSliceParameterBuffer(vlVaContext *context, vlVaBuffer *buf)
 {
    switch (u_reduce_video_profile(context->templat.profile)) {
+   case PIPE_VIDEO_FORMAT_MPEG12:
+      vlVaHandleSliceParameterBufferMPEG12(context, buf);
+      break;
+
+   case PIPE_VIDEO_FORMAT_VC1:
+      vlVaHandleSliceParameterBufferVC1(context, buf);
+      break;
+
    case PIPE_VIDEO_FORMAT_MPEG4_AVC:
       vlVaHandleSliceParameterBufferH264(context, buf);
       break;
diff --git a/src/gallium/state_trackers/va/picture_h264.c b/src/gallium/state_trackers/va/picture_h264.c
index acbfe5d61ed..883a94a2b52 100644
--- a/src/gallium/state_trackers/va/picture_h264.c
+++ b/src/gallium/state_trackers/va/picture_h264.c
@@ -48,6 +48,7 @@ void vlVaHandlePictureParameterBufferH264(vlVaDriver *drv, vlVaContext *context,
    unsigned i;
 
    assert(buf->size >= sizeof(VAPictureParameterBufferH264) && buf->num_elements == 1);
+   context->desc.h264.slice_count = 0;
    /*CurrPic*/
    context->desc.h264.field_order_cnt[0] = h264->CurrPic.TopFieldOrderCnt;
    context->desc.h264.field_order_cnt[1] = h264->CurrPic.BottomFieldOrderCnt;
@@ -162,6 +163,7 @@ void vlVaHandleSliceParameterBufferH264(vlVaContext *context, vlVaBuffer *buf)
    VASliceParameterBufferH264 *h264 = buf->data;
 
    assert(buf->size >= sizeof(VASliceParameterBufferH264) && buf->num_elements == 1);
+   context->desc.h264.slice_count += buf->num_elements;
    context->desc.h264.num_ref_idx_l0_active_minus1 =
       h264->num_ref_idx_l0_active_minus1;
    context->desc.h264.num_ref_idx_l1_active_minus1 =
diff --git a/src/gallium/state_trackers/va/picture_mpeg12.c b/src/gallium/state_trackers/va/picture_mpeg12.c
index e587b1e9c3f..812e9e5b2a9 100644
--- a/src/gallium/state_trackers/va/picture_mpeg12.c
+++ b/src/gallium/state_trackers/va/picture_mpeg12.c
@@ -32,6 +32,7 @@ void vlVaHandlePictureParameterBufferMPEG12(vlVaDriver *drv, vlVaContext *contex
    VAPictureParameterBufferMPEG2 *mpeg2 = buf->data;
 
    assert(buf->size >= sizeof(VAPictureParameterBufferMPEG2) && buf->num_elements == 1);
+   context->desc.mpeg12.num_slices = 0;
    /*horizontal_size;*/
    /*vertical_size;*/
    vlVaGetReferenceFrame(drv, mpeg2->forward_reference_picture, &context->desc.mpeg12.ref[0]);
@@ -78,3 +79,8 @@ void vlVaHandleIQMatrixBufferMPEG12(vlVaContext *context, vlVaBuffer *buf)
       context->desc.mpeg12.non_intra_matrix = NULL;
 }
 
+void vlVaHandleSliceParameterBufferMPEG12(vlVaContext *context, vlVaBuffer *buf)
+{
+   assert(buf->size >= sizeof(VASliceParameterBufferMPEG2) && buf->num_elements == 1);
+   context->desc.mpeg12.num_slices += buf->num_elements;
+}
diff --git a/src/gallium/state_trackers/va/picture_vc1.c b/src/gallium/state_trackers/va/picture_vc1.c
index f95fd8344f5..6ad1571ca96 100644
--- a/src/gallium/state_trackers/va/picture_vc1.c
+++ b/src/gallium/state_trackers/va/picture_vc1.c
@@ -32,6 +32,7 @@ void vlVaHandlePictureParameterBufferVC1(vlVaDriver *drv, vlVaContext *context,
    VAPictureParameterBufferVC1 * vc1 = buf->data;
 
    assert(buf->size >= sizeof(VAPictureParameterBufferVC1) && buf->num_elements == 1);
+   context->desc.vc1.slice_count = 0;
    vlVaGetReferenceFrame(drv, vc1->forward_reference_picture, &context->desc.vc1.ref[0]);
    vlVaGetReferenceFrame(drv, vc1->backward_reference_picture, &context->desc.vc1.ref[1]);
    context->desc.vc1.picture_type = vc1->picture_fields.bits.picture_type;
@@ -65,3 +66,9 @@ void vlVaHandlePictureParameterBufferVC1(vlVaDriver *drv, vlVaContext *context,
    context->desc.vc1.deblockEnable = vc1->post_processing != 0;
    context->desc.vc1.pquant = vc1->pic_quantizer_fields.bits.pic_quantizer_scale;
 }
+
+void vlVaHandleSliceParameterBufferVC1(vlVaContext *context, vlVaBuffer *buf)
+{
+   assert(buf->size >= sizeof(VASliceParameterBufferVC1) && buf->num_elements == 1);
+   context->desc.vc1.slice_count += buf->num_elements;
+}
diff --git a/src/gallium/state_trackers/va/va_private.h b/src/gallium/state_trackers/va/va_private.h
index fa6e0fb301e..bf9d24b2d34 100644
--- a/src/gallium/state_trackers/va/va_private.h
+++ b/src/gallium/state_trackers/va/va_private.h
@@ -351,10 +351,12 @@ VAStatus vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContex
 void vlVaGetReferenceFrame(vlVaDriver *drv, VASurfaceID surface_id, struct pipe_video_buffer **ref_frame);
 void vlVaHandlePictureParameterBufferMPEG12(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandleIQMatrixBufferMPEG12(vlVaContext *context, vlVaBuffer *buf);
+void vlVaHandleSliceParameterBufferMPEG12(vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandlePictureParameterBufferH264(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandleIQMatrixBufferH264(vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandleSliceParameterBufferH264(vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandlePictureParameterBufferVC1(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf);
+void vlVaHandleSliceParameterBufferVC1(vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandlePictureParameterBufferMPEG4(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandleIQMatrixBufferMPEG4(vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandleSliceParameterBufferMPEG4(vlVaContext *context, vlVaBuffer *buf);
diff --git a/src/gallium/state_trackers/vdpau/mixer.c b/src/gallium/state_trackers/vdpau/mixer.c
index c0b1ecc55fa..dec79ff95e2 100644
--- a/src/gallium/state_trackers/vdpau/mixer.c
+++ b/src/gallium/state_trackers/vdpau/mixer.c
@@ -294,7 +294,7 @@ VdpStatus vlVdpVideoMixerRender(VdpVideoMixer mixer,
    default:
       pipe_mutex_unlock(vmixer->device->mutex);
       return VDP_STATUS_INVALID_VIDEO_MIXER_PICTURE_STRUCTURE;
-   };
+   }
 
    if (deinterlace != VL_COMPOSITOR_WEAVE && vmixer->deint.enabled &&
        video_surface_past_count > 1 && video_surface_future_count > 0) {
diff --git a/src/gallium/targets/va/Makefile.am b/src/gallium/targets/va/Makefile.am
index 733e7acb455..1edd5c2a5c6 100644
--- a/src/gallium/targets/va/Makefile.am
+++ b/src/gallium/targets/va/Makefile.am
@@ -42,6 +42,8 @@ TARGET_DRIVERS =
 TARGET_CPPFLAGS =
 TARGET_LIB_DEPS =
 
+include $(top_srcdir)/src/gallium/drivers/nouveau/Automake.inc
+
 include $(top_srcdir)/src/gallium/drivers/r600/Automake.inc
 include $(top_srcdir)/src/gallium/drivers/radeonsi/Automake.inc
 
diff --git a/src/gallium/tests/trivial/compute.c b/src/gallium/tests/trivial/compute.c
index bcdfb11c4f1..5ce12abe227 100644
--- a/src/gallium/tests/trivial/compute.c
+++ b/src/gallium/tests/trivial/compute.c
@@ -428,6 +428,35 @@ static void launch_grid(struct context *ctx, const uint *block_layout,
         pipe->launch_grid(pipe, block_layout, grid_layout, pc, input);
 }
 
+static void test_default_init(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 0xdeadbeef;
+}
+
+/* test_system_values */
+static void test_system_values_expect(void *p, int s, int x, int y)
+{
+        int id = x / 16, sv = (x % 16) / 4, c = x % 4;
+        int tid[] = { id % 20, (id % 240) / 20, id / 240, 0 };
+        int bsz[] = { 4, 3, 5, 1};
+        int gsz[] = { 5, 4, 1, 1};
+
+        switch (sv) {
+        case 0:
+                *(uint32_t *)p = tid[c] / bsz[c];
+                break;
+        case 1:
+                *(uint32_t *)p = bsz[c];
+                break;
+        case 2:
+                *(uint32_t *)p = gsz[c];
+                break;
+        case 3:
+                *(uint32_t *)p = tid[c] % bsz[c];
+                break;
+        }
+}
+
 static void test_system_values(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -461,44 +490,31 @@ static void test_system_values(struct context *ctx)
                 "  STORE RES[0].xyzw, TEMP[0], SV[3]\n"
                 "  RET\n"
                 "ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                int id = x / 16, sv = (x % 16) / 4, c = x % 4;
-                int tid[] = { id % 20, (id % 240) / 20, id / 240, 0 };
-                int bsz[] = { 4, 3, 5, 1};
-                int gsz[] = { 5, 4, 1, 1};
-
-                switch (sv) {
-                case 0:
-                        *(uint32_t *)p = tid[c] / bsz[c];
-                        break;
-                case 1:
-                        *(uint32_t *)p = bsz[c];
-                        break;
-                case 2:
-                        *(uint32_t *)p = gsz[c];
-                        break;
-                case 3:
-                        *(uint32_t *)p = tid[c] % bsz[c];
-                        break;
-                }
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 76800, 0, init);
+                 76800, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){4, 3, 5}, (uint []){5, 4, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_system_values_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_resource_access */
+static void test_resource_access_init0(void *p, int s, int x, int y)
+{
+        *(float *)p = 8.0 - (float)x;
+}
+
+static void test_resource_access_expect(void *p, int s, int x, int y)
+{
+        *(float *)p = 8.0 - (float)((x + 4 * y) & 0x3f);
+}
+
 static void test_resource_access(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -519,31 +535,33 @@ static void test_resource_access(struct context *ctx)
                 "       STORE RES[1].xyzw, TEMP[1], TEMP[0]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init0(void *p, int s, int x, int y) {
-                *(float *)p = 8.0 - (float)x;
-        }
-        void init1(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(float *)p = 8.0 - (float)((x + 4*y) & 0x3f);
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init0);
+                 256, 0, test_resource_access_init0);
         init_tex(ctx, 1, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                 60, 12, init1);
+                 60, 12, test_default_init);
         init_compute_resources(ctx, (int []) { 0, 1, -1 });
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){15, 12, 1}, 0, NULL);
-        check_tex(ctx, 1, expect, NULL);
+        check_tex(ctx, 1, test_resource_access_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_function_calls */
+static void test_function_calls_init(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 15 * y + x;
+}
+
+static void test_function_calls_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = (15 * y + x) < 4 ? 2 : 1 ;
+}
+
 static void test_function_calls(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -585,26 +603,26 @@ static void test_function_calls(struct context *ctx)
                 "21:  STORE RES[0].x, TEMP[2], TEMP[1].xxxx\n"
                 "22:  RET\n"
                 "23: ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 15 * y + x;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = (15 * y + x) < 4 ? 2 : 1 ;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                 15, 12, init);
+                 15, 12, test_function_calls_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){3, 3, 3}, (uint []){5, 4, 1}, 15, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_function_calls_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_input_global */
+static void test_input_global_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 0xdeadbeef - (x == 0 ? 0x10001 + 2 * s : 0);
+}
+
 static void test_input_global(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -621,35 +639,39 @@ static void test_input_global(struct context *ctx)
                 "       STORE RGLOBAL.x, TEMP[1].yyyy, TEMP[1]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef - (x == 0 ? 0x10001 + 2 * s : 0);
-        }
         uint32_t input[8] = { 0x10001, 0x10002, 0x10003, 0x10004,
                               0x10005, 0x10006, 0x10007, 0x10008 };
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 32, src, NULL);
-        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
-        init_tex(ctx, 1, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
-        init_tex(ctx, 2, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
-        init_tex(ctx, 3, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0,
+                 test_default_init);
+        init_tex(ctx, 1, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0,
+                 test_default_init);
+        init_tex(ctx, 2, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0,
+                 test_default_init);
+        init_tex(ctx, 3, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0,
+                 test_default_init);
         init_globals(ctx, (int []){ 0, 1, 2, 3, -1 },
                      (uint32_t *[]){ &input[1], &input[3],
                                      &input[5], &input[7] });
         launch_grid(ctx, (uint []){4, 1, 1}, (uint []){1, 1, 1}, 0, input);
-        check_tex(ctx, 0, expect, NULL);
-        check_tex(ctx, 1, expect, NULL);
-        check_tex(ctx, 2, expect, NULL);
-        check_tex(ctx, 3, expect, NULL);
+        check_tex(ctx, 0, test_input_global_expect, NULL);
+        check_tex(ctx, 1, test_input_global_expect, NULL);
+        check_tex(ctx, 2, test_input_global_expect, NULL);
+        check_tex(ctx, 3, test_input_global_expect, NULL);
         destroy_globals(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_private */
+static void test_private_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = (x / 32) + x % 32;
+}
+
 static void test_private(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -691,26 +713,26 @@ static void test_private(struct context *ctx)
                 "       ENDLOOP\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = (x / 32) + x % 32;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 128, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 32768, 0, init);
+                 32768, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){16, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_private_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_local */
+static void test_local_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = x & 0x20 ? 2 : 1;
+}
+
 static void test_local(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -778,26 +800,42 @@ static void test_local(struct context *ctx)
                 "       STORE RES[0].x, TEMP[1], TEMP[0]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = x & 0x20 ? 2 : 1;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 256, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 4096, 0, init);
+                 4096, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){64, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_local_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_sample */
+static void test_sample_init(void *p, int s, int x, int y)
+{
+        *(float *)p = s ? 1 : x * y;
+}
+
+static void test_sample_expect(void *p, int s, int x, int y)
+{
+        switch (x % 4) {
+        case 0:
+                *(float *)p = x / 4 * y;
+                break;
+        case 1:
+        case 2:
+                *(float *)p = 0;
+                break;
+        case 3:
+                *(float *)p = 1;
+                break;
+        }
+}
+
 static void test_sample(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -818,36 +856,19 @@ static void test_sample(struct context *ctx)
                 "       STORE RES[0].xyzw, TEMP[0], TEMP[1]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(float *)p = s ? 1 : x * y;
-        }
-        void expect(void *p, int s, int x, int y) {
-                switch (x % 4) {
-                case 0:
-                        *(float *)p = x / 4 * y;
-                        break;
-                case 1:
-                case 2:
-                        *(float *)p = 0;
-                        break;
-                case 3:
-                        *(float *)p = 1;
-                        break;
-                }
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                 128, 32, init);
+                 128, 32, test_sample_init);
         init_tex(ctx, 1, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                 512, 32, init);
+                 512, 32, test_sample_init);
         init_compute_resources(ctx, (int []) { 1, -1 });
         init_sampler_views(ctx, (int []) { 0, -1 });
         init_sampler_states(ctx, 2);
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){128, 32, 1}, 0, NULL);
-        check_tex(ctx, 1, expect, NULL);
+        check_tex(ctx, 1, test_sample_expect, NULL);
         destroy_sampler_states(ctx);
         destroy_sampler_views(ctx);
         destroy_compute_resources(ctx);
@@ -855,6 +876,12 @@ static void test_sample(struct context *ctx)
         destroy_prog(ctx);
 }
 
+/* test_many_kern */
+static void test_many_kern_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = x;
+}
+
 static void test_many_kern(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -883,29 +910,34 @@ static void test_many_kern(struct context *ctx)
                 "       STORE RES[0].x, TEMP[0], IMM[0].wwww\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = x;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 16, 0, init);
+                 16, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 0, NULL);
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 5, NULL);
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 10, NULL);
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 15, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_many_kern_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_constant */
+static void test_constant_init(void *p, int s, int x, int y)
+{
+        *(float *)p = s ? 0xdeadbeef : 8.0 - (float)x;
+}
+
+static void test_constant_expect(void *p, int s, int x, int y)
+{
+        *(float *)p = 8.0 - (float)x;
+}
+
 static void test_constant(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -922,28 +954,36 @@ static void test_constant(struct context *ctx)
                 "       STORE RES[1].x, TEMP[0], TEMP[1]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(float *)p = s ? 0xdeadbeef : 8.0 - (float)x;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(float *)p = 8.0 - (float)x;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_constant_init);
         init_tex(ctx, 1, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_constant_init);
         init_compute_resources(ctx, (int []) { 0, 1, -1 });
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){64, 1, 1}, 0, NULL);
-        check_tex(ctx, 1, expect, NULL);
+        check_tex(ctx, 1, test_constant_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_resource_indirect */
+static void test_resource_indirect_init(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = s == 0 ? 0xdeadbeef :
+                s == 1 ? x % 2 :
+                s == 2 ? 2 * x :
+                2 * x + 1;
+}
+
+static void test_resource_indirect_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 2 * x + (x % 2 ? 1 : 0);
+}
+
 static void test_resource_indirect(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -961,35 +1001,27 @@ static void test_resource_indirect(struct context *ctx)
                 "       STORE RES[0].x, TEMP[0], TEMP[1]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = s == 0 ? 0xdeadbeef :
-                   s == 1 ? x % 2 :
-                   s == 2 ? 2 * x :
-                   2 * x + 1;
-        }
-        void expect(void *p, int s, int x, int y) {
-           *(uint32_t *)p = 2 * x + (x % 2 ? 1 : 0);
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_resource_indirect_init);
         init_tex(ctx, 1, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_resource_indirect_init);
         init_tex(ctx, 2, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_resource_indirect_init);
         init_tex(ctx, 3, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_resource_indirect_init);
         init_compute_resources(ctx, (int []) { 0, 1, 2, 3, -1 });
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){64, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_resource_indirect_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_surface_ld */
 enum pipe_format surface_fmts[] = {
         PIPE_FORMAT_B8G8R8A8_UNORM,
         PIPE_FORMAT_B8G8R8X8_UNORM,
@@ -1023,6 +1055,42 @@ enum pipe_format surface_fmts[] = {
         PIPE_FORMAT_R32G32B32A32_SINT
 };
 
+static void test_surface_ld_init0f(void *p, int s, int x, int y)
+{
+        float v[] = { 1.0, -.75, .50, -.25 };
+        int i = 0;
+
+        util_format_write_4f(surface_fmts[i], v, 0, p, 0, 0, 0, 1, 1);
+}
+
+static void test_surface_ld_init0i(void *p, int s, int x, int y)
+{
+        int v[] = { 0xffffffff, 0xffff, 0xff, 0xf };
+        int i = 0;
+
+        util_format_write_4i(surface_fmts[i], v, 0, p, 0, 0, 0, 1, 1);
+}
+
+static void test_surface_ld_expectf(void *p, int s, int x, int y)
+{
+        float v[4], w[4];
+        int i = 0;
+
+        test_surface_ld_init0f(v, s, x / 4, y);
+        util_format_read_4f(surface_fmts[i], w, 0, v, 0, 0, 0, 1, 1);
+        *(float *)p = w[x % 4];
+}
+
+static void test_surface_ld_expecti(void *p, int s, int x, int y)
+{
+        int32_t v[4], w[4];
+        int i = 0;
+
+        test_surface_ld_init0i(v, s, x / 4, y);
+        util_format_read_4i(surface_fmts[i], w, 0, v, 0, 0, 0, 1, 1);
+        *(uint32_t *)p = w[x % 4];
+}
+
 static void test_surface_ld(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -1040,33 +1108,6 @@ static void test_surface_ld(struct context *ctx)
                 "       RET\n"
                 "    ENDSUB\n";
         int i = 0;
-        void init0f(void *p, int s, int x, int y) {
-                float v[] = { 1.0, -.75, .50, -.25 };
-                util_format_write_4f(surface_fmts[i], v, 0,
-                                     p, 0, 0, 0, 1, 1);
-        }
-        void init0i(void *p, int s, int x, int y) {
-                int v[] = { 0xffffffff, 0xffff, 0xff, 0xf };
-                util_format_write_4i(surface_fmts[i], v, 0,
-                                     p, 0, 0, 0, 1, 1);
-        }
-        void init1(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expectf(void *p, int s, int x, int y) {
-                float v[4], w[4];
-                init0f(v, s, x / 4, y);
-                util_format_read_4f(surface_fmts[i], w, 0,
-                                    v, 0, 0, 0, 1, 1);
-                *(float *)p = w[x % 4];
-        }
-        void expecti(void *p, int s, int x, int y) {
-                int32_t v[4], w[4];
-                init0i(v, s, x / 4, y);
-                util_format_read_4i(surface_fmts[i], w, 0,
-                                    v, 0, 0, 0, 1, 1);
-                *(uint32_t *)p = w[x % 4];
-        }
 
         printf("- %s\n", __func__);
 
@@ -1085,14 +1126,14 @@ static void test_surface_ld(struct context *ctx)
                 }
 
                 init_tex(ctx, 0, PIPE_TEXTURE_2D, true, surface_fmts[i],
-                         128, 32, (is_int ? init0i : init0f));
+                         128, 32, (is_int ? test_surface_ld_init0i : test_surface_ld_init0f));
                 init_tex(ctx, 1, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                         512, 32, init1);
+                         512, 32, test_default_init);
                 init_compute_resources(ctx, (int []) { 0, 1, -1 });
                 init_sampler_states(ctx, 2);
                 launch_grid(ctx, (uint []){1, 1, 1}, (uint []){128, 32, 1}, 0,
                             NULL);
-                check_tex(ctx, 1, (is_int ? expecti : expectf), NULL);
+                check_tex(ctx, 1, (is_int ? test_surface_ld_expecti : test_surface_ld_expectf), NULL);
                 destroy_sampler_states(ctx);
                 destroy_compute_resources(ctx);
                 destroy_tex(ctx);
@@ -1101,6 +1142,73 @@ static void test_surface_ld(struct context *ctx)
         destroy_prog(ctx);
 }
 
+/* test_surface_st */
+static void test_surface_st_init0f(void *p, int s, int x, int y)
+{
+        float v[] = { 1.0, -.75, 0.5, -.25 };
+        *(float *)p = v[x % 4];
+}
+
+static void test_surface_st_init0i(void *p, int s, int x, int y)
+{
+        int v[] = { 0xffffffff, 0xffff, 0xff, 0xf };
+        *(int32_t *)p = v[x % 4];
+}
+
+static void test_surface_st_init1(void *p, int s, int x, int y)
+{
+        int i = 0;
+        memset(p, 1, util_format_get_blocksize(surface_fmts[i]));
+}
+
+static void test_surface_st_expectf(void *p, int s, int x, int y)
+{
+        float vf[4];
+        int i = 0, j;
+
+        for (j = 0; j < 4; j++)
+                test_surface_st_init0f(&vf[j], s, 4 * x + j, y);
+        util_format_write_4f(surface_fmts[i], vf, 0, p, 0, 0, 0, 1, 1);
+}
+
+static void test_surface_st_expects(void *p, int s, int x, int y)
+{
+        int32_t v[4];
+        int i = 0, j;
+
+        for (j = 0; j < 4; j++)
+                test_surface_st_init0i(&v[j], s, 4 * x + j, y);
+        util_format_write_4i(surface_fmts[i], v, 0, p, 0, 0, 0, 1, 1);
+}
+
+static void test_surface_st_expectu(void *p, int s, int x, int y)
+{
+        uint32_t v[4];
+        int i = 0, j;
+
+        for (j = 0; j < 4; j++)
+                test_surface_st_init0i(&v[j], s, 4 * x + j, y);
+        util_format_write_4ui(surface_fmts[i], v, 0, p, 0, 0, 0, 1, 1);
+}
+
+static bool test_surface_st_check(void *x, void *y, int sz)
+{
+        int i = 0, j;
+
+        if (util_format_is_float(surface_fmts[i])) {
+                return fabs(*(float *)x - *(float *)y) < 3.92156863e-3;
+
+        } else if ((sz % 4) == 0) {
+                for (j = 0; j < sz / 4; j++)
+                        if (abs(((uint32_t *)x)[j] -
+                                ((uint32_t *)y)[j]) > 1)
+                                return false;
+                return true;
+        } else {
+                return !memcmp(x, y, sz);
+        }
+}
+
 static void test_surface_st(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -1118,60 +1226,6 @@ static void test_surface_st(struct context *ctx)
                 "       RET\n"
                 "    ENDSUB\n";
         int i = 0;
-        void init0f(void *p, int s, int x, int y) {
-                float v[] = { 1.0, -.75, 0.5, -.25 };
-                *(float *)p = v[x % 4];
-        }
-        void init0i(void *p, int s, int x, int y) {
-                int v[] = { 0xffffffff, 0xffff, 0xff, 0xf };
-                *(int32_t *)p = v[x % 4];
-        }
-        void init1(void *p, int s, int x, int y) {
-                memset(p, 1, util_format_get_blocksize(surface_fmts[i]));
-        }
-        void expectf(void *p, int s, int x, int y) {
-                float vf[4];
-                int j;
-
-                for (j = 0; j < 4; j++)
-                        init0f(&vf[j], s, 4 * x + j, y);
-                util_format_write_4f(surface_fmts[i], vf, 0,
-                                     p, 0, 0, 0, 1, 1);
-        }
-        void expects(void *p, int s, int x, int y) {
-                int32_t v[4];
-                int j;
-
-                for (j = 0; j < 4; j++)
-                        init0i(&v[j], s, 4 * x + j, y);
-                util_format_write_4i(surface_fmts[i], v, 0,
-                                     p, 0, 0, 0, 1, 1);
-        }
-        void expectu(void *p, int s, int x, int y) {
-                uint32_t v[4];
-                int j;
-
-                for (j = 0; j < 4; j++)
-                        init0i(&v[j], s, 4 * x + j, y);
-                util_format_write_4ui(surface_fmts[i], v, 0,
-                                      p, 0, 0, 0, 1, 1);
-        }
-        bool check(void *x, void *y, int sz) {
-                int j;
-
-                if (util_format_is_float(surface_fmts[i])) {
-                        return fabs(*(float *)x - *(float *)y) < 3.92156863e-3;
-
-                } else if ((sz % 4) == 0) {
-                        for (j = 0; j < sz / 4; j++)
-                                if (abs(((uint32_t *)x)[j] -
-                                        ((uint32_t *)y)[j]) > 1)
-                                        return false;
-                        return true;
-                } else {
-                        return !memcmp(x, y, sz);
-                }
-        }
 
         printf("- %s\n", __func__);
 
@@ -1192,16 +1246,16 @@ static void test_surface_st(struct context *ctx)
                 }
 
                 init_tex(ctx, 0, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                         512, 32, (is_int ? init0i : init0f));
+                         512, 32, (is_int ? test_surface_st_init0i : test_surface_st_init0f));
                 init_tex(ctx, 1, PIPE_TEXTURE_2D, true, surface_fmts[i],
-                         128, 32, init1);
+                         128, 32, test_surface_st_init1);
                 init_compute_resources(ctx, (int []) { 0, 1, -1 });
                 init_sampler_states(ctx, 2);
                 launch_grid(ctx, (uint []){1, 1, 1}, (uint []){128, 32, 1}, 0,
                             NULL);
-                check_tex(ctx, 1, (is_int && is_signed ? expects :
-                                   is_int && !is_signed ? expectu :
-                                   expectf), check);
+                check_tex(ctx, 1, (is_int && is_signed ? test_surface_st_expects :
+                                   is_int && !is_signed ? test_surface_st_expectu :
+                                   test_surface_st_expectf), test_surface_st_check);
                 destroy_sampler_states(ctx);
                 destroy_compute_resources(ctx);
                 destroy_tex(ctx);
@@ -1210,6 +1264,12 @@ static void test_surface_st(struct context *ctx)
         destroy_prog(ctx);
 }
 
+/* test_barrier */
+static void test_barrier_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 31;
+}
+
 static void test_barrier(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -1259,26 +1319,62 @@ static void test_barrier(struct context *ctx)
                 "       STORE RES[0].x, TEMP[1], TEMP[0]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 31;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 256, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 4096, 0, init);
+                 4096, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){64, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_barrier_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_atom_ops */
+static void test_atom_ops_init(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 0xbad;
+}
+
+static void test_atom_ops_expect(void *p, int s, int x, int y)
+{
+        switch (x) {
+        case 0:
+                *(uint32_t *)p = 0xce6c8eef;
+                break;
+        case 1:
+                *(uint32_t *)p = 0xdeadbeef;
+                break;
+        case 2:
+                *(uint32_t *)p = 0x11111111;
+                break;
+        case 3:
+                *(uint32_t *)p = 0x10011001;
+                break;
+        case 4:
+                *(uint32_t *)p = 0xdfbdbfff;
+                break;
+        case 5:
+                *(uint32_t *)p = 0x11111111;
+                break;
+        case 6:
+                *(uint32_t *)p = 0x11111111;
+                break;
+        case 7:
+                *(uint32_t *)p = 0xdeadbeef;
+                break;
+        case 8:
+                *(uint32_t *)p = 0xdeadbeef;
+                break;
+        case 9:
+                *(uint32_t *)p = 0x11111111;
+                break;
+        }
+}
+
 static void test_atom_ops(struct context *ctx, bool global)
 {
         const char *src = "COMP\n"
@@ -1381,58 +1477,26 @@ static void test_atom_ops(struct context *ctx, bool global)
                 "       RET\n"
                 "    ENDSUB\n";
 
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xbad;
-        }
-        void expect(void *p, int s, int x, int y) {
-                switch (x) {
-                case 0:
-                        *(uint32_t *)p = 0xce6c8eef;
-                        break;
-                case 1:
-                        *(uint32_t *)p = 0xdeadbeef;
-                        break;
-                case 2:
-                        *(uint32_t *)p = 0x11111111;
-                        break;
-                case 3:
-                        *(uint32_t *)p = 0x10011001;
-                        break;
-                case 4:
-                        *(uint32_t *)p = 0xdfbdbfff;
-                        break;
-                case 5:
-                        *(uint32_t *)p = 0x11111111;
-                        break;
-                case 6:
-                        *(uint32_t *)p = 0x11111111;
-                        break;
-                case 7:
-                        *(uint32_t *)p = 0xdeadbeef;
-                        break;
-                case 8:
-                        *(uint32_t *)p = 0xdeadbeef;
-                        break;
-                case 9:
-                        *(uint32_t *)p = 0x11111111;
-                        break;
-                }
-        }
-
         printf("- %s (%s)\n", __func__, global ? "global" : "local");
 
         init_prog(ctx, 40, 0, 0, src,
                   (global ? "-DTARGET_GLOBAL" : "-DTARGET_LOCAL"));
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 40, 0, init);
+                 40, 0, test_atom_ops_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){10, 1, 1}, (uint []){1, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_atom_ops_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_atom_race */
+static void test_atom_race_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = x & 0x20 ? 0x11111111 : 0xffffffff;
+}
+
 static void test_atom_race(struct context *ctx, bool global)
 {
         const char *src = "COMP\n"
@@ -1551,22 +1615,15 @@ static void test_atom_race(struct context *ctx, bool global)
                 "       RET\n"
                 "    ENDSUB\n";
 
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = x & 0x20 ? 0x11111111 : 0xffffffff;
-        }
-
         printf("- %s (%s)\n", __func__, global ? "global" : "local");
 
         init_prog(ctx, 256, 0, 0, src,
                   (global ? "-DTARGET_GLOBAL" : "-DTARGET_LOCAL"));
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 4096, 0, init);
+                 4096, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){64, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_atom_race_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
diff --git a/src/gallium/winsys/svga/drm/vmw_context.c b/src/gallium/winsys/svga/drm/vmw_context.c
index 4dc32366d61..dae121e4053 100644
--- a/src/gallium/winsys/svga/drm/vmw_context.c
+++ b/src/gallium/winsys/svga/drm/vmw_context.c
@@ -251,7 +251,7 @@ vmw_swc_flush(struct svga_winsys_context *swc,
    vswc->must_flush = FALSE;
    debug_flush_flush(vswc->fctx);
 #endif
-   swc->hints &= ~SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints &= ~SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    vswc->preemptive_flush = FALSE;
    vswc->seen_surfaces = 0;
    vswc->seen_regions = 0;
@@ -373,7 +373,7 @@ vmw_swc_region_relocation(struct svga_winsys_context *swc,
 
    if (vmw_swc_add_validate_buffer(vswc, reloc->buffer, flags)) {
       vswc->seen_regions += reloc->buffer->size;
-      if ((swc->hints & SVGA_HINT_FLAG_DRAW_EMITTED) &&
+      if ((swc->hints & SVGA_HINT_FLAG_CAN_PRE_FLUSH) &&
           vswc->seen_regions >= VMW_GMR_POOL_SIZE/5)
          vswc->preemptive_flush = TRUE;
    }
@@ -416,7 +416,7 @@ vmw_swc_mob_relocation(struct svga_winsys_context *swc,
    if (vmw_swc_add_validate_buffer(vswc, pb_buffer, flags)) {
       vswc->seen_mobs += pb_buffer->size;
 
-      if ((swc->hints & SVGA_HINT_FLAG_DRAW_EMITTED) &&
+      if ((swc->hints & SVGA_HINT_FLAG_CAN_PRE_FLUSH) &&
           vswc->seen_mobs >=
             vswc->vws->ioctl.max_mob_memory / VMW_MAX_MOB_MEM_FACTOR)
          vswc->preemptive_flush = TRUE;
@@ -479,7 +479,7 @@ vmw_swc_surface_only_relocation(struct svga_winsys_context *swc,
       ++vswc->surface.staged;
 
       vswc->seen_surfaces += vsurf->size;
-      if ((swc->hints & SVGA_HINT_FLAG_DRAW_EMITTED) &&
+      if ((swc->hints & SVGA_HINT_FLAG_CAN_PRE_FLUSH) &&
           vswc->seen_surfaces >=
             vswc->vws->ioctl.max_surface_memory / VMW_MAX_SURF_MEM_FACTOR)
          vswc->preemptive_flush = TRUE;
diff --git a/src/glsl/Makefile.am b/src/glsl/Makefile.am
index 1396758b7c2..2e180a545ae 100644
--- a/src/glsl/Makefile.am
+++ b/src/glsl/Makefile.am
@@ -145,8 +145,8 @@ libglsl_la_SOURCES =					\
 	$(LIBGLSL_FILES)				\
 	$(NIR_FILES)					\
 	$(SPIRV_FILES)					\
-	$(NIR_GENERATED_FILES)
-
+	$(NIR_GENERATED_FILES)				\
+	$(GLSL_TO_NIR_FILES)
 
 libnir_la_SOURCES =					\
 	$(NIR_FILES)					\
diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index 89113bcd1c6..80d21a0718d 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -18,8 +18,6 @@ NIR_GENERATED_FILES = \
 	nir/nir_opt_algebraic.c
 
 NIR_FILES = \
-	nir/glsl_to_nir.cpp \
-	nir/glsl_to_nir.h \
 	nir/glsl_types.cpp \
 	nir/glsl_types.h \
 	nir/builtin_type_macros.h \
@@ -225,6 +223,11 @@ LIBGLSL_FILES = \
 	s_expression.cpp \
 	s_expression.h
 
+# glsl to nir pass
+GLSL_TO_NIR_FILES = \
+	nir/glsl_to_nir.cpp \
+	nir/glsl_to_nir.h
+
 # glsl_compiler
 
 GLSL_COMPILER_CXX_FILES = \
diff --git a/src/glsl/SConscript b/src/glsl/SConscript
index 70bf5b09c3c..a9d38c163b7 100644
--- a/src/glsl/SConscript
+++ b/src/glsl/SConscript
@@ -65,6 +65,7 @@ for l in ('LIBGLCPP_FILES', 'LIBGLSL_FILES'):
 # XXX: Remove this once we build NIR and NIR_FILES.
 glsl_sources += [
     'nir/glsl_types.cpp',
+    'nir/shader_enums.c',
 ]
 
 if env['msvc']:
diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 376dfda2c84..b5e08c048c5 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -3106,7 +3106,7 @@ apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual,
                _mesa_glsl_error(loc, state,
                                 "misaligned atomic counter offset");
 
-            var->data.atomic.offset = *offset;
+            var->data.offset = *offset;
             *offset += var->type->atomic_size();
 
          } else {
@@ -3518,7 +3518,7 @@ get_variable_being_redeclared(ir_variable *var, YYLTYPE loc,
               state->is_version(150, 0))
               && strcmp(var->name, "gl_FragCoord") == 0
               && earlier->type == var->type
-              && earlier->data.mode == var->data.mode) {
+              && var->data.mode == ir_var_shader_in) {
       /* Allow redeclaration of gl_FragCoord for ARB_fcc layout
        * qualifiers.
        */
@@ -6170,7 +6170,7 @@ ast_type_specifier::hir(exec_list *instructions,
  * The number of fields processed.  A pointer to the array structure fields is
  * stored in \c *fields_ret.
  */
-unsigned
+static unsigned
 ast_process_struct_or_iface_block_members(exec_list *instructions,
                                           struct _mesa_glsl_parse_state *state,
                                           exec_list *declarations,
@@ -6376,12 +6376,13 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
             if (process_qualifier_constant(state, &loc, "location",
                                            qual->location, &qual_location)) {
                fields[i].location = VARYING_SLOT_VAR0 + qual_location;
-               expl_location = fields[i].location + 1;
+               expl_location = fields[i].location +
+                  fields[i].type->count_attribute_slots(false);
             }
          } else {
             if (layout && layout->flags.q.explicit_location) {
                fields[i].location = expl_location;
-               expl_location = expl_location + 1;
+               expl_location += fields[i].type->count_attribute_slots(false);
             } else {
                fields[i].location = -1;
             }
@@ -6485,7 +6486,7 @@ ast_struct_specifier::hir(exec_list *instructions,
 
    state->struct_specifier_depth++;
 
-   unsigned expl_location = -1;
+   unsigned expl_location = 0;
    if (layout && layout->flags.q.explicit_location) {
       if (!process_qualifier_constant(state, &loc, "location",
                                       layout->location, &expl_location)) {
@@ -6672,7 +6673,7 @@ ast_interface_block::hir(exec_list *instructions,
       return NULL;
    }
 
-   unsigned expl_location = -1;
+   unsigned expl_location = 0;
    if (layout.flags.q.explicit_location) {
       if (!process_qualifier_constant(state, &loc, "location",
                                       layout.location, &expl_location)) {
diff --git a/src/glsl/builtin_variables.cpp b/src/glsl/builtin_variables.cpp
index e82c99ee3bb..221aab0043b 100644
--- a/src/glsl/builtin_variables.cpp
+++ b/src/glsl/builtin_variables.cpp
@@ -1057,8 +1057,16 @@ builtin_variable_generator::generate_fs_special_vars()
 {
    ir_variable *var;
 
-   add_input(VARYING_SLOT_POS, vec4_t, "gl_FragCoord");
-   add_input(VARYING_SLOT_FACE, bool_t, "gl_FrontFacing");
+   if (this->state->ctx->Const.GLSLFragCoordIsSysVal)
+      add_system_value(SYSTEM_VALUE_FRAG_COORD, vec4_t, "gl_FragCoord");
+   else
+      add_input(VARYING_SLOT_POS, vec4_t, "gl_FragCoord");
+
+   if (this->state->ctx->Const.GLSLFrontFacingIsSysVal)
+      add_system_value(SYSTEM_VALUE_FRONT_FACE, bool_t, "gl_FrontFacing");
+   else
+      add_input(VARYING_SLOT_FACE, bool_t, "gl_FrontFacing");
+
    if (state->is_version(120, 100))
       add_input(VARYING_SLOT_PNTC, vec2_t, "gl_PointCoord");
 
diff --git a/src/glsl/glcpp/glcpp-parse.y b/src/glsl/glcpp/glcpp-parse.y
index 2fd4cf04079..ef1a6575aaa 100644
--- a/src/glsl/glcpp/glcpp-parse.y
+++ b/src/glsl/glcpp/glcpp-parse.y
@@ -2506,6 +2506,9 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 
               if (extensions->ARB_shader_subroutine)
                  add_builtin_define(parser, "GL_ARB_shader_subroutine", 1);
+
+              if (extensions->ARB_shader_draw_parameters)
+                 add_builtin_define(parser, "GL_ARB_shader_draw_parameters", 1);
 	   }
 	}
 
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index fb53f6e2e2d..131a5641f8f 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -414,44 +414,6 @@ _mesa_glsl_parse_state::process_version_directive(YYLTYPE *locp, int version,
 }
 
 
-/**
- * Translate a gl_shader_stage to a short shader stage name for debug
- * printouts and error messages.
- */
-const char *
-_mesa_shader_stage_to_string(unsigned stage)
-{
-   switch (stage) {
-   case MESA_SHADER_VERTEX:   return "vertex";
-   case MESA_SHADER_FRAGMENT: return "fragment";
-   case MESA_SHADER_GEOMETRY: return "geometry";
-   case MESA_SHADER_COMPUTE:  return "compute";
-   case MESA_SHADER_TESS_CTRL: return "tess ctrl";
-   case MESA_SHADER_TESS_EVAL: return "tess eval";
-   }
-
-   unreachable("Unknown shader stage.");
-}
-
-/**
- * Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS)
- * for debug printouts and error messages.
- */
-const char *
-_mesa_shader_stage_to_abbrev(unsigned stage)
-{
-   switch (stage) {
-   case MESA_SHADER_VERTEX:   return "VS";
-   case MESA_SHADER_FRAGMENT: return "FS";
-   case MESA_SHADER_GEOMETRY: return "GS";
-   case MESA_SHADER_COMPUTE:  return "CS";
-   case MESA_SHADER_TESS_CTRL: return "TCS";
-   case MESA_SHADER_TESS_EVAL: return "TES";
-   }
-
-   unreachable("Unknown shader stage.");
-}
-
 /* This helper function will append the given message to the shader's
    info log and report it via GL_ARB_debug_output. Per that extension,
    'type' is one of the enum values classifying the message, and
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index afb99afa5cd..ecc29920918 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -731,16 +731,6 @@ extern bool _mesa_glsl_process_extension(const char *name, YYLTYPE *name_locp,
 extern "C" {
 #endif
 
-/**
- * Get the textual name of the specified shader stage (which is a
- * gl_shader_stage).
- */
-extern const char *
-_mesa_shader_stage_to_string(unsigned stage);
-
-extern const char *
-_mesa_shader_stage_to_abbrev(unsigned stage);
-
 extern int glcpp_preprocess(void *ctx, const char **shader, char **info_log,
                       const struct gl_extensions *extensions, struct gl_context *gl_ctx);
 
diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
index 70227070ca7..d82bccd5d2f 100644
--- a/src/glsl/ir.cpp
+++ b/src/glsl/ir.cpp
@@ -1674,7 +1674,7 @@ ir_variable::ir_variable(const struct glsl_type *type, const char *name,
    this->data.mode = mode;
    this->data.interpolation = INTERP_QUALIFIER_NONE;
    this->data.max_array_access = 0;
-   this->data.atomic.offset = 0;
+   this->data.offset = 0;
    this->data.precision = GLSL_PRECISION_NONE;
    this->data.image_read_only = false;
    this->data.image_write_only = false;
diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index 159f94d9edd..a728c036e6b 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -871,9 +871,7 @@ public:
       /**
        * Location an atomic counter is stored at.
        */
-      struct {
-         unsigned offset;
-      } atomic;
+      unsigned offset;
 
       /**
        * Highest element accessed with a constant expression array index
@@ -1726,6 +1724,8 @@ public:
       return operation == ir_binop_all_equal ||
              operation == ir_binop_any_nequal ||
              operation == ir_binop_dot ||
+             operation == ir_binop_vector_extract ||
+             operation == ir_triop_vector_insert ||
              operation == ir_quadop_vector;
    }
 
diff --git a/src/glsl/ir_constant_expression.cpp b/src/glsl/ir_constant_expression.cpp
index 5bf5ce54f78..f02e959bd18 100644
--- a/src/glsl/ir_constant_expression.cpp
+++ b/src/glsl/ir_constant_expression.cpp
@@ -1824,9 +1824,7 @@ ir_swizzle::constant_expression_value(struct hash_table *variable_context)
 ir_constant *
 ir_dereference_variable::constant_expression_value(struct hash_table *variable_context)
 {
-   /* This may occur during compile and var->type is glsl_type::error_type */
-   if (!var)
-      return NULL;
+   assert(var);
 
    /* Give priority to the context hashtable, if it exists */
    if (variable_context) {
diff --git a/src/glsl/link_atomics.cpp b/src/glsl/link_atomics.cpp
index 3aa52dbd18a..277d4737ab7 100644
--- a/src/glsl/link_atomics.cpp
+++ b/src/glsl/link_atomics.cpp
@@ -83,16 +83,16 @@ namespace {
       const active_atomic_counter *const first = (active_atomic_counter *) a;
       const active_atomic_counter *const second = (active_atomic_counter *) b;
 
-      return int(first->var->data.atomic.offset) - int(second->var->data.atomic.offset);
+      return int(first->var->data.offset) - int(second->var->data.offset);
    }
 
    bool
    check_atomic_counters_overlap(const ir_variable *x, const ir_variable *y)
    {
-      return ((x->data.atomic.offset >= y->data.atomic.offset &&
-               x->data.atomic.offset < y->data.atomic.offset + y->type->atomic_size()) ||
-              (y->data.atomic.offset >= x->data.atomic.offset &&
-               y->data.atomic.offset < x->data.atomic.offset + x->type->atomic_size()));
+      return ((x->data.offset >= y->data.offset &&
+               x->data.offset < y->data.offset + y->type->atomic_size()) ||
+              (y->data.offset >= x->data.offset &&
+               y->data.offset < x->data.offset + x->type->atomic_size()));
    }
 
    void
@@ -158,7 +158,7 @@ namespace {
             ir_variable *var = node->as_variable();
 
             if (var && var->type->contains_atomic()) {
-               int offset = var->data.atomic.offset;
+               int offset = var->data.offset;
                unsigned uniform_loc = var->data.location;
                process_atomic_variable(var->type, prog, &uniform_loc,
                                        var, buffers, num_buffers, &offset, i);
@@ -185,7 +185,7 @@ namespace {
                linker_error(prog, "Atomic counter %s declared at offset %d "
                             "which is already in use.",
                             buffers[i].counters[j].var->name,
-                            buffers[i].counters[j].var->data.atomic.offset);
+                            buffers[i].counters[j].var->data.offset);
             }
          }
       }
@@ -237,7 +237,7 @@ link_assign_atomic_counter_resources(struct gl_context *ctx,
             var->data.binding = i;
 
          storage->atomic_buffer_index = i;
-         storage->offset = var->data.atomic.offset;
+         storage->offset = var->data.offset;
          storage->array_stride = (var->type->is_array() ?
                                   var->type->without_array()->atomic_size() : 0);
          if (!var->type->is_matrix())
diff --git a/src/glsl/link_uniform_block_active_visitor.cpp b/src/glsl/link_uniform_block_active_visitor.cpp
index 422739af063..54fea700b53 100644
--- a/src/glsl/link_uniform_block_active_visitor.cpp
+++ b/src/glsl/link_uniform_block_active_visitor.cpp
@@ -24,7 +24,7 @@
 #include "link_uniform_block_active_visitor.h"
 #include "program.h"
 
-link_uniform_block_active *
+static link_uniform_block_active *
 process_block(void *mem_ctx, struct hash_table *ht, ir_variable *var)
 {
    const hash_entry *const existing_block =
@@ -92,7 +92,7 @@ process_block(void *mem_ctx, struct hash_table *ht, ir_variable *var)
  * and not over complicating the code we will end up with a count of 8.
  * Here each dimension has 2 different indices counted so we end up with 2*2*2
  */
-struct uniform_block_array_elements **
+static struct uniform_block_array_elements **
 process_arrays(void *mem_ctx, ir_dereference_array *ir,
                struct link_uniform_block_active *block)
 {
diff --git a/src/glsl/link_uniform_blocks.cpp b/src/glsl/link_uniform_blocks.cpp
index d5d30bb0a0d..7d755765852 100644
--- a/src/glsl/link_uniform_blocks.cpp
+++ b/src/glsl/link_uniform_blocks.cpp
@@ -266,7 +266,7 @@ process_block_array(struct uniform_block_array_elements *ub_array, char **name,
 /* This function resizes the array types of the block so that later we can use
  * this new size to correctly calculate the offest for indirect indexing.
  */
-const glsl_type *
+static const glsl_type *
 resize_block_array(const glsl_type *type,
                    struct uniform_block_array_elements *ub_array)
 {
diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 9cc77feb78a..3853abdb8e6 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -41,6 +41,29 @@
 
 
 /**
+ * Get the varying type stripped of the outermost array if we're processing
+ * a stage whose varyings are arrays indexed by a vertex number (such as
+ * geometry shader inputs).
+ */
+static const glsl_type *
+get_varying_type(const ir_variable *var, gl_shader_stage stage)
+{
+   const glsl_type *type = var->type;
+
+   if (!var->data.patch &&
+       ((var->data.mode == ir_var_shader_out &&
+         stage == MESA_SHADER_TESS_CTRL) ||
+        (var->data.mode == ir_var_shader_in &&
+         (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_TESS_EVAL ||
+          stage == MESA_SHADER_GEOMETRY)))) {
+      assert(type->is_array());
+      type = type->fields.array;
+   }
+
+   return type;
+}
+
+/**
  * Validate the types and qualifiers of an output from one stage against the
  * matching input to another stage.
  */
@@ -309,6 +332,41 @@ cross_validate_outputs_to_inputs(struct gl_shader_program *prog,
    }
 }
 
+/**
+ * Demote shader inputs and outputs that are not used in other stages, and
+ * remove them via dead code elimination.
+ */
+void
+remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object,
+                                        gl_shader *sh,
+                                        enum ir_variable_mode mode)
+{
+   if (is_separate_shader_object)
+      return;
+
+   foreach_in_list(ir_instruction, node, sh->ir) {
+      ir_variable *const var = node->as_variable();
+
+      if ((var == NULL) || (var->data.mode != int(mode)))
+	 continue;
+
+      /* A shader 'in' or 'out' variable is only really an input or output if
+       * its value is used by other shader stages. This will cause the
+       * variable to have a location assigned.
+       */
+      if (var->data.is_unmatched_generic_inout) {
+         assert(var->data.mode != ir_var_temporary);
+	 var->data.mode = ir_var_auto;
+      }
+   }
+
+   /* Eliminate code that is now dead due to unused inputs/outputs being
+    * demoted.
+    */
+   while (do_dead_code(sh->ir, false))
+      ;
+
+}
 
 /**
  * Initialize this object based on a string that was passed to
@@ -767,7 +825,8 @@ public:
                    gl_shader_stage consumer_stage);
    ~varying_matches();
    void record(ir_variable *producer_var, ir_variable *consumer_var);
-   unsigned assign_locations(uint64_t reserved_slots, bool separate_shader);
+   unsigned assign_locations(struct gl_shader_program *prog,
+                             uint64_t reserved_slots, bool separate_shader);
    void store_locations() const;
 
 private:
@@ -946,32 +1005,14 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
    this->matches[this->num_matches].packing_order
       = this->compute_packing_order(var);
    if (this->disable_varying_packing) {
-      const struct glsl_type *type = var->type;
       unsigned slots;
+      gl_shader_stage stage =
+         (producer_var != NULL) ? producer_stage : consumer_stage;
 
-      /* Some shader stages have 2-dimensional varyings. Use the inner type. */
-      if (!var->data.patch &&
-          ((var == producer_var && producer_stage == MESA_SHADER_TESS_CTRL) ||
-           (var == consumer_var && (consumer_stage == MESA_SHADER_TESS_CTRL ||
-                                    consumer_stage == MESA_SHADER_TESS_EVAL ||
-                                    consumer_stage == MESA_SHADER_GEOMETRY)))) {
-         assert(type->is_array());
-         type = type->fields.array;
-      }
+      const glsl_type *type = get_varying_type(var, stage);
 
-      if (type->is_array()) {
-         slots = 1;
-         while (type->is_array()) {
-            slots *= type->length;
-            type = type->fields.array;
-         }
-         slots *= type->matrix_columns;
-      } else {
-         slots = type->matrix_columns;
-      }
-      if (type->without_array()->is_dual_slot_double())
-         slots *= 2;
-      this->matches[this->num_matches].num_components = 4 * slots;
+      slots = type->count_attribute_slots(false);
+      this->matches[this->num_matches].num_components = slots * 4;
    } else {
       this->matches[this->num_matches].num_components
          = var->type->component_slots();
@@ -991,7 +1032,9 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
  * passed to varying_matches::record().
  */
 unsigned
-varying_matches::assign_locations(uint64_t reserved_slots, bool separate_shader)
+varying_matches::assign_locations(struct gl_shader_program *prog,
+                                  uint64_t reserved_slots,
+                                  bool separate_shader)
 {
    /* We disable varying sorting for separate shader programs for the
     * following reasons:
@@ -1028,10 +1071,20 @@ varying_matches::assign_locations(uint64_t reserved_slots, bool separate_shader)
    for (unsigned i = 0; i < this->num_matches; i++) {
       unsigned *location = &generic_location;
 
-      if ((this->matches[i].consumer_var &&
-           this->matches[i].consumer_var->data.patch) ||
-          (this->matches[i].producer_var &&
-           this->matches[i].producer_var->data.patch))
+      const ir_variable *var;
+      const glsl_type *type;
+      bool is_vertex_input = false;
+      if (matches[i].consumer_var) {
+         var = matches[i].consumer_var;
+         type = get_varying_type(var, consumer_stage);
+         if (consumer_stage == MESA_SHADER_VERTEX)
+            is_vertex_input = true;
+      } else {
+         var = matches[i].producer_var;
+         type = get_varying_type(var, producer_stage);
+      }
+
+      if (var->data.patch)
          location = &generic_patch_location;
 
       /* Advance to the next slot if this varying has a different packing
@@ -1043,9 +1096,45 @@ varying_matches::assign_locations(uint64_t reserved_slots, bool separate_shader)
           != this->matches[i].packing_class) {
          *location = ALIGN(*location, 4);
       }
-      while ((*location < MAX_VARYING * 4u) &&
-            (reserved_slots & (1u << *location / 4u))) {
-         *location = ALIGN(*location + 1, 4);
+
+      unsigned num_elements =  type->count_attribute_slots(is_vertex_input);
+      unsigned slot_end = this->disable_varying_packing ? 4 :
+         type->without_array()->vector_elements;
+      slot_end += *location - 1;
+
+      /* FIXME: We could be smarter in the below code and loop back over
+       * trying to fill any locations that we skipped because we couldn't pack
+       * the varying between an explicit location. For now just let the user
+       * hit the linking error if we run out of room and suggest they use
+       * explicit locations.
+       */
+      for (unsigned j = 0; j < num_elements; j++) {
+         while ((slot_end < MAX_VARYING * 4u) &&
+                ((reserved_slots & (UINT64_C(1) << *location / 4u) ||
+                 (reserved_slots & (UINT64_C(1) << slot_end / 4u))))) {
+
+            *location = ALIGN(*location + 1, 4);
+            slot_end = *location;
+
+            /* reset the counter and try again */
+            j = 0;
+         }
+
+         /* Increase the slot to make sure there is enough room for next
+          * array element.
+          */
+         if (this->disable_varying_packing)
+            slot_end += 4;
+         else
+            slot_end += type->without_array()->vector_elements;
+      }
+
+      if (!var->data.patch && *location >= MAX_VARYING * 4u) {
+         linker_error(prog, "insufficient contiguous locations available for "
+                      "%s it is possible an array or struct could not be "
+                      "packed between varyings with explicit locations. Try "
+                      "using an explicit location for arrays and structs.",
+                      var->name);
       }
 
       this->matches[i].generic_location = *location;
@@ -1429,12 +1518,20 @@ reserved_varying_slot(struct gl_shader *stage, ir_variable_mode io_mode)
    foreach_in_list(ir_instruction, node, stage->ir) {
       ir_variable *const var = node->as_variable();
 
-      if (var == NULL || var->data.mode != io_mode || !var->data.explicit_location)
+      if (var == NULL || var->data.mode != io_mode ||
+          !var->data.explicit_location ||
+          var->data.location < VARYING_SLOT_VAR0)
          continue;
 
       var_slot = var->data.location - VARYING_SLOT_VAR0;
-      if (var_slot >= 0 && var_slot < MAX_VARYING)
-         slots |= 1u << var_slot;
+
+      unsigned num_elements = get_varying_type(var, stage->Stage)
+         ->count_attribute_slots(stage->Stage == MESA_SHADER_VERTEX);
+      for (unsigned i = 0; i < num_elements; i++) {
+         if (var_slot >= 0 && var_slot < MAX_VARYING)
+            slots |= UINT64_C(1) << var_slot;
+         var_slot += 1;
+      }
    }
 
    return slots;
@@ -1620,7 +1717,7 @@ assign_varying_locations(struct gl_context *ctx,
       reserved_varying_slot(producer, ir_var_shader_out) |
       reserved_varying_slot(consumer, ir_var_shader_in);
 
-   const unsigned slots_used = matches.assign_locations(reserved_slots,
+   const unsigned slots_used = matches.assign_locations(prog, reserved_slots,
                                                         prog->SeparateShader);
    matches.store_locations();
 
@@ -1640,17 +1737,6 @@ assign_varying_locations(struct gl_context *ctx,
    hash_table_dtor(consumer_inputs);
    hash_table_dtor(consumer_interface_inputs);
 
-   if (!disable_varying_packing) {
-      if (producer) {
-         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out,
-                               0, producer);
-      }
-      if (consumer) {
-         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in,
-                               consumer_vertices, consumer);
-      }
-   }
-
    if (consumer && producer) {
       foreach_in_list(ir_instruction, node, consumer->ir) {
          ir_variable *const var = node->as_variable();
@@ -1691,13 +1777,29 @@ assign_varying_locations(struct gl_context *ctx,
 			    var->name,
                             _mesa_shader_stage_to_string(producer->Stage));
             }
-
-            /* An 'in' variable is only really a shader input if its
-             * value is written by the previous stage.
-             */
-            var->data.mode = ir_var_auto;
          }
       }
+
+      /* Now that validation is done its safe to remove unused varyings. As
+       * we have both a producer and consumer its safe to remove unused
+       * varyings even if the program is a SSO because the stages are being
+       * linked together i.e. we have a multi-stage SSO.
+       */
+      remove_unused_shader_inputs_and_outputs(false, producer,
+                                              ir_var_shader_out);
+      remove_unused_shader_inputs_and_outputs(false, consumer,
+                                              ir_var_shader_in);
+   }
+
+   if (!disable_varying_packing) {
+      if (producer) {
+         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out,
+                               0, producer);
+      }
+      if (consumer) {
+         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in,
+                               consumer_vertices, consumer);
+      }
    }
 
    return true;
diff --git a/src/glsl/link_varyings.h b/src/glsl/link_varyings.h
index 1d12978fa30..b2812614ecc 100644
--- a/src/glsl/link_varyings.h
+++ b/src/glsl/link_varyings.h
@@ -268,6 +268,11 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog,
                       const void *mem_ctx, unsigned num_names,
                       char **varying_names, tfeedback_decl *decls);
 
+void
+remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object,
+                                        gl_shader *sh,
+                                        enum ir_variable_mode mode);
+
 bool
 store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
                      unsigned num_tfeedback_decls,
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index c7e69765335..418bd09e49e 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -1014,7 +1014,7 @@ cross_validate_globals(struct gl_shader_program *prog,
             }
 
             if (var->type->contains_atomic() &&
-                var->data.atomic.offset != existing->data.atomic.offset) {
+                var->data.offset != existing->data.offset) {
                linker_error(prog, "offset specifications for %s "
                             "`%s' have differing values\n",
                             mode_string(var), var->name);
@@ -2723,30 +2723,6 @@ match_explicit_outputs_to_inputs(struct gl_shader_program *prog,
 }
 
 /**
- * Demote shader inputs and outputs that are not used in other stages
- */
-void
-demote_shader_inputs_and_outputs(gl_shader *sh, enum ir_variable_mode mode)
-{
-   foreach_in_list(ir_instruction, node, sh->ir) {
-      ir_variable *const var = node->as_variable();
-
-      if ((var == NULL) || (var->data.mode != int(mode)))
-	 continue;
-
-      /* A shader 'in' or 'out' variable is only really an input or output if
-       * its value is used by other shader stages.  This will cause the variable
-       * to have a location assigned.
-       */
-      if (var->data.is_unmatched_generic_inout) {
-         assert(var->data.mode != ir_var_temporary);
-	 var->data.mode = ir_var_auto;
-      }
-   }
-}
-
-
-/**
  * Store the gl_FragDepth layout in the gl_shader_program struct.
  */
 static void
@@ -3433,7 +3409,7 @@ add_interface_variables(struct gl_shader_program *shProg,
 }
 
 static bool
-add_packed_varyings(struct gl_shader_program *shProg, int stage)
+add_packed_varyings(struct gl_shader_program *shProg, int stage, GLenum type)
 {
    struct gl_shader *sh = shProg->_LinkedShaders[stage];
    GLenum iface;
@@ -3454,10 +3430,13 @@ add_packed_varyings(struct gl_shader_program *shProg, int stage)
          default:
             unreachable("unexpected type");
          }
-         if (!add_program_resource(shProg, iface, var,
-                                   build_stageref(shProg, var->name,
-                                                  var->data.mode)))
-            return false;
+
+         if (type == iface) {
+            if (!add_program_resource(shProg, iface, var,
+                                      build_stageref(shProg, var->name,
+                                                     var->data.mode)))
+               return false;
+         }
       }
    }
    return true;
@@ -3724,9 +3703,9 @@ build_program_resource_list(struct gl_shader_program *shProg)
 
    /* Program interface needs to expose varyings in case of SSO. */
    if (shProg->SeparateShader) {
-      if (!add_packed_varyings(shProg, input_stage))
+      if (!add_packed_varyings(shProg, input_stage, GL_PROGRAM_INPUT))
          return;
-      if (!add_packed_varyings(shProg, output_stage))
+      if (!add_packed_varyings(shProg, output_stage, GL_PROGRAM_OUTPUT))
          return;
    }
 
@@ -3942,8 +3921,10 @@ split_ubos_and_ssbos(void *mem_ctx,
                      unsigned num_blocks,
                      struct gl_uniform_block ***ubos,
                      unsigned *num_ubos,
+                     unsigned **ubo_interface_block_indices,
                      struct gl_uniform_block ***ssbos,
-                     unsigned *num_ssbos)
+                     unsigned *num_ssbos,
+                     unsigned **ssbo_interface_block_indices)
 {
    unsigned num_ubo_blocks = 0;
    unsigned num_ssbo_blocks = 0;
@@ -3961,11 +3942,25 @@ split_ubos_and_ssbos(void *mem_ctx,
    *ssbos = ralloc_array(mem_ctx, gl_uniform_block *, num_ssbo_blocks);
    *num_ssbos = 0;
 
+   if (ubo_interface_block_indices)
+      *ubo_interface_block_indices =
+         ralloc_array(mem_ctx, unsigned, num_ubo_blocks);
+
+   if (ssbo_interface_block_indices)
+      *ssbo_interface_block_indices =
+         ralloc_array(mem_ctx, unsigned, num_ssbo_blocks);
+
    for (unsigned i = 0; i < num_blocks; i++) {
       if (blocks[i].IsShaderStorage) {
-         (*ssbos)[(*num_ssbos)++] = &blocks[i];
+         (*ssbos)[*num_ssbos] = &blocks[i];
+         if (ssbo_interface_block_indices)
+            (*ssbo_interface_block_indices)[*num_ssbos] = i;
+         (*num_ssbos)++;
       } else {
-         (*ubos)[(*num_ubos)++] = &blocks[i];
+         (*ubos)[*num_ubos] = &blocks[i];
+         if (ubo_interface_block_indices)
+            (*ubo_interface_block_indices)[*num_ubos] = i;
+         (*num_ubos)++;
       }
    }
 
@@ -4443,14 +4438,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       do_dead_builtin_varyings(ctx, sh, NULL,
                                num_tfeedback_decls, tfeedback_decls);
 
-      if (!prog->SeparateShader) {
-         demote_shader_inputs_and_outputs(sh, ir_var_shader_out);
-         /* Eliminate code that is now dead due to unused outputs being
-          * demoted.
-          */
-         while (do_dead_code(sh->ir, false))
-            ;
-      }
+      remove_unused_shader_inputs_and_outputs(prog->SeparateShader, sh,
+                                              ir_var_shader_out);
    }
    else if (first == MESA_SHADER_FRAGMENT) {
       /* If the program only contains a fragment shader...
@@ -4468,12 +4457,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
                                        NULL /* tfeedback_decls */))
             goto done;
       } else {
-         demote_shader_inputs_and_outputs(sh, ir_var_shader_in);
-         /* Eliminate code that is now dead due to unused inputs being
-          * demoted.
-          */
-         while (do_dead_code(sh->ir, false))
-            ;
+         remove_unused_shader_inputs_and_outputs(false, sh,
+                                                 ir_var_shader_in);
       }
    }
 
@@ -4494,16 +4479,6 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
                 next == MESA_SHADER_FRAGMENT ? num_tfeedback_decls : 0,
                 tfeedback_decls);
 
-      demote_shader_inputs_and_outputs(sh_i, ir_var_shader_out);
-      demote_shader_inputs_and_outputs(sh_next, ir_var_shader_in);
-
-      /* Eliminate code that is now dead due to unused outputs being demoted.
-       */
-      while (do_dead_code(sh_i->ir, false))
-         ;
-      while (do_dead_code(sh_next->ir, false))
-         ;
-
       /* This must be done after all dead varyings are eliminated. */
       if (!check_against_output_limit(ctx, prog, sh_i))
          goto done;
@@ -4577,8 +4552,10 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
                               sh->NumBufferInterfaceBlocks,
                               &sh->UniformBlocks,
                               &sh->NumUniformBlocks,
+                              NULL,
                               &sh->ShaderStorageBlocks,
-                              &sh->NumShaderStorageBlocks);
+                              &sh->NumShaderStorageBlocks,
+                              NULL);
       }
    }
 
@@ -4587,8 +4564,10 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
                         prog->NumBufferInterfaceBlocks,
                         &prog->UniformBlocks,
                         &prog->NumUniformBlocks,
+                        &prog->UboInterfaceBlockIndex,
                         &prog->ShaderStorageBlocks,
-                        &prog->NumShaderStorageBlocks);
+                        &prog->NumShaderStorageBlocks,
+                        &prog->SsboInterfaceBlockIndex);
 
    /* FINISHME: Assign fragment shader output locations. */
 
diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp
index 3cc42d229a0..e9035b4e1ba 100644
--- a/src/glsl/nir/glsl_to_nir.cpp
+++ b/src/glsl/nir/glsl_to_nir.cpp
@@ -402,7 +402,7 @@ nir_visitor::visit(ir_variable *ir)
    var->data.index = ir->data.index;
    var->data.descriptor_set = 0;
    var->data.binding = ir->data.binding;
-   var->data.atomic.offset = ir->data.atomic.offset;
+   var->data.offset = ir->data.offset;
    var->data.image.read_only = ir->data.image_read_only;
    var->data.image.write_only = ir->data.image_write_only;
    var->data.image.coherent = ir->data.image_coherent;
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 61e51da5867..a2c642ec6fc 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -296,9 +296,7 @@ typedef struct {
       /**
        * Location an atomic counter is stored at.
        */
-      struct {
-         unsigned offset;
-      } atomic;
+      unsigned offset;
 
       /**
        * ARB_shader_image_load_store qualifiers.
@@ -1503,8 +1501,8 @@ typedef struct nir_function {
 } nir_function;
 
 typedef struct nir_shader_compiler_options {
-   bool lower_ffma;
    bool lower_fdiv;
+   bool lower_ffma;
    bool lower_flrp;
    bool lower_fpow;
    bool lower_fsat;
@@ -2004,12 +2002,46 @@ nir_constant *nir_constant_clone(const nir_constant *c, nir_variable *var);
 void nir_validate_shader(nir_shader *shader);
 void nir_metadata_set_validation_flag(nir_shader *shader);
 void nir_metadata_check_validation_flag(nir_shader *shader);
+
+#include "util/debug.h"
+static inline bool
+should_clone_nir(void)
+{
+   static int should_clone = -1;
+   if (should_clone < 0)
+      should_clone = env_var_as_boolean("NIR_TEST_CLONE", false);
+
+   return should_clone;
+}
 #else
 static inline void nir_validate_shader(nir_shader *shader) { (void) shader; }
 static inline void nir_metadata_set_validation_flag(nir_shader *shader) { (void) shader; }
 static inline void nir_metadata_check_validation_flag(nir_shader *shader) { (void) shader; }
+static inline bool should_clone_nir(void) { return false; }
 #endif /* DEBUG */
 
+#define _PASS(nir, do_pass) do {                                     \
+   do_pass                                                           \
+   nir_validate_shader(nir);                                         \
+   if (should_clone_nir()) {                                         \
+      nir_shader *clone = nir_shader_clone(ralloc_parent(nir), nir); \
+      ralloc_free(nir);                                              \
+      nir = clone;                                                   \
+   }                                                                 \
+} while (0)
+
+#define NIR_PASS(progress, nir, pass, ...) _PASS(nir,                \
+   nir_metadata_set_validation_flag(nir);                            \
+   if (pass(nir, ##__VA_ARGS__)) {                                   \
+      progress = true;                                               \
+      nir_metadata_check_validation_flag(nir);                       \
+   }                                                                 \
+)
+
+#define NIR_PASS_V(nir, pass, ...) _PASS(nir,                        \
+   pass(nir, ##__VA_ARGS__);                                         \
+)
+
 void nir_calc_dominance_impl(nir_function_impl *impl);
 void nir_calc_dominance(nir_shader *shader);
 
diff --git a/src/glsl/nir/nir_lower_atomics.c b/src/glsl/nir/nir_lower_atomics.c
index 259c154149b..1aa78e18a85 100644
--- a/src/glsl/nir/nir_lower_atomics.c
+++ b/src/glsl/nir/nir_lower_atomics.c
@@ -74,7 +74,7 @@ lower_instr(nir_intrinsic_instr *instr,
       state->shader_program->UniformStorage[uniform_loc].opaque[state->shader->stage].index;
 
    nir_load_const_instr *offset_const = nir_load_const_instr_create(mem_ctx, 1);
-   offset_const->value.u[0] = instr->variables[0]->var->data.atomic.offset;
+   offset_const->value.u[0] = instr->variables[0]->var->data.offset;
 
    nir_instr_insert_before(&instr->instr, &offset_const->instr);
 
diff --git a/src/glsl/nir/nir_lower_clip.c b/src/glsl/nir/nir_lower_clip.c
index f84a02410a8..0ca6a289396 100644
--- a/src/glsl/nir/nir_lower_clip.c
+++ b/src/glsl/nir/nir_lower_clip.c
@@ -72,6 +72,7 @@ store_clipdist_output(nir_builder *b, nir_variable *out, nir_ssa_def **val)
    store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
    store->num_components = 4;
    store->const_index[0] = out->data.driver_location;
+   store->const_index[1] = 0xf;   /* wrmask */
    store->src[0].ssa = nir_vec4(b, val[0], val[1], val[2], val[3]);
    store->src[0].is_ssa = true;
    store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py
index 4bc6d16cbad..d31507fe531 100644
--- a/src/glsl/nir/nir_opcodes.py
+++ b/src/glsl/nir/nir_opcodes.py
@@ -361,12 +361,12 @@ binop("udiv", tuint, "", "src0 / src1")
 # returns a boolean representing the carry resulting from the addition of
 # the two unsigned arguments.
 
-binop_convert("uadd_carry", tbool, tuint, commutative, "src0 + src1 < src0")
+binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")
 
 # returns a boolean representing the borrow resulting from the subtraction
 # of the two unsigned arguments.
 
-binop_convert("usub_borrow", tbool, tuint, "", "src0 < src1")
+binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
 
 binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
 binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")
diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py
index f4a863239b6..db18fc9619b 100644
--- a/src/glsl/nir/nir_opt_algebraic.py
+++ b/src/glsl/nir/nir_opt_algebraic.py
@@ -184,6 +184,7 @@ optimizations = [
    (('fmul', ('fexp2', a), ('fexp2', b)), ('fexp2', ('fadd', a, b))),
    # Division and reciprocal
    (('fdiv', 1.0, a), ('frcp', a)),
+   (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
    (('frcp', ('frcp', a)), a),
    (('frcp', ('fsqrt', a)), ('frsq', a)),
    (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
@@ -226,8 +227,8 @@ optimizations = [
    # Misc. lowering
    (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
    (('bitfield_insert', a, b, c, d), ('bfi', ('bfm', d, c), b, a), 'options->lower_bitfield_insert'),
-   (('uadd_carry', a, b), ('ult', ('iadd', a, b), a), 'options->lower_uadd_carry'),
-   (('usub_borrow', a, b), ('ult', a, b), 'options->lower_usub_borrow'),
+   (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
+   (('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
 ]
 
 # Add optimizations to handle the case where the result of a ternary is
diff --git a/src/glsl/nir/shader_enums.c b/src/glsl/nir/shader_enums.c
index f1dbe0158ab..1410a504484 100644
--- a/src/glsl/nir/shader_enums.c
+++ b/src/glsl/nir/shader_enums.c
@@ -47,6 +47,42 @@ const char * gl_shader_stage_name(gl_shader_stage stage)
    return NAME(stage);
 }
 
+/**
+ * Translate a gl_shader_stage to a short shader stage name for debug
+ * printouts and error messages.
+ */
+const char * _mesa_shader_stage_to_string(unsigned stage)
+{
+   switch (stage) {
+   case MESA_SHADER_VERTEX:   return "vertex";
+   case MESA_SHADER_FRAGMENT: return "fragment";
+   case MESA_SHADER_GEOMETRY: return "geometry";
+   case MESA_SHADER_COMPUTE:  return "compute";
+   case MESA_SHADER_TESS_CTRL: return "tess ctrl";
+   case MESA_SHADER_TESS_EVAL: return "tess eval";
+   }
+
+   unreachable("Unknown shader stage.");
+}
+
+/**
+ * Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS)
+ * for debug printouts and error messages.
+ */
+const char * _mesa_shader_stage_to_abbrev(unsigned stage)
+{
+   switch (stage) {
+   case MESA_SHADER_VERTEX:   return "VS";
+   case MESA_SHADER_FRAGMENT: return "FS";
+   case MESA_SHADER_GEOMETRY: return "GS";
+   case MESA_SHADER_COMPUTE:  return "CS";
+   case MESA_SHADER_TESS_CTRL: return "TCS";
+   case MESA_SHADER_TESS_EVAL: return "TES";
+   }
+
+   unreachable("Unknown shader stage.");
+}
+
 const char * gl_vert_attrib_name(gl_vert_attrib attrib)
 {
    static const char *names[] = {
diff --git a/src/glsl/nir/shader_enums.h b/src/glsl/nir/shader_enums.h
index dfab0bb9952..bc6ea3844b6 100644
--- a/src/glsl/nir/shader_enums.h
+++ b/src/glsl/nir/shader_enums.h
@@ -26,6 +26,10 @@
 #ifndef SHADER_ENUMS_H
 #define SHADER_ENUMS_H
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * Shader stages. Note that these will become 5 with tessellation.
  *
@@ -45,6 +49,18 @@ typedef enum
 
 const char * gl_shader_stage_name(gl_shader_stage stage);
 
+/**
+ * Translate a gl_shader_stage to a short shader stage name for debug
+ * printouts and error messages.
+ */
+const char * _mesa_shader_stage_to_string(unsigned stage);
+
+/**
+ * Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS)
+ * for debug printouts and error messages.
+ */
+const char * _mesa_shader_stage_to_abbrev(unsigned stage);
+
 #define MESA_SHADER_STAGES (MESA_SHADER_COMPUTE + 1)
 
 
@@ -412,7 +428,8 @@ typedef enum
     * \name Fragment shader system values
     */
    /*@{*/
-   SYSTEM_VALUE_FRONT_FACE,     /**< (not done yet) */
+   SYSTEM_VALUE_FRAG_COORD,
+   SYSTEM_VALUE_FRONT_FACE,
    SYSTEM_VALUE_SAMPLE_ID,
    SYSTEM_VALUE_SAMPLE_POS,
    SYSTEM_VALUE_SAMPLE_MASK_IN,
@@ -520,4 +537,8 @@ enum gl_frag_depth_layout
    FRAG_DEPTH_LAYOUT_UNCHANGED
 };
 
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
 #endif /* SHADER_ENUMS_H */
diff --git a/src/glsl/standalone_scaffolding.cpp b/src/glsl/standalone_scaffolding.cpp
index 5ce804eed20..0f7a16a5e6f 100644
--- a/src/glsl/standalone_scaffolding.cpp
+++ b/src/glsl/standalone_scaffolding.cpp
@@ -130,6 +130,11 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
       shProg->InterfaceBlockStageIndex[i] = NULL;
    }
 
+   ralloc_free(shProg->UboInterfaceBlockIndex);
+   shProg->UboInterfaceBlockIndex = NULL;
+   ralloc_free(shProg->SsboInterfaceBlockIndex);
+   shProg->SsboInterfaceBlockIndex = NULL;
+
    ralloc_free(shProg->AtomicBuffers);
    shProg->AtomicBuffers = NULL;
    shProg->NumAtomicBuffers = 0;
diff --git a/src/mapi/glapi/gen/ARB_indirect_parameters.xml b/src/mapi/glapi/gen/ARB_indirect_parameters.xml
new file mode 100644
index 00000000000..20de9057707
--- /dev/null
+++ b/src/mapi/glapi/gen/ARB_indirect_parameters.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+<!DOCTYPE OpenGLAPI SYSTEM "gl_API.dtd">
+
+<OpenGLAPI>
+
+<category name="GL_ARB_draw_indirect" number="154">
+
+    <enum name="PARAMETER_BUFFER_ARB"                   value="0x80EE"/>
+    <enum name="PARAMETER_BUFFER_BINDING_ARB"           value="0x80EF"/>
+
+    <function name="MultiDrawArraysIndirectCountARB" exec="dynamic">
+        <param name="mode" type="GLenum"/>
+        <param name="indirect" type="GLintptr"/>
+        <param name="drawcount" type="GLintptr"/>
+        <param name="maxdrawcount" type="GLsizei"/>
+        <param name="stride" type="GLsizei"/>
+    </function>
+
+    <function name="MultiDrawElementsIndirectCountARB" exec="dynamic">
+        <param name="mode" type="GLenum"/>
+        <param name="type" type="GLenum"/>
+        <param name="indirect" type="GLintptr"/>
+        <param name="drawcount" type="GLintptr"/>
+        <param name="maxdrawcount" type="GLsizei"/>
+        <param name="stride" type="GLsizei"/>
+    </function>
+
+</category>
+
+</OpenGLAPI>
diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am
index 2da8f7ddd9d..900b61a5d45 100644
--- a/src/mapi/glapi/gen/Makefile.am
+++ b/src/mapi/glapi/gen/Makefile.am
@@ -137,6 +137,7 @@ API_XML = \
 	ARB_get_texture_sub_image.xml \
 	ARB_gpu_shader_fp64.xml \
 	ARB_gpu_shader5.xml \
+	ARB_indirect_parameters.xml \
 	ARB_instanced_arrays.xml \
 	ARB_internalformat_query.xml \
 	ARB_invalidate_subdata.xml \
diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml
index 21f6293bb6c..593ace49563 100644
--- a/src/mapi/glapi/gen/gl_API.xml
+++ b/src/mapi/glapi/gen/gl_API.xml
@@ -8247,7 +8247,11 @@
 
 <xi:include href="ARB_multi_bind.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
-<!-- ARB extensions 148 - 159 -->
+<!-- ARB extensions 148 - 153 -->
+
+<xi:include href="ARB_indirect_parameters.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
+
+<!-- ARB extensions 155 - 159 -->
 
 <xi:include href="ARB_clip_control.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
diff --git a/src/mesa/drivers/dri/common/drirc b/src/mesa/drivers/dri/common/drirc
index bb840eaba94..e1874c3f1dc 100644
--- a/src/mesa/drivers/dri/common/drirc
+++ b/src/mesa/drivers/dri/common/drirc
@@ -37,18 +37,26 @@ TODO: document the other workarounds.
 
         <application name="Unigine Heaven (32-bit)" executable="heaven_x86">
             <option name="allow_glsl_extension_directive_midshader" value="true" />
+            <!-- remove disable_blend_func_extended if 4.1 ever comes out -->
+            <option name="disable_blend_func_extended" value="true" />
 	</application>
 
         <application name="Unigine Heaven (64-bit)" executable="heaven_x64">
             <option name="allow_glsl_extension_directive_midshader" value="true" />
+            <!-- remove disable_blend_func_extended if 4.1 ever comes out -->
+            <option name="disable_blend_func_extended" value="true" />
 	</application>
 
         <application name="Unigine Valley (32-bit)" executable="valley_x86">
             <option name="allow_glsl_extension_directive_midshader" value="true" />
+            <!-- remove disable_blend_func_extended if 1.1 ever comes out -->
+            <option name="disable_blend_func_extended" value="true" />
 	</application>
 
         <application name="Unigine Valley (64-bit)" executable="valley_x64">
             <option name="allow_glsl_extension_directive_midshader" value="true" />
+            <!-- remove disable_blend_func_extended if 1.1 ever comes out -->
+            <option name="disable_blend_func_extended" value="true" />
 	</application>
 
         <application name="Unigine OilRush (32-bit)" executable="OilRush_x86">
diff --git a/src/mesa/drivers/dri/i915/intel_buffer_objects.c b/src/mesa/drivers/dri/i915/intel_buffer_objects.c
index ef06743ed49..e6760964909 100644
--- a/src/mesa/drivers/dri/i915/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/i915/intel_buffer_objects.c
@@ -99,7 +99,7 @@ intel_bufferobj_free(struct gl_context * ctx, struct gl_buffer_object *obj)
    _mesa_align_free(intel_obj->sys_buffer);
 
    drm_intel_bo_unreference(intel_obj->buffer);
-   free(intel_obj);
+   _mesa_delete_buffer_object(ctx, obj);
 }
 
 
diff --git a/src/mesa/drivers/dri/i915/intel_pixel_copy.c b/src/mesa/drivers/dri/i915/intel_pixel_copy.c
index a7185564e47..213cdbd0f53 100644
--- a/src/mesa/drivers/dri/i915/intel_pixel_copy.c
+++ b/src/mesa/drivers/dri/i915/intel_pixel_copy.c
@@ -138,7 +138,7 @@ do_blit_copypixels(struct gl_context * ctx,
    }
 
    if (ctx->Pixel.ZoomX != 1.0F || ctx->Pixel.ZoomY != 1.0F) {
-      perf_debug("glCopyPixles(): Unsupported pixel zoom\n");
+      perf_debug("glCopyPixels(): Unsupported pixel zoom\n");
       return false;
    }
 
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 3d5150d2f53..86777430a2e 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -2,7 +2,6 @@ i965_compiler_FILES = \
 	brw_cfg.cpp \
 	brw_cfg.h \
 	brw_compiler.h \
-	brw_cubemap_normalize.cpp \
 	brw_dead_control_flow.cpp \
 	brw_dead_control_flow.h \
 	brw_defines.h \
@@ -16,7 +15,6 @@ i965_compiler_FILES = \
 	brw_eu_util.c \
 	brw_eu_validate.c \
 	brw_fs_builder.h \
-	brw_fs_channel_expressions.cpp \
 	brw_fs_cmod_propagation.cpp \
 	brw_fs_combine_constants.cpp \
 	brw_fs_copy_propagation.cpp \
@@ -35,15 +33,12 @@ i965_compiler_FILES = \
 	brw_fs_surface_builder.cpp \
 	brw_fs_surface_builder.h \
 	brw_fs_validate.cpp \
-	brw_fs_vector_splitting.cpp \
 	brw_fs_visitor.cpp \
 	brw_inst.h \
 	brw_interpolation_map.c \
 	brw_ir_allocator.h \
 	brw_ir_fs.h \
 	brw_ir_vec4.h \
-	brw_lower_texture_gradients.cpp \
-	brw_lower_unnormalized_offset.cpp \
 	brw_nir.h \
 	brw_nir.c \
 	brw_nir_analyze_boolean_resolves.c \
@@ -115,6 +110,7 @@ i965_FILES = \
 	brw_context.h \
 	brw_cs.c \
 	brw_cs.h \
+	brw_cubemap_normalize.cpp \
 	brw_curbe.c \
 	brw_draw.c \
 	brw_draw.h \
@@ -122,11 +118,15 @@ i965_FILES = \
 	brw_ff_gs.c \
 	brw_ff_gs_emit.c \
 	brw_ff_gs.h \
+	brw_fs_channel_expressions.cpp \
+	brw_fs_vector_splitting.cpp \
 	brw_gs.c \
 	brw_gs.h \
 	brw_gs_state.c \
 	brw_gs_surface_state.c \
 	brw_link.cpp \
+	brw_lower_texture_gradients.cpp \
+	brw_lower_unnormalized_offset.cpp \
 	brw_meta_fast_clear.c \
 	brw_meta_stencil_blit.c \
 	brw_meta_updownsample.c \
diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c
index db9acd64f4e..4eeca5cab95 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -402,6 +402,66 @@ static const struct brw_device_info brw_device_info_bxt = {
    }
 };
 
+/*
+ * Note: for all KBL SKUs, the PRM says SKL for GS entries, not SKL+.
+ * There's no KBL entry. Using the default SKL (GEN9) GS entries value.
+ */
+
+/*
+ * Both SKL and KBL support a maximum of 64 threads per
+ * Pixel Shader Dispatch (PSD) unit.
+ */
+#define  KBL_MAX_THREADS_PER_PSD 64
+
+static const struct brw_device_info brw_device_info_kbl_gt1 = {
+   GEN9_FEATURES,
+   .gt = 1,
+
+   .max_cs_threads = 7 * 6,
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 2,
+   .urb.size = 192,
+};
+
+static const struct brw_device_info brw_device_info_kbl_gt1_5 = {
+   GEN9_FEATURES,
+   .gt = 1,
+
+   .max_cs_threads = 7 * 6,
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3,
+};
+
+static const struct brw_device_info brw_device_info_kbl_gt2 = {
+   GEN9_FEATURES,
+   .gt = 2,
+
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3,
+};
+
+static const struct brw_device_info brw_device_info_kbl_gt3 = {
+   GEN9_FEATURES,
+   .gt = 3,
+
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 6,
+};
+
+static const struct brw_device_info brw_device_info_kbl_gt4 = {
+   GEN9_FEATURES,
+   .gt = 4,
+
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 9,
+   /*
+    * From the "L3 Allocation and Programming" documentation:
+    *
+    * "URB is limited to 1008KB due to programming restrictions.  This
+    *  is not a restriction of the L3 implementation, but of the FF and
+    *  other clients.  Therefore, in a GT4 implementation it is
+    *  possible for the programmed allocation of the L3 data array to
+    *  provide 3*384KB=1152KB for URB, but only 1008KB of this
+    *  will be used."
+    */
+   .urb.size = 1008 / 3,
+};
+
 const struct brw_device_info *
 brw_get_device_info(int devid)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 5fb96626649..35d8039ed13 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -847,12 +847,11 @@ brw_alu2(struct brw_codegen *p, unsigned opcode,
 static int
 get_3src_subreg_nr(struct brw_reg reg)
 {
-   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
-      assert(brw_is_single_value_swizzle(reg.swizzle));
-      return reg.subnr / 4 + BRW_GET_SWZ(reg.swizzle, 0);
-   } else {
-      return reg.subnr / 4;
-   }
+   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
+    * use 32-bit units (components 0..7).  Since they only support F/D/UD
+    * types, this doesn't lose any flexibility, but uses fewer bits.
+    */
+   return reg.subnr / 4;
 }
 
 static brw_inst *
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 7675e9d299a..489461c6d95 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3455,8 +3455,7 @@ fs_visitor::lower_integer_multiplication()
              */
             assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
                    mul->src[1].type == BRW_REGISTER_TYPE_UD);
-            mul->src[1].type = (type_is_signed(mul->src[1].type) ?
-                                BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW);
+            mul->src[1].type = BRW_REGISTER_TYPE_UW;
             mul->src[1].stride *= 2;
 
          } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index c7bbae325a1..62133784dcf 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1849,12 +1849,33 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
 
       fs_inst *inst;
       if (indirect_offset.file == BAD_FILE) {
-         /* Replicate the patch handle to all enabled channels */
-         fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
-         bld.MOV(patch_handle, retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
-
-         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, patch_handle);
-         inst->mlen = 1;
+         /* Arbitrarily only push up to 32 vec4 slots worth of data,
+          * which is 16 registers (since each holds 2 vec4 slots).
+          */
+         const unsigned max_push_slots = 32;
+         if (imm_offset < max_push_slots) {
+            fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
+            for (int i = 0; i < instr->num_components; i++) {
+               bld.MOV(offset(dest, bld, i),
+                       component(src, 4 * (imm_offset % 2) + i));
+            }
+            tes_prog_data->base.urb_read_length =
+               MAX2(tes_prog_data->base.urb_read_length,
+                    DIV_ROUND_UP(imm_offset + 1, 2));
+         } else {
+            /* Replicate the patch handle to all enabled channels */
+            const fs_reg srcs[] = {
+               retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
+            };
+            fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+            bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
+
+            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, patch_handle);
+            inst->mlen = 1;
+            inst->offset = imm_offset;
+            inst->base_mrf = -1;
+            inst->regs_written = instr->num_components;
+         }
       } else {
          /* Indirect indexing - use per-slot offsets as well. */
          const fs_reg srcs[] = {
@@ -1866,10 +1887,10 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
 
          inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest, payload);
          inst->mlen = 2;
+         inst->offset = imm_offset;
+         inst->base_mrf = -1;
+         inst->regs_written = instr->num_components;
       }
-      inst->offset = imm_offset;
-      inst->base_mrf = -1;
-      inst->regs_written = instr->num_components;
       break;
    }
    default:
diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp
index 7cdc830f6b8..766c57ff60a 100644
--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -27,7 +27,6 @@
 #include "brw_nir.h"
 #include "brw_program.h"
 #include "glsl/ir_optimization.h"
-#include "glsl/glsl_parser_extras.h"
 #include "program/program.h"
 #include "main/shaderapi.h"
 #include "main/uniforms.h"
diff --git a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
index d571ecd4394..c83b2728b98 100644
--- a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
+++ b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
@@ -279,7 +279,7 @@ lower_texture_grad_visitor::visit_leave(ir_texture *ir)
 
       /* 2. quotient rule */
       ir_variable *recip = temp(mem_ctx, glsl_type::float_type, "recip");
-      EMIT(assign(recip, div(new(mem_ctx) ir_constant(1.0f), swizzle_z(Q))));
+      EMIT(assign(recip, expr(ir_unop_rcp, swizzle_z(Q))));
 
       ir_variable *dx = temp(mem_ctx, glsl_type::vec2_type, "dx");
       ir_variable *dy = temp(mem_ctx, glsl_type::vec2_type, "dy");
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index e031173036a..f8b258bf96c 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -405,42 +405,15 @@ brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
    }
 }
 
-#include "util/debug.h"
-
-static bool
-should_clone_nir()
-{
-   static int should_clone = -1;
-   if (should_clone < 1)
-      should_clone = env_var_as_boolean("NIR_TEST_CLONE", false);
-
-   return should_clone;
-}
-
-#define _OPT(do_pass) (({                                            \
-   bool this_progress = true;                                        \
-   do_pass                                                           \
-   nir_validate_shader(nir);                                         \
-   if (should_clone_nir()) {                                         \
-      nir_shader *clone = nir_shader_clone(ralloc_parent(nir), nir); \
-      ralloc_free(nir);                                              \
-      nir = clone;                                                   \
-   }                                                                 \
-   this_progress;                                                    \
-}))
-
-#define OPT(pass, ...) _OPT(                   \
-   nir_metadata_set_validation_flag(nir);      \
-   this_progress = pass(nir ,##__VA_ARGS__);   \
-   if (this_progress) {                        \
-      progress = true;                         \
-      nir_metadata_check_validation_flag(nir); \
-   }                                           \
-)
-
-#define OPT_V(pass, ...) _OPT( \
-   pass(nir, ##__VA_ARGS__);   \
-)
+#define OPT(pass, ...) ({                                  \
+   bool this_progress = false;                             \
+   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
+   if (this_progress)                                      \
+      progress = true;                                     \
+   this_progress;                                          \
+})
+
+#define OPT_V(pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
 
 static nir_shader *
 nir_optimize(nir_shader *nir, bool is_scalar)
diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h
index 9f2ff9ae5ad..a2a4a40f373 100644
--- a/src/mesa/drivers/dri/i965/brw_reg.h
+++ b/src/mesa/drivers/dri/i965/brw_reg.h
@@ -287,33 +287,6 @@ type_sz(unsigned type)
    }
 }
 
-static inline bool
-type_is_signed(unsigned type)
-{
-   switch(type) {
-   case BRW_REGISTER_TYPE_D:
-   case BRW_REGISTER_TYPE_W:
-   case BRW_REGISTER_TYPE_F:
-   case BRW_REGISTER_TYPE_B:
-   case BRW_REGISTER_TYPE_V:
-   case BRW_REGISTER_TYPE_VF:
-   case BRW_REGISTER_TYPE_DF:
-   case BRW_REGISTER_TYPE_HF:
-   case BRW_REGISTER_TYPE_Q:
-      return true;
-
-   case BRW_REGISTER_TYPE_UD:
-   case BRW_REGISTER_TYPE_UW:
-   case BRW_REGISTER_TYPE_UB:
-   case BRW_REGISTER_TYPE_UV:
-   case BRW_REGISTER_TYPE_UQ:
-      return false;
-
-   default:
-      unreachable("not reached");
-   }
-}
-
 /**
  * Construct a brw_reg.
  * \param file      one of the BRW_x_REGISTER_FILE values
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 4ae403c2baa..9c7f4a7619f 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -27,7 +27,6 @@
 #include "brw_fs.h"
 #include "brw_nir.h"
 #include "brw_vec4_tes.h"
-#include "glsl/glsl_parser_extras.h"
 #include "main/shaderobj.h"
 #include "main/uniforms.h"
 #include "util/debug.h"
@@ -97,6 +96,7 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
    nir_shader_compiler_options *nir_options =
       rzalloc(compiler, nir_shader_compiler_options);
    nir_options->native_integers = true;
+   nir_options->lower_fdiv = true;
    /* In order to help allow for better CSE at the NIR level we tell NIR
     * to split all ffma instructions during opt_algebraic and we then
     * re-combine them as a later step.
diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c
index 2c925e7f572..7e414260284 100644
--- a/src/mesa/drivers/dri/i965/brw_tcs.c
+++ b/src/mesa/drivers/dri/i965/brw_tcs.c
@@ -307,7 +307,9 @@ brw_tcs_precompile(struct gl_context *ctx,
    /* Guess that the input and output patches have the same dimensionality. */
    key.input_vertices = shader_prog->TessCtrl.VerticesOut;
 
-   key.tes_primitive_mode = GL_TRIANGLES;
+   key.tes_primitive_mode =
+      shader_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL] ?
+      shader_prog->TessEval.PrimitiveMode : GL_TRIANGLES;
 
    key.outputs_written = prog->OutputsWritten;
    key.patch_outputs_written = prog->PatchOutputsWritten;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index dd223985d1c..c6a52c5d183 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1784,9 +1784,22 @@ vec4_visitor::convert_to_hw_regs()
          case ATTR:
             unreachable("not reached");
          }
+
          src = reg;
       }
 
+      if (inst->is_3src()) {
+         /* 3-src instructions with scalar sources support arbitrary subnr,
+          * but don't actually use swizzles.  Convert swizzle into subnr.
+          */
+         for (int i = 0; i < 3; i++) {
+            if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0) {
+               assert(brw_is_single_value_swizzle(inst->src[i].swizzle));
+               inst->src[i].subnr += 4 * BRW_GET_SWZ(inst->src[i].swizzle, 0);
+            }
+         }
+      }
+
       dst_reg &dst = inst->dst;
       struct brw_reg reg;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index c6fa837dc74..b3236672ef2 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -20,7 +20,6 @@
  * IN THE SOFTWARE.
  */
 
-#include "glsl/glsl_parser_extras.h"
 #include "brw_vec4.h"
 #include "brw_cfg.h"
 #include "brw_eu.h"
diff --git a/src/mesa/drivers/dri/i965/brw_vue_map.c b/src/mesa/drivers/dri/i965/brw_vue_map.c
index 09eadbcb54f..fea24368e8c 100644
--- a/src/mesa/drivers/dri/i965/brw_vue_map.c
+++ b/src/mesa/drivers/dri/i965/brw_vue_map.c
@@ -257,6 +257,7 @@ varying_name(brw_varying_slot slot)
       [BRW_VARYING_SLOT_PNTC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PNTC",
    };
 
+   assert(slot < BRW_VARYING_SLOT_COUNT);
    return brw_names[slot - VARYING_SLOT_MAX];
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 76dc5775121..5ab2f7f09df 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -946,12 +946,15 @@ brw_upload_ubo_surfaces(struct brw_context *brw,
       } else {
          struct intel_buffer_object *intel_bo =
             intel_buffer_object(binding->BufferObject);
+         GLsizeiptr size = binding->BufferObject->Size - binding->Offset;
+         if (!binding->AutomaticSize)
+            size = MIN2(size, binding->Size);
          drm_intel_bo *bo =
             intel_bufferobj_buffer(brw, intel_bo,
                                    binding->Offset,
-                                   binding->BufferObject->Size - binding->Offset);
+                                   size);
          brw_create_constant_surface(brw, bo, binding->Offset,
-                                     binding->BufferObject->Size - binding->Offset,
+                                     size,
                                      &ubo_surf_offsets[i]);
       }
    }
@@ -968,12 +971,15 @@ brw_upload_ubo_surfaces(struct brw_context *brw,
       } else {
          struct intel_buffer_object *intel_bo =
             intel_buffer_object(binding->BufferObject);
+         GLsizeiptr size = binding->BufferObject->Size - binding->Offset;
+         if (!binding->AutomaticSize)
+            size = MIN2(size, binding->Size);
          drm_intel_bo *bo =
             intel_bufferobj_buffer(brw, intel_bo,
                                    binding->Offset,
-                                   binding->BufferObject->Size - binding->Offset);
+                                   size);
          brw_create_buffer_surface(brw, bo, binding->Offset,
-                                   binding->BufferObject->Size - binding->Offset,
+                                   size,
                                    &ssbo_surf_offsets[i]);
       }
    }
diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index da3b4cd90e8..4bc0a8598d6 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -32,7 +32,6 @@
 #include "program/prog_parameter.h"
 #include "program/prog_statevars.h"
 #include "intel_batchbuffer.h"
-#include "glsl/glsl_parser_extras.h"
 
 /**
  * Creates a streamed BO containing the push constants for the VS or GS on
diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index bd204aa3ce8..6d29fbdde21 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -406,11 +406,6 @@ can_fast_copy_blit(struct brw_context *brw,
    if (brw->gen < 9)
       return false;
 
-   if (src_buffer->handle == dst_buffer->handle &&
-       _mesa_regions_overlap(src_x, src_y, src_x + w, src_y + h,
-                             dst_x, dst_y, dst_x + w, dst_y + h))
-      return false;
-
    /* Enable fast copy blit only if the surfaces are Yf/Ys tiled.
     * FIXME: Based on performance data, remove this condition later to
     * enable for all types of surfaces.
@@ -427,8 +422,10 @@ can_fast_copy_blit(struct brw_context *brw,
    if ((dst_offset | src_offset) & 63)
       return false;
 
-   /* Color depth greater than 128 bits not supported. */
-   if (cpp > 16)
+   /* Color depths which are not power of 2 or greater than 128 bits are
+    * not supported.
+    */
+   if (!_mesa_is_pow_two(cpp) || cpp > 16)
       return false;
 
    /* For Fast Copy Blits the pitch cannot be a negative number. So, bit 15
@@ -567,9 +564,10 @@ intelEmitCopyBlit(struct brw_context *brw,
                                            dst_offset, dst_pitch,
                                            dst_tiling, dst_tr_mode,
                                            w, h, cpp);
-   assert(use_fast_copy_blit ||
-          (src_tr_mode == INTEL_MIPTREE_TRMODE_NONE &&
-           dst_tr_mode == INTEL_MIPTREE_TRMODE_NONE));
+   if (!use_fast_copy_blit &&
+       (src_tr_mode != INTEL_MIPTREE_TRMODE_NONE ||
+        dst_tr_mode != INTEL_MIPTREE_TRMODE_NONE))
+      return false;
 
    if (use_fast_copy_blit) {
       /* When two sequential fast copy blits have different source surfaces,
diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.c b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
index 7a5b3fca595..56da2da08a8 100644
--- a/src/mesa/drivers/dri/i965/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
@@ -167,7 +167,7 @@ brw_delete_buffer(struct gl_context * ctx, struct gl_buffer_object *obj)
    _mesa_buffer_unmap_all_mappings(ctx, obj);
 
    drm_intel_bo_unreference(intel_obj->buffer);
-   free(intel_obj);
+   _mesa_delete_buffer_object(ctx, obj);
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index e1338e92e15..889f7cbb5c1 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -346,6 +346,9 @@ intelInitExtensions(struct gl_context *ctx)
          ctx->Extensions.ARB_transform_feedback3 = true;
          ctx->Extensions.ARB_transform_feedback_instanced = true;
 
+         if (ctx->Const.MaxComputeWorkGroupSize[0] >= 1024)
+            ctx->Extensions.ARB_compute_shader = true;
+
          if (brw->intelScreen->cmd_parser_version >= 2)
             brw->predicate.supported = true;
       }
@@ -357,8 +360,6 @@ intelInitExtensions(struct gl_context *ctx)
          ctx->Extensions.ARB_viewport_array = true;
          ctx->Extensions.AMD_vertex_shader_viewport_index = true;
          ctx->Extensions.ARB_shader_subroutine = true;
-         if (ctx->Const.MaxComputeWorkGroupSize[0] >= 1024)
-            ctx->Extensions.ARB_compute_shader = true;
       }
    }
 
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 88c0a19bed6..108dd87dd8b 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -2697,13 +2697,17 @@ use_intel_mipree_map_blit(struct brw_context *brw,
 {
    if (brw->has_llc &&
       /* It's probably not worth swapping to the blit ring because of
-       * all the overhead involved.
+       * all the overhead involved. But, we must use blitter for the
+       * surfaces with INTEL_MIPTREE_TRMODE_{YF,YS}.
        */
-       !(mode & GL_MAP_WRITE_BIT) &&
+       (!(mode & GL_MAP_WRITE_BIT) ||
+        mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE) &&
        !mt->compressed &&
        (mt->tiling == I915_TILING_X ||
         /* Prior to Sandybridge, the blitter can't handle Y tiling */
-        (brw->gen >= 6 && mt->tiling == I915_TILING_Y)) &&
+        (brw->gen >= 6 && mt->tiling == I915_TILING_Y) ||
+        /* Fast copy blit on skl+ supports all tiling formats. */
+        brw->gen >= 9) &&
        can_blit_slice(mt, level, slice))
       return true;
 
@@ -2772,6 +2776,8 @@ intel_miptree_map(struct brw_context *brw,
       intel_miptree_map_movntdqa(brw, mt, map, level, slice);
 #endif
    } else {
+      /* intel_miptree_map_gtt() doesn't support surfaces with Yf/Ys tiling. */
+      assert(mt->tr_mode == INTEL_MIPTREE_TRMODE_NONE);
       intel_miptree_map_gtt(brw, mt, map, level, slice);
    }
 
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_copy.c b/src/mesa/drivers/dri/i965/intel_pixel_copy.c
index 3b5bdb8f928..05c35bd61b3 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_copy.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_copy.c
@@ -142,7 +142,7 @@ do_blit_copypixels(struct gl_context * ctx,
    }
 
    if (ctx->Pixel.ZoomX != 1.0F || ctx->Pixel.ZoomY != 1.0F) {
-      perf_debug("glCopyPixles(): Unsupported pixel zoom\n");
+      perf_debug("glCopyPixels(): Unsupported pixel zoom\n");
       return false;
    }
 
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_driver.c b/src/mesa/drivers/dri/nouveau/nouveau_driver.c
index 7f31b2851e4..998e751fc3c 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_driver.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_driver.c
@@ -35,7 +35,7 @@
 
 #include "drivers/common/meta.h"
 
-const char const *nouveau_vendor_string = "Nouveau";
+const char * const nouveau_vendor_string = "Nouveau";
 
 const char *
 nouveau_get_renderer_string(unsigned chipset)
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_driver.h b/src/mesa/drivers/dri/nouveau/nouveau_driver.h
index a4273a554bd..237e9563246 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_driver.h
+++ b/src/mesa/drivers/dri/nouveau/nouveau_driver.h
@@ -69,7 +69,7 @@ struct nouveau_driver {
 #define nouveau_error(format, ...) \
 	fprintf(stderr, "%s: " format, __func__, ## __VA_ARGS__)
 
-extern const char const *nouveau_vendor_string;
+extern const char * const nouveau_vendor_string;
 
 const char *
 nouveau_get_renderer_string(unsigned chipset);
diff --git a/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c b/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c
index d9d4f5ffc5e..2b76305dd45 100644
--- a/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c
+++ b/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c
@@ -71,7 +71,7 @@ radeonDeleteBufferObject(struct gl_context * ctx,
         radeon_bo_unref(radeon_obj->bo);
     }
 
-    free(radeon_obj);
+    _mesa_delete_buffer_object(ctx, obj);
 }
 
 
diff --git a/src/mesa/main/api_arrayelt.c b/src/mesa/main/api_arrayelt.c
index 92d8238f431..c84db5f97f6 100644
--- a/src/mesa/main/api_arrayelt.c
+++ b/src/mesa/main/api_arrayelt.c
@@ -65,7 +65,7 @@ typedef struct {
 typedef struct {
    AEarray arrays[32];
    AEattrib attribs[VERT_ATTRIB_MAX + 1];
-   GLuint NewState;
+   GLbitfield NewState;
 
    /* List of VBOs we need to map before executing ArrayElements */
    struct gl_buffer_object *vbo[VERT_ATTRIB_MAX];
@@ -1802,7 +1802,7 @@ _ae_ArrayElement(GLint elt)
 
 
 void
-_ae_invalidate_state(struct gl_context *ctx, GLuint new_state)
+_ae_invalidate_state(struct gl_context *ctx, GLbitfield new_state)
 {
    AEcontext *actx = AE_CONTEXT(ctx);
 
diff --git a/src/mesa/main/api_arrayelt.h b/src/mesa/main/api_arrayelt.h
index 39fdeb9d2bd..03cd9ecbd51 100644
--- a/src/mesa/main/api_arrayelt.h
+++ b/src/mesa/main/api_arrayelt.h
@@ -33,7 +33,7 @@
 
 extern GLboolean _ae_create_context( struct gl_context *ctx );
 extern void _ae_destroy_context( struct gl_context *ctx );
-extern void _ae_invalidate_state( struct gl_context *ctx, GLuint new_state );
+extern void _ae_invalidate_state( struct gl_context *ctx, GLbitfield new_state );
 extern void GLAPIENTRY _ae_ArrayElement( GLint elt );
 
 /* May optionally be called before a batch of element calls:
diff --git a/src/mesa/main/api_loopback.c b/src/mesa/main/api_loopback.c
index a7fd82c531f..8b63d9c0e95 100644
--- a/src/mesa/main/api_loopback.c
+++ b/src/mesa/main/api_loopback.c
@@ -629,7 +629,10 @@ _mesa_Vertex2sv( const GLshort *v )
 void GLAPIENTRY
 _mesa_Vertex3dv( const GLdouble *v )
 {
-   VERTEX3( (GLfloat) v[0], (GLfloat) v[1], (GLfloat) v[2] );
+   if (v[2] == 0.0)
+      VERTEX2( (GLfloat) v[0], (GLfloat) v[1] );
+   else
+      VERTEX3( (GLfloat) v[0], (GLfloat) v[1], (GLfloat) v[2] );
 }
 
 void GLAPIENTRY
diff --git a/src/mesa/main/api_validate.c b/src/mesa/main/api_validate.c
index d693ec64ce4..2b629977644 100644
--- a/src/mesa/main/api_validate.c
+++ b/src/mesa/main/api_validate.c
@@ -920,6 +920,121 @@ _mesa_validate_MultiDrawElementsIndirect(struct gl_context *ctx,
    return GL_TRUE;
 }
 
+static GLboolean
+valid_draw_indirect_parameters(struct gl_context *ctx,
+                               const char *name,
+                               GLintptr drawcount)
+{
+   /* From the ARB_indirect_parameters specification:
+    * "INVALID_VALUE is generated by MultiDrawArraysIndirectCountARB or
+    *  MultiDrawElementsIndirectCountARB if <drawcount> is not a multiple of
+    *  four."
+    */
+   if (drawcount & 3) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(drawcount is not a multiple of 4)", name);
+      return GL_FALSE;
+   }
+
+   /* From the ARB_indirect_parameters specification:
+    * "INVALID_OPERATION is generated by MultiDrawArraysIndirectCountARB or
+    *  MultiDrawElementsIndirectCountARB if no buffer is bound to the
+    *  PARAMETER_BUFFER_ARB binding point."
+    */
+   if (!_mesa_is_bufferobj(ctx->ParameterBuffer)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s: no buffer bound to PARAMETER_BUFFER", name);
+      return GL_FALSE;
+   }
+
+   if (_mesa_check_disallowed_mapping(ctx->ParameterBuffer)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(PARAMETER_BUFFER is mapped)", name);
+      return GL_FALSE;
+   }
+
+   /* From the ARB_indirect_parameters specification:
+    * "INVALID_OPERATION is generated by MultiDrawArraysIndirectCountARB or
+    *  MultiDrawElementsIndirectCountARB if reading a <sizei> typed value
+    *  from the buffer bound to the PARAMETER_BUFFER_ARB target at the offset
+    *  specified by <drawcount> would result in an out-of-bounds access."
+    */
+   if (ctx->ParameterBuffer->Size < drawcount + sizeof(GLsizei)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(PARAMETER_BUFFER too small)", name);
+      return GL_FALSE;
+   }
+
+   return GL_TRUE;
+}
+
+GLboolean
+_mesa_validate_MultiDrawArraysIndirectCount(struct gl_context *ctx,
+                                            GLenum mode,
+                                            GLintptr indirect,
+                                            GLintptr drawcount,
+                                            GLsizei maxdrawcount,
+                                            GLsizei stride)
+{
+   GLsizeiptr size = 0;
+   const unsigned drawArraysNumParams = 4;
+
+   FLUSH_CURRENT(ctx, 0);
+
+   /* caller has converted stride==0 to drawArraysNumParams * sizeof(GLuint) */
+   assert(stride != 0);
+
+   if (!valid_draw_indirect_multi(ctx, maxdrawcount, stride,
+                                  "glMultiDrawArraysIndirectCountARB"))
+      return GL_FALSE;
+
+   /* number of bytes of the indirect buffer which will be read */
+   size = maxdrawcount
+      ? (maxdrawcount - 1) * stride + drawArraysNumParams * sizeof(GLuint)
+      : 0;
+
+   if (!valid_draw_indirect(ctx, mode, (void *)indirect, size,
+                            "glMultiDrawArraysIndirectCountARB"))
+      return GL_FALSE;
+
+   return valid_draw_indirect_parameters(
+         ctx, "glMultiDrawArraysIndirectCountARB", drawcount);
+}
+
+GLboolean
+_mesa_validate_MultiDrawElementsIndirectCount(struct gl_context *ctx,
+                                              GLenum mode, GLenum type,
+                                              GLintptr indirect,
+                                              GLintptr drawcount,
+                                              GLsizei maxdrawcount,
+                                              GLsizei stride)
+{
+   GLsizeiptr size = 0;
+   const unsigned drawElementsNumParams = 5;
+
+   FLUSH_CURRENT(ctx, 0);
+
+   /* caller has converted stride==0 to drawElementsNumParams * sizeof(GLuint) */
+   assert(stride != 0);
+
+   if (!valid_draw_indirect_multi(ctx, maxdrawcount, stride,
+                                  "glMultiDrawElementsIndirectCountARB"))
+      return GL_FALSE;
+
+   /* number of bytes of the indirect buffer which will be read */
+   size = maxdrawcount
+      ? (maxdrawcount - 1) * stride + drawElementsNumParams * sizeof(GLuint)
+      : 0;
+
+   if (!valid_draw_indirect_elements(ctx, mode, type,
+                                     (void *)indirect, size,
+                                     "glMultiDrawElementsIndirectCountARB"))
+      return GL_FALSE;
+
+   return valid_draw_indirect_parameters(
+         ctx, "glMultiDrawElementsIndirectCountARB", drawcount);
+}
+
 static bool
 check_valid_to_compute(struct gl_context *ctx, const char *function)
 {
diff --git a/src/mesa/main/api_validate.h b/src/mesa/main/api_validate.h
index 5d030a7ba37..5b321e3ac99 100644
--- a/src/mesa/main/api_validate.h
+++ b/src/mesa/main/api_validate.h
@@ -106,6 +106,22 @@ _mesa_validate_MultiDrawElementsIndirect(struct gl_context *ctx,
                                          GLsizei stride);
 
 extern GLboolean
+_mesa_validate_MultiDrawArraysIndirectCount(struct gl_context *ctx,
+                                            GLenum mode,
+                                            GLintptr indirect,
+                                            GLintptr drawcount,
+                                            GLsizei maxdrawcount,
+                                            GLsizei stride);
+
+extern GLboolean
+_mesa_validate_MultiDrawElementsIndirectCount(struct gl_context *ctx,
+                                              GLenum mode, GLenum type,
+                                              GLintptr indirect,
+                                              GLintptr drawcount,
+                                              GLsizei maxdrawcount,
+                                              GLsizei stride);
+
+extern GLboolean
 _mesa_validate_DispatchCompute(struct gl_context *ctx,
                                const GLuint *num_groups);
 
diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index e0639c87ef4..14ee8c8fc73 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -127,6 +127,11 @@ get_buffer_target(struct gl_context *ctx, GLenum target)
          return &ctx->DrawIndirectBuffer;
       }
       break;
+   case GL_PARAMETER_BUFFER_ARB:
+      if (_mesa_has_ARB_indirect_parameters(ctx)) {
+         return &ctx->ParameterBuffer;
+      }
+      break;
    case GL_DISPATCH_INDIRECT_BUFFER:
       if (_mesa_has_compute_shaders(ctx)) {
          return &ctx->DispatchIndirectBuffer;
@@ -447,7 +452,7 @@ _mesa_new_buffer_object(struct gl_context *ctx, GLuint name)
  *
  * Default callback for the \c dd_function_table::DeleteBuffer() hook.
  */
-static void
+void
 _mesa_delete_buffer_object(struct gl_context *ctx,
                            struct gl_buffer_object *bufObj)
 {
@@ -866,6 +871,9 @@ _mesa_init_buffer_objects( struct gl_context *ctx )
    _mesa_reference_buffer_object(ctx, &ctx->DrawIndirectBuffer,
 				 ctx->Shared->NullBufferObj);
 
+   _mesa_reference_buffer_object(ctx, &ctx->ParameterBuffer,
+				 ctx->Shared->NullBufferObj);
+
    _mesa_reference_buffer_object(ctx, &ctx->DispatchIndirectBuffer,
 				 ctx->Shared->NullBufferObj);
 
@@ -913,6 +921,8 @@ _mesa_free_buffer_objects( struct gl_context *ctx )
 
    _mesa_reference_buffer_object(ctx, &ctx->DrawIndirectBuffer, NULL);
 
+   _mesa_reference_buffer_object(ctx, &ctx->ParameterBuffer, NULL);
+
    _mesa_reference_buffer_object(ctx, &ctx->DispatchIndirectBuffer, NULL);
 
    for (i = 0; i < MAX_COMBINED_UNIFORM_BUFFERS; i++) {
@@ -1205,9 +1215,10 @@ _mesa_BindBuffer(GLenum target, GLuint buffer)
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   if (MESA_VERBOSE & VERBOSE_API)
+   if (MESA_VERBOSE & VERBOSE_API) {
       _mesa_debug(ctx, "glBindBuffer(%s, %u)\n",
                   _mesa_enum_to_string(target), buffer);
+   }
 
    bind_buffer_object(ctx, target, buffer);
 }
@@ -1260,6 +1271,11 @@ _mesa_DeleteBuffers(GLsizei n, const GLuint *ids)
             _mesa_BindBuffer( GL_DRAW_INDIRECT_BUFFER, 0 );
          }
 
+         /* unbind ARB_indirect_parameters binding point */
+         if (ctx->ParameterBuffer == bufObj) {
+            _mesa_BindBuffer(GL_PARAMETER_BUFFER_ARB, 0);
+         }
+
          /* unbind ARB_compute_shader binding point */
          if (ctx->DispatchIndirectBuffer == bufObj) {
             _mesa_BindBuffer(GL_DISPATCH_INDIRECT_BUFFER, 0);
@@ -1562,12 +1578,13 @@ _mesa_buffer_data(struct gl_context *ctx, struct gl_buffer_object *bufObj,
 {
    bool valid_usage;
 
-   if (MESA_VERBOSE & VERBOSE_API)
+   if (MESA_VERBOSE & VERBOSE_API) {
       _mesa_debug(ctx, "%s(%s, %ld, %p, %s)\n",
                   func,
                   _mesa_enum_to_string(target),
                   (long int) size, data,
                   _mesa_enum_to_string(usage));
+   }
 
    if (size < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(size < 0)", func);
@@ -3112,147 +3129,15 @@ unbind_shader_storage_buffers(struct gl_context *ctx, GLuint first,
 }
 
 static void
-bind_uniform_buffers_base(struct gl_context *ctx, GLuint first, GLsizei count,
-                          const GLuint *buffers)
+bind_uniform_buffers(struct gl_context *ctx, GLuint first, GLsizei count,
+                     const GLuint *buffers,
+                     bool range,
+                     const GLintptr *offsets, const GLsizeiptr *sizes,
+                     const char *caller)
 {
    GLint i;
 
-   if (!error_check_bind_uniform_buffers(ctx, first, count, "glBindBuffersBase"))
-      return;
-
-   /* Assume that at least one binding will be changed */
-   FLUSH_VERTICES(ctx, 0);
-   ctx->NewDriverState |= ctx->DriverFlags.NewUniformBuffer;
-
-   if (!buffers) {
-      /* The ARB_multi_bind spec says:
-       *
-       *   "If <buffers> is NULL, all bindings from <first> through
-       *    <first>+<count>-1 are reset to their unbound (zero) state."
-       */
-      unbind_uniform_buffers(ctx, first, count);
-      return;
-   }
-
-   /* Note that the error semantics for multi-bind commands differ from
-    * those of other GL commands.
-    *
-    * The Issues section in the ARB_multi_bind spec says:
-    *
-    *    "(11) Typically, OpenGL specifies that if an error is generated by a
-    *          command, that command has no effect.  This is somewhat
-    *          unfortunate for multi-bind commands, because it would require a
-    *          first pass to scan the entire list of bound objects for errors
-    *          and then a second pass to actually perform the bindings.
-    *          Should we have different error semantics?
-    *
-    *       RESOLVED:  Yes.  In this specification, when the parameters for
-    *       one of the <count> binding points are invalid, that binding point
-    *       is not updated and an error will be generated.  However, other
-    *       binding points in the same command will be updated if their
-    *       parameters are valid and no other error occurs."
-    */
-
-   _mesa_begin_bufferobj_lookups(ctx);
-
-   for (i = 0; i < count; i++) {
-      struct gl_uniform_buffer_binding *binding =
-          &ctx->UniformBufferBindings[first + i];
-      struct gl_buffer_object *bufObj;
-
-      if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
-         bufObj = binding->BufferObject;
-      else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersBase");
-
-      if (bufObj) {
-         if (bufObj == ctx->Shared->NullBufferObj)
-            set_ubo_binding(ctx, binding, bufObj, -1, -1, GL_TRUE);
-         else
-            set_ubo_binding(ctx, binding, bufObj, 0, 0, GL_TRUE);
-      }
-   }
-
-   _mesa_end_bufferobj_lookups(ctx);
-}
-
-static void
-bind_shader_storage_buffers_base(struct gl_context *ctx, GLuint first,
-                                 GLsizei count, const GLuint *buffers)
-{
-   GLint i;
-
-   if (!error_check_bind_shader_storage_buffers(ctx, first, count,
-                                                "glBindBuffersBase"))
-      return;
-
-   /* Assume that at least one binding will be changed */
-   FLUSH_VERTICES(ctx, 0);
-   ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
-
-   if (!buffers) {
-      /* The ARB_multi_bind spec says:
-       *
-       *   "If <buffers> is NULL, all bindings from <first> through
-       *    <first>+<count>-1 are reset to their unbound (zero) state."
-       */
-      unbind_shader_storage_buffers(ctx, first, count);
-      return;
-   }
-
-   /* Note that the error semantics for multi-bind commands differ from
-    * those of other GL commands.
-    *
-    * The Issues section in the ARB_multi_bind spec says:
-    *
-    *    "(11) Typically, OpenGL specifies that if an error is generated by a
-    *          command, that command has no effect.  This is somewhat
-    *          unfortunate for multi-bind commands, because it would require a
-    *          first pass to scan the entire list of bound objects for errors
-    *          and then a second pass to actually perform the bindings.
-    *          Should we have different error semantics?
-    *
-    *       RESOLVED:  Yes.  In this specification, when the parameters for
-    *       one of the <count> binding points are invalid, that binding point
-    *       is not updated and an error will be generated.  However, other
-    *       binding points in the same command will be updated if their
-    *       parameters are valid and no other error occurs."
-    */
-
-   _mesa_begin_bufferobj_lookups(ctx);
-
-   for (i = 0; i < count; i++) {
-      struct gl_shader_storage_buffer_binding *binding =
-          &ctx->ShaderStorageBufferBindings[first + i];
-      struct gl_buffer_object *bufObj;
-
-      if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
-         bufObj = binding->BufferObject;
-      else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersBase");
-
-      if (bufObj) {
-         if (bufObj == ctx->Shared->NullBufferObj)
-            set_ssbo_binding(ctx, binding, bufObj, -1, -1, GL_TRUE);
-         else
-            set_ssbo_binding(ctx, binding, bufObj, 0, 0, GL_TRUE);
-      }
-   }
-
-   _mesa_end_bufferobj_lookups(ctx);
-}
-
-static void
-bind_uniform_buffers_range(struct gl_context *ctx, GLuint first, GLsizei count,
-                           const GLuint *buffers,
-                           const GLintptr *offsets, const GLsizeiptr *sizes)
-{
-   GLint i;
-
-   if (!error_check_bind_uniform_buffers(ctx, first, count,
-                                         "glBindBuffersRange"))
+   if (!error_check_bind_uniform_buffers(ctx, first, count, caller))
       return;
 
    /* Assume that at least one binding will be changed */
@@ -3297,52 +3182,57 @@ bind_uniform_buffers_range(struct gl_context *ctx, GLuint first, GLsizei count,
       struct gl_uniform_buffer_binding *binding =
          &ctx->UniformBufferBindings[first + i];
       struct gl_buffer_object *bufObj;
+      GLintptr offset = 0;
+      GLsizeiptr size = 0;
 
-      if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
-         continue;
+      if (range) {
+         if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
+            continue;
 
-      /* The ARB_multi_bind spec says:
-       *
-       *     "An INVALID_VALUE error is generated by BindBuffersRange if any
-       *      pair of values in <offsets> and <sizes> does not respectively
-       *      satisfy the constraints described for those parameters for the
-       *      specified target, as described in section 6.7.1 (per binding)."
-       *
-       * Section 6.7.1 refers to table 6.5, which says:
-       *
-       *     "┌───────────────────────────────────────────────────────────────┐
-       *      │ Uniform buffer array bindings (see sec. 7.6)                  │
-       *      ├─────────────────────┬─────────────────────────────────────────┤
-       *      │  ...                │  ...                                    │
-       *      │  offset restriction │  multiple of value of UNIFORM_BUFFER_-  │
-       *      │                     │  OFFSET_ALIGNMENT                       │
-       *      │  ...                │  ...                                    │
-       *      │  size restriction   │  none                                   │
-       *      └─────────────────────┴─────────────────────────────────────────┘"
-       */
-      if (offsets[i] & (ctx->Const.UniformBufferOffsetAlignment - 1)) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glBindBuffersRange(offsets[%u]=%" PRId64
-                     " is misaligned; it must be a multiple of the value of "
-                     "GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT=%u when "
-                     "target=GL_UNIFORM_BUFFER)",
-                     i, (int64_t) offsets[i],
-                     ctx->Const.UniformBufferOffsetAlignment);
-         continue;
+         /* The ARB_multi_bind spec says:
+          *
+          *     "An INVALID_VALUE error is generated by BindBuffersRange if any
+          *      pair of values in <offsets> and <sizes> does not respectively
+          *      satisfy the constraints described for those parameters for the
+          *      specified target, as described in section 6.7.1 (per binding)."
+          *
+          * Section 6.7.1 refers to table 6.5, which says:
+          *
+          *     "┌───────────────────────────────────────────────────────────────┐
+          *      │ Uniform buffer array bindings (see sec. 7.6)                  │
+          *      ├─────────────────────┬─────────────────────────────────────────┤
+          *      │  ...                │  ...                                    │
+          *      │  offset restriction │  multiple of value of UNIFORM_BUFFER_-  │
+          *      │                     │  OFFSET_ALIGNMENT                       │
+          *      │  ...                │  ...                                    │
+          *      │  size restriction   │  none                                   │
+          *      └─────────────────────┴─────────────────────────────────────────┘"
+          */
+         if (offsets[i] & (ctx->Const.UniformBufferOffsetAlignment - 1)) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "glBindBuffersRange(offsets[%u]=%" PRId64
+                        " is misaligned; it must be a multiple of the value of "
+                        "GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT=%u when "
+                        "target=GL_UNIFORM_BUFFER)",
+                        i, (int64_t) offsets[i],
+                        ctx->Const.UniformBufferOffsetAlignment);
+            continue;
+         }
+
+         offset = offsets[i];
+         size = sizes[i];
       }
 
       if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
          bufObj = binding->BufferObject;
       else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersRange");
+         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i, caller);
 
       if (bufObj) {
          if (bufObj == ctx->Shared->NullBufferObj)
-            set_ubo_binding(ctx, binding, bufObj, -1, -1, GL_FALSE);
+            set_ubo_binding(ctx, binding, bufObj, -1, -1, !range);
          else
-            set_ubo_binding(ctx, binding, bufObj,
-                            offsets[i], sizes[i], GL_FALSE);
+            set_ubo_binding(ctx, binding, bufObj, offset, size, !range);
       }
    }
 
@@ -3350,15 +3240,16 @@ bind_uniform_buffers_range(struct gl_context *ctx, GLuint first, GLsizei count,
 }
 
 static void
-bind_shader_storage_buffers_range(struct gl_context *ctx, GLuint first,
-                                  GLsizei count, const GLuint *buffers,
-                                  const GLintptr *offsets,
-                                  const GLsizeiptr *sizes)
+bind_shader_storage_buffers(struct gl_context *ctx, GLuint first,
+                            GLsizei count, const GLuint *buffers,
+                            bool range,
+                            const GLintptr *offsets,
+                            const GLsizeiptr *sizes,
+                            const char *caller)
 {
    GLint i;
 
-   if (!error_check_bind_shader_storage_buffers(ctx, first, count,
-                                                "glBindBuffersRange"))
+   if (!error_check_bind_shader_storage_buffers(ctx, first, count, caller))
       return;
 
    /* Assume that at least one binding will be changed */
@@ -3403,52 +3294,57 @@ bind_shader_storage_buffers_range(struct gl_context *ctx, GLuint first,
       struct gl_shader_storage_buffer_binding *binding =
          &ctx->ShaderStorageBufferBindings[first + i];
       struct gl_buffer_object *bufObj;
+      GLintptr offset = 0;
+      GLsizeiptr size = 0;
+
+      if (range) {
+         if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
+            continue;
+
+         /* The ARB_multi_bind spec says:
+         *
+         *     "An INVALID_VALUE error is generated by BindBuffersRange if any
+         *      pair of values in <offsets> and <sizes> does not respectively
+         *      satisfy the constraints described for those parameters for the
+         *      specified target, as described in section 6.7.1 (per binding)."
+         *
+         * Section 6.7.1 refers to table 6.5, which says:
+         *
+         *     "┌───────────────────────────────────────────────────────────────┐
+         *      │ Shader storage buffer array bindings (see sec. 7.8)           │
+         *      ├─────────────────────┬─────────────────────────────────────────┤
+         *      │  ...                │  ...                                    │
+         *      │  offset restriction │  multiple of value of SHADER_STORAGE_-  │
+         *      │                     │  BUFFER_OFFSET_ALIGNMENT                │
+         *      │  ...                │  ...                                    │
+         *      │  size restriction   │  none                                   │
+         *      └─────────────────────┴─────────────────────────────────────────┘"
+         */
+         if (offsets[i] & (ctx->Const.ShaderStorageBufferOffsetAlignment - 1)) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "glBindBuffersRange(offsets[%u]=%" PRId64
+                        " is misaligned; it must be a multiple of the value of "
+                        "GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT=%u when "
+                        "target=GL_SHADER_STORAGE_BUFFER)",
+                        i, (int64_t) offsets[i],
+                        ctx->Const.ShaderStorageBufferOffsetAlignment);
+            continue;
+         }
 
-      if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
-         continue;
-
-      /* The ARB_multi_bind spec says:
-       *
-       *     "An INVALID_VALUE error is generated by BindBuffersRange if any
-       *      pair of values in <offsets> and <sizes> does not respectively
-       *      satisfy the constraints described for those parameters for the
-       *      specified target, as described in section 6.7.1 (per binding)."
-       *
-       * Section 6.7.1 refers to table 6.5, which says:
-       *
-       *     "┌───────────────────────────────────────────────────────────────┐
-       *      │ Shader storage buffer array bindings (see sec. 7.8)           │
-       *      ├─────────────────────┬─────────────────────────────────────────┤
-       *      │  ...                │  ...                                    │
-       *      │  offset restriction │  multiple of value of SHADER_STORAGE_-  │
-       *      │                     │  BUFFER_OFFSET_ALIGNMENT                │
-       *      │  ...                │  ...                                    │
-       *      │  size restriction   │  none                                   │
-       *      └─────────────────────┴─────────────────────────────────────────┘"
-       */
-      if (offsets[i] & (ctx->Const.ShaderStorageBufferOffsetAlignment - 1)) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glBindBuffersRange(offsets[%u]=%" PRId64
-                     " is misaligned; it must be a multiple of the value of "
-                     "GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT=%u when "
-                     "target=GL_SHADER_STORAGE_BUFFER)",
-                     i, (int64_t) offsets[i],
-                     ctx->Const.ShaderStorageBufferOffsetAlignment);
-         continue;
+         offset = offsets[i];
+         size = sizes[i];
       }
 
       if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
          bufObj = binding->BufferObject;
       else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersRange");
+         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i, caller);
 
       if (bufObj) {
          if (bufObj == ctx->Shared->NullBufferObj)
-            set_ssbo_binding(ctx, binding, bufObj, -1, -1, GL_FALSE);
+            set_ssbo_binding(ctx, binding, bufObj, -1, -1, !range);
          else
-            set_ssbo_binding(ctx, binding, bufObj,
-                             offsets[i], sizes[i], GL_FALSE);
+            set_ssbo_binding(ctx, binding, bufObj, offset, size, !range);
       }
    }
 
@@ -3520,84 +3416,19 @@ unbind_xfb_buffers(struct gl_context *ctx,
 }
 
 static void
-bind_xfb_buffers_base(struct gl_context *ctx,
-                      GLuint first, GLsizei count,
-                      const GLuint *buffers)
-{
-   struct gl_transform_feedback_object *tfObj =
-      ctx->TransformFeedback.CurrentObject;
-   GLint i;
-
-   if (!error_check_bind_xfb_buffers(ctx, tfObj, first, count,
-                                     "glBindBuffersBase"))
-      return;
-
-   /* Assume that at least one binding will be changed */
-   FLUSH_VERTICES(ctx, 0);
-   ctx->NewDriverState |= ctx->DriverFlags.NewTransformFeedback;
-
-   if (!buffers) {
-      /* The ARB_multi_bind spec says:
-       *
-       *   "If <buffers> is NULL, all bindings from <first> through
-       *    <first>+<count>-1 are reset to their unbound (zero) state."
-       */
-      unbind_xfb_buffers(ctx, tfObj, first, count);
-      return;
-   }
-
-   /* Note that the error semantics for multi-bind commands differ from
-    * those of other GL commands.
-    *
-    * The Issues section in the ARB_multi_bind spec says:
-    *
-    *    "(11) Typically, OpenGL specifies that if an error is generated by a
-    *          command, that command has no effect.  This is somewhat
-    *          unfortunate for multi-bind commands, because it would require a
-    *          first pass to scan the entire list of bound objects for errors
-    *          and then a second pass to actually perform the bindings.
-    *          Should we have different error semantics?
-    *
-    *       RESOLVED:  Yes.  In this specification, when the parameters for
-    *       one of the <count> binding points are invalid, that binding point
-    *       is not updated and an error will be generated.  However, other
-    *       binding points in the same command will be updated if their
-    *       parameters are valid and no other error occurs."
-    */
-
-   _mesa_begin_bufferobj_lookups(ctx);
-
-   for (i = 0; i < count; i++) {
-      struct gl_buffer_object * const boundBufObj = tfObj->Buffers[first + i];
-      struct gl_buffer_object *bufObj;
-
-      if (boundBufObj && boundBufObj->Name == buffers[i])
-         bufObj = boundBufObj;
-      else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersBase");
-
-      if (bufObj)
-         _mesa_set_transform_feedback_binding(ctx, tfObj, first + i,
-                                              bufObj, 0, 0);
-   }
-
-   _mesa_end_bufferobj_lookups(ctx);
-}
-
-static void
-bind_xfb_buffers_range(struct gl_context *ctx,
-                       GLuint first, GLsizei count,
-                       const GLuint *buffers,
-                       const GLintptr *offsets,
-                       const GLsizeiptr *sizes)
+bind_xfb_buffers(struct gl_context *ctx,
+                 GLuint first, GLsizei count,
+                 const GLuint *buffers,
+                 bool range,
+                 const GLintptr *offsets,
+                 const GLsizeiptr *sizes,
+                 const char *caller)
 {
    struct gl_transform_feedback_object *tfObj =
        ctx->TransformFeedback.CurrentObject;
    GLint i;
 
-   if (!error_check_bind_xfb_buffers(ctx, tfObj, first, count,
-                                     "glBindBuffersRange"))
+   if (!error_check_bind_xfb_buffers(ctx, tfObj, first, count, caller))
       return;
 
    /* Assume that at least one binding will be changed */
@@ -3642,55 +3473,64 @@ bind_xfb_buffers_range(struct gl_context *ctx,
       const GLuint index = first + i;
       struct gl_buffer_object * const boundBufObj = tfObj->Buffers[index];
       struct gl_buffer_object *bufObj;
+      GLintptr offset = 0;
+      GLsizeiptr size = 0;
 
-      if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
-         continue;
+      if (range) {
+         offset = offsets[i];
+         size = sizes[i];
 
-      /* The ARB_multi_bind spec says:
-       *
-       *     "An INVALID_VALUE error is generated by BindBuffersRange if any
-       *      pair of values in <offsets> and <sizes> does not respectively
-       *      satisfy the constraints described for those parameters for the
-       *      specified target, as described in section 6.7.1 (per binding)."
-       *
-       * Section 6.7.1 refers to table 6.5, which says:
-       *
-       *     "┌───────────────────────────────────────────────────────────────┐
-       *      │ Transform feedback array bindings (see sec. 13.2.2)           │
-       *      ├───────────────────────┬───────────────────────────────────────┤
-       *      │    ...                │    ...                                │
-       *      │    offset restriction │    multiple of 4                      │
-       *      │    ...                │    ...                                │
-       *      │    size restriction   │    multiple of 4                      │
-       *      └───────────────────────┴───────────────────────────────────────┘"
-       */
-      if (offsets[i] & 0x3) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glBindBuffersRange(offsets[%u]=%" PRId64
-                     " is misaligned; it must be a multiple of 4 when "
-                     "target=GL_TRANSFORM_FEEDBACK_BUFFER)",
-                     i, (int64_t) offsets[i]);
-         continue;
-      }
+         if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
+            continue;
 
-      if (sizes[i] & 0x3) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glBindBuffersRange(sizes[%u]=%" PRId64
-                     " is misaligned; it must be a multiple of 4 when "
-                     "target=GL_TRANSFORM_FEEDBACK_BUFFER)",
-                     i, (int64_t) sizes[i]);
-         continue;
+         /* The ARB_multi_bind spec says:
+          *
+          *     "An INVALID_VALUE error is generated by BindBuffersRange if any
+          *      pair of values in <offsets> and <sizes> does not respectively
+          *      satisfy the constraints described for those parameters for the
+          *      specified target, as described in section 6.7.1 (per binding)."
+          *
+          * Section 6.7.1 refers to table 6.5, which says:
+          *
+          *     "┌───────────────────────────────────────────────────────────────┐
+          *      │ Transform feedback array bindings (see sec. 13.2.2)           │
+          *      ├───────────────────────┬───────────────────────────────────────┤
+          *      │    ...                │    ...                                │
+          *      │    offset restriction │    multiple of 4                      │
+          *      │    ...                │    ...                                │
+          *      │    size restriction   │    multiple of 4                      │
+          *      └───────────────────────┴───────────────────────────────────────┘"
+          */
+         if (offsets[i] & 0x3) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "glBindBuffersRange(offsets[%u]=%" PRId64
+                        " is misaligned; it must be a multiple of 4 when "
+                        "target=GL_TRANSFORM_FEEDBACK_BUFFER)",
+                        i, (int64_t) offsets[i]);
+            continue;
+         }
+
+         if (sizes[i] & 0x3) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "glBindBuffersRange(sizes[%u]=%" PRId64
+                        " is misaligned; it must be a multiple of 4 when "
+                        "target=GL_TRANSFORM_FEEDBACK_BUFFER)",
+                        i, (int64_t) sizes[i]);
+            continue;
+         }
+
+         offset = offsets[i];
+         size = sizes[i];
       }
 
       if (boundBufObj && boundBufObj->Name == buffers[i])
          bufObj = boundBufObj;
       else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersRange");
+         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i, caller);
 
       if (bufObj)
          _mesa_set_transform_feedback_binding(ctx, tfObj, index, bufObj,
-                                              offsets[i], sizes[i]);
+                                              offset, size);
    }
 
    _mesa_end_bufferobj_lookups(ctx);
@@ -3740,82 +3580,18 @@ unbind_atomic_buffers(struct gl_context *ctx, GLuint first, GLsizei count)
 }
 
 static void
-bind_atomic_buffers_base(struct gl_context *ctx,
-                         GLuint first,
-                         GLsizei count,
-                         const GLuint *buffers)
+bind_atomic_buffers(struct gl_context *ctx,
+                    GLuint first,
+                    GLsizei count,
+                    const GLuint *buffers,
+                    bool range,
+                    const GLintptr *offsets,
+                    const GLsizeiptr *sizes,
+                    const char *caller)
 {
    GLint i;
 
-   if (!error_check_bind_atomic_buffers(ctx, first, count,
-                                        "glBindBuffersBase"))
-     return;
-
-   /* Assume that at least one binding will be changed */
-   FLUSH_VERTICES(ctx, 0);
-   ctx->NewDriverState |= ctx->DriverFlags.NewAtomicBuffer;
-
-   if (!buffers) {
-      /* The ARB_multi_bind spec says:
-       *
-       *   "If <buffers> is NULL, all bindings from <first> through
-       *    <first>+<count>-1 are reset to their unbound (zero) state."
-       */
-      unbind_atomic_buffers(ctx, first, count);
-      return;
-   }
-
-   /* Note that the error semantics for multi-bind commands differ from
-    * those of other GL commands.
-    *
-    * The Issues section in the ARB_multi_bind spec says:
-    *
-    *    "(11) Typically, OpenGL specifies that if an error is generated by a
-    *          command, that command has no effect.  This is somewhat
-    *          unfortunate for multi-bind commands, because it would require a
-    *          first pass to scan the entire list of bound objects for errors
-    *          and then a second pass to actually perform the bindings.
-    *          Should we have different error semantics?
-    *
-    *       RESOLVED:  Yes.  In this specification, when the parameters for
-    *       one of the <count> binding points are invalid, that binding point
-    *       is not updated and an error will be generated.  However, other
-    *       binding points in the same command will be updated if their
-    *       parameters are valid and no other error occurs."
-    */
-
-   _mesa_begin_bufferobj_lookups(ctx);
-
-   for (i = 0; i < count; i++) {
-      struct gl_atomic_buffer_binding *binding =
-         &ctx->AtomicBufferBindings[first + i];
-      struct gl_buffer_object *bufObj;
-
-      if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
-         bufObj = binding->BufferObject;
-      else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersBase");
-
-      if (bufObj)
-         set_atomic_buffer_binding(ctx, binding, bufObj, 0, 0);
-   }
-
-   _mesa_end_bufferobj_lookups(ctx);
-}
-
-static void
-bind_atomic_buffers_range(struct gl_context *ctx,
-                          GLuint first,
-                          GLsizei count,
-                          const GLuint *buffers,
-                          const GLintptr *offsets,
-                          const GLsizeiptr *sizes)
-{
-   GLint i;
-
-   if (!error_check_bind_atomic_buffers(ctx, first, count,
-                                        "glBindBuffersRange"))
+   if (!error_check_bind_atomic_buffers(ctx, first, count, caller))
      return;
 
    /* Assume that at least one binding will be changed */
@@ -3860,45 +3636,51 @@ bind_atomic_buffers_range(struct gl_context *ctx,
       struct gl_atomic_buffer_binding *binding =
          &ctx->AtomicBufferBindings[first + i];
       struct gl_buffer_object *bufObj;
+      GLintptr offset = 0;
+      GLsizeiptr size = 0;
 
-      if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
-         continue;
+      if (range) {
+         if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
+            continue;
 
-      /* The ARB_multi_bind spec says:
-       *
-       *     "An INVALID_VALUE error is generated by BindBuffersRange if any
-       *      pair of values in <offsets> and <sizes> does not respectively
-       *      satisfy the constraints described for those parameters for the
-       *      specified target, as described in section 6.7.1 (per binding)."
-       *
-       * Section 6.7.1 refers to table 6.5, which says:
-       *
-       *     "┌───────────────────────────────────────────────────────────────┐
-       *      │ Atomic counter array bindings (see sec. 7.7.2)                │
-       *      ├───────────────────────┬───────────────────────────────────────┤
-       *      │    ...                │    ...                                │
-       *      │    offset restriction │    multiple of 4                      │
-       *      │    ...                │    ...                                │
-       *      │    size restriction   │    none                               │
-       *      └───────────────────────┴───────────────────────────────────────┘"
-       */
-      if (offsets[i] & (ATOMIC_COUNTER_SIZE - 1)) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glBindBuffersRange(offsets[%u]=%" PRId64
-                     " is misaligned; it must be a multiple of %d when "
-                     "target=GL_ATOMIC_COUNTER_BUFFER)",
-                     i, (int64_t) offsets[i], ATOMIC_COUNTER_SIZE);
-         continue;
+         /* The ARB_multi_bind spec says:
+          *
+          *     "An INVALID_VALUE error is generated by BindBuffersRange if any
+          *      pair of values in <offsets> and <sizes> does not respectively
+          *      satisfy the constraints described for those parameters for the
+          *      specified target, as described in section 6.7.1 (per binding)."
+          *
+          * Section 6.7.1 refers to table 6.5, which says:
+          *
+          *     "┌───────────────────────────────────────────────────────────────┐
+          *      │ Atomic counter array bindings (see sec. 7.7.2)                │
+          *      ├───────────────────────┬───────────────────────────────────────┤
+          *      │    ...                │    ...                                │
+          *      │    offset restriction │    multiple of 4                      │
+          *      │    ...                │    ...                                │
+          *      │    size restriction   │    none                               │
+          *      └───────────────────────┴───────────────────────────────────────┘"
+          */
+         if (offsets[i] & (ATOMIC_COUNTER_SIZE - 1)) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "glBindBuffersRange(offsets[%u]=%" PRId64
+                        " is misaligned; it must be a multiple of %d when "
+                        "target=GL_ATOMIC_COUNTER_BUFFER)",
+                        i, (int64_t) offsets[i], ATOMIC_COUNTER_SIZE);
+            continue;
+         }
+
+         offset = offsets[i];
+         size = sizes[i];
       }
 
       if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
          bufObj = binding->BufferObject;
       else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersRange");
+         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i, caller);
 
       if (bufObj)
-         set_atomic_buffer_binding(ctx, binding, bufObj, offsets[i], sizes[i]);
+         set_atomic_buffer_binding(ctx, binding, bufObj, offset, size);
    }
 
    _mesa_end_bufferobj_lookups(ctx);
@@ -3911,6 +3693,11 @@ _mesa_BindBufferRange(GLenum target, GLuint index,
    GET_CURRENT_CONTEXT(ctx);
    struct gl_buffer_object *bufObj;
 
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glBindBufferRange(%s, %u, %u, %ld, %ld)\n",
+                  _mesa_enum_to_string(target), index, buffer, offset, size);
+   }
+
    if (buffer == 0) {
       bufObj = ctx->Shared->NullBufferObj;
    } else {
@@ -3963,6 +3750,11 @@ _mesa_BindBufferBase(GLenum target, GLuint index, GLuint buffer)
    GET_CURRENT_CONTEXT(ctx);
    struct gl_buffer_object *bufObj;
 
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glBindBufferBase(%s, %u, %u)\n",
+                  _mesa_enum_to_string(target), index, buffer);
+   }
+
    if (buffer == 0) {
       bufObj = ctx->Shared->NullBufferObj;
    } else {
@@ -4033,20 +3825,28 @@ _mesa_BindBuffersRange(GLenum target, GLuint first, GLsizei count,
 {
    GET_CURRENT_CONTEXT(ctx);
 
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glBindBuffersRange(%s, %u, %d, %p, %p, %p)\n",
+                  _mesa_enum_to_string(target), first, count,
+                  buffers, offsets, sizes);
+   }
+
    switch (target) {
    case GL_TRANSFORM_FEEDBACK_BUFFER:
-      bind_xfb_buffers_range(ctx, first, count, buffers, offsets, sizes);
+      bind_xfb_buffers(ctx, first, count, buffers, true, offsets, sizes,
+                       "glBindBuffersRange");
       return;
    case GL_UNIFORM_BUFFER:
-      bind_uniform_buffers_range(ctx, first, count, buffers, offsets, sizes);
+      bind_uniform_buffers(ctx, first, count, buffers, true, offsets, sizes,
+                           "glBindBuffersRange");
       return;
    case GL_SHADER_STORAGE_BUFFER:
-      bind_shader_storage_buffers_range(ctx, first, count, buffers, offsets,
-                                        sizes);
+      bind_shader_storage_buffers(ctx, first, count, buffers, true, offsets, sizes,
+                                  "glBindBuffersRange");
       return;
    case GL_ATOMIC_COUNTER_BUFFER:
-      bind_atomic_buffers_range(ctx, first, count, buffers,
-                                offsets, sizes);
+      bind_atomic_buffers(ctx, first, count, buffers, true, offsets, sizes,
+                          "glBindBuffersRange");
       return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glBindBuffersRange(target=%s)",
@@ -4061,18 +3861,27 @@ _mesa_BindBuffersBase(GLenum target, GLuint first, GLsizei count,
 {
    GET_CURRENT_CONTEXT(ctx);
 
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glBindBuffersBase(%s, %u, %d, %p)\n",
+                  _mesa_enum_to_string(target), first, count, buffers);
+   }
+
    switch (target) {
    case GL_TRANSFORM_FEEDBACK_BUFFER:
-      bind_xfb_buffers_base(ctx, first, count, buffers);
+      bind_xfb_buffers(ctx, first, count, buffers, false, NULL, NULL,
+                       "glBindBuffersBase");
       return;
    case GL_UNIFORM_BUFFER:
-      bind_uniform_buffers_base(ctx, first, count, buffers);
+      bind_uniform_buffers(ctx, first, count, buffers, false, NULL, NULL,
+                           "glBindBuffersBase");
       return;
    case GL_SHADER_STORAGE_BUFFER:
-      bind_shader_storage_buffers_base(ctx, first, count, buffers);
+      bind_shader_storage_buffers(ctx, first, count, buffers, false, NULL, NULL,
+                                  "glBindBuffersBase");
       return;
    case GL_ATOMIC_COUNTER_BUFFER:
-      bind_atomic_buffers_base(ctx, first, count, buffers);
+      bind_atomic_buffers(ctx, first, count, buffers, false, NULL, NULL,
+                          "glBindBuffersBase");
       return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glBindBuffersBase(target=%s)",
diff --git a/src/mesa/main/bufferobj.h b/src/mesa/main/bufferobj.h
index 3eac96df23e..a5bfe886b39 100644
--- a/src/mesa/main/bufferobj.h
+++ b/src/mesa/main/bufferobj.h
@@ -109,6 +109,10 @@ _mesa_initialize_buffer_object(struct gl_context *ctx,
                                GLuint name);
 
 extern void
+_mesa_delete_buffer_object(struct gl_context *ctx,
+                           struct gl_buffer_object *bufObj);
+
+extern void
 _mesa_reference_buffer_object_(struct gl_context *ctx,
                                struct gl_buffer_object **ptr,
                                struct gl_buffer_object *bufObj);
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index 789b55a3c8d..aeccb017423 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -70,6 +70,7 @@ EXT(ARB_gpu_shader5                         , ARB_gpu_shader5
 EXT(ARB_gpu_shader_fp64                     , ARB_gpu_shader_fp64                    ,  x , GLC,  x ,  x , 2010)
 EXT(ARB_half_float_pixel                    , dummy_true                             , GLL, GLC,  x ,  x , 2003)
 EXT(ARB_half_float_vertex                   , ARB_half_float_vertex                  , GLL, GLC,  x ,  x , 2008)
+EXT(ARB_indirect_parameters                 , ARB_indirect_parameters                ,  x , GLC,  x ,  x , 2013)
 EXT(ARB_instanced_arrays                    , ARB_instanced_arrays                   , GLL, GLC,  x ,  x , 2008)
 EXT(ARB_internalformat_query                , ARB_internalformat_query               , GLL, GLC,  x ,  x , 2011)
 EXT(ARB_invalidate_subdata                  , dummy_true                             , GLL, GLC,  x ,  x , 2012)
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index c6a2e5b912c..95cb18c8ee8 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -423,6 +423,7 @@ EXTRA_EXT(ARB_framebuffer_no_attachments);
 EXTRA_EXT(ARB_tessellation_shader);
 EXTRA_EXT(ARB_shader_subroutine);
 EXTRA_EXT(ARB_shader_storage_buffer_object);
+EXTRA_EXT(ARB_indirect_parameters);
 
 static const int
 extra_ARB_color_buffer_float_or_glcore[] = {
@@ -1032,6 +1033,10 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu
    case GL_DRAW_INDIRECT_BUFFER_BINDING:
       v->value_int = ctx->DrawIndirectBuffer->Name;
       break;
+   /* GL_ARB_indirect_parameters */
+   case GL_PARAMETER_BUFFER_BINDING_ARB:
+      v->value_int = ctx->ParameterBuffer->Name;
+      break;
    /* GL_ARB_separate_shader_objects */
    case GL_PROGRAM_PIPELINE_BINDING:
       if (ctx->Pipeline.Current) {
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 7a48ed2f414..af7a8f4a906 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -887,6 +887,10 @@ descriptor=[
 # GL_ARB_shader_subroutine
   [ "MAX_SUBROUTINES", "CONST(MAX_SUBROUTINES), extra_ARB_shader_subroutine" ],
   [ "MAX_SUBROUTINE_UNIFORM_LOCATIONS", "CONST(MAX_SUBROUTINE_UNIFORM_LOCATIONS), extra_ARB_shader_subroutine" ],
+
+# GL_ARB_indirect_parameters
+  [ "PARAMETER_BUFFER_BINDING_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_indirect_parameters" ],
+
 ]}
 
 ]
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 3f28d3313b1..39e3cfdc047 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2741,6 +2741,13 @@ struct gl_shader_program
    int *InterfaceBlockStageIndex[MESA_SHADER_STAGES];
 
    /**
+    * Indices into the BufferInterfaceBlocks[] array for Uniform Buffer
+    * Objects and Shader Storage Buffer Objects.
+    */
+   unsigned *UboInterfaceBlockIndex;
+   unsigned *SsboInterfaceBlockIndex;
+
+   /**
     * Map of active uniform names to locations
     *
     * Maps any active uniform that is not an array element to a location.
@@ -3520,6 +3527,10 @@ struct gl_constants
     */
    GLboolean GLSLSkipStrictMaxUniformLimitCheck;
 
+   /** Whether gl_FragCoord and gl_FrontFacing are system values. */
+   bool GLSLFragCoordIsSysVal;
+   bool GLSLFrontFacingIsSysVal;
+
    /**
     * Always use the GetTransformFeedbackVertexCount() driver hook, rather
     * than passing the transform feedback object to the drawing function.
@@ -3705,6 +3716,7 @@ struct gl_extensions
    GLboolean ARB_gpu_shader5;
    GLboolean ARB_gpu_shader_fp64;
    GLboolean ARB_half_float_vertex;
+   GLboolean ARB_indirect_parameters;
    GLboolean ARB_instanced_arrays;
    GLboolean ARB_internalformat_query;
    GLboolean ARB_map_buffer_range;
@@ -4353,6 +4365,7 @@ struct gl_context
    struct gl_perf_monitor_state PerfMonitor;
 
    struct gl_buffer_object *DrawIndirectBuffer; /** < GL_ARB_draw_indirect */
+   struct gl_buffer_object *ParameterBuffer; /** < GL_ARB_indirect_parameters */
    struct gl_buffer_object *DispatchIndirectBuffer; /** < GL_ARB_compute_shader */
 
    struct gl_buffer_object *CopyReadBuffer; /**< GL_ARB_copy_buffer */
diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c
index b7e25fe3840..9a15cfe70b8 100644
--- a/src/mesa/main/program_resource.c
+++ b/src/mesa/main/program_resource.c
@@ -70,6 +70,13 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
                             GLenum pname, GLint *params)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramInterfaceiv(%u, %s, %s, %p)\n",
+                  program, _mesa_enum_to_string(programInterface),
+                  _mesa_enum_to_string(pname), params);
+   }
+
    unsigned i;
    struct gl_shader_program *shProg =
       _mesa_lookup_shader_program_err(ctx, program,
@@ -226,6 +233,12 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface,
                               const GLchar *name)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramResourceIndex(%u, %s, %s)\n",
+                  program, _mesa_enum_to_string(programInterface), name);
+   }
+
    unsigned array_index = 0;
    struct gl_program_resource *res;
    struct gl_shader_program *shProg =
@@ -290,6 +303,13 @@ _mesa_GetProgramResourceName(GLuint program, GLenum programInterface,
                              GLchar *name)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramResourceName(%u, %s, %u, %d, %p, %p)\n",
+                  program, _mesa_enum_to_string(programInterface), index,
+                  bufSize, length, name);
+   }
+
    struct gl_shader_program *shProg =
       _mesa_lookup_shader_program_err(ctx, program,
                                       "glGetProgramResourceName");
@@ -315,6 +335,13 @@ _mesa_GetProgramResourceiv(GLuint program, GLenum programInterface,
                            GLsizei *length, GLint *params)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramResourceiv(%u, %s, %u, %d, %p, %d, %p, %p)\n",
+                  program, _mesa_enum_to_string(programInterface), index,
+                  propCount, props, bufSize, length, params);
+   }
+
    struct gl_shader_program *shProg =
       _mesa_lookup_shader_program_err(ctx, program, "glGetProgramResourceiv");
 
@@ -361,6 +388,12 @@ _mesa_GetProgramResourceLocation(GLuint program, GLenum programInterface,
                                  const GLchar *name)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramResourceLocation(%u, %s, %s)\n",
+                  program, _mesa_enum_to_string(programInterface), name);
+   }
+
    struct gl_shader_program *shProg =
       lookup_linked_program(program, "glGetProgramResourceLocation");
 
@@ -411,6 +444,12 @@ _mesa_GetProgramResourceLocationIndex(GLuint program, GLenum programInterface,
                                       const GLchar *name)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramResourceLocationIndex(%u, %s, %s)\n",
+                  program, _mesa_enum_to_string(programInterface), name);
+   }
+
    struct gl_shader_program *shProg =
       lookup_linked_program(program, "glGetProgramResourceLocationIndex");
 
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index e526119db19..b25732a2e3b 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -764,8 +764,12 @@ _mesa_program_resource_find_index(struct gl_shader_program *shProg,
 static bool
 add_index_to_name(struct gl_program_resource *res)
 {
-   bool add_index = !(((res->Type == GL_PROGRAM_INPUT) &&
-                       res->StageReferences & (1 << MESA_SHADER_GEOMETRY)));
+   bool add_index = !((res->Type == GL_PROGRAM_INPUT &&
+                       res->StageReferences & (1 << MESA_SHADER_GEOMETRY |
+                                               1 << MESA_SHADER_TESS_CTRL |
+                                               1 << MESA_SHADER_TESS_EVAL)) ||
+                      (res->Type == GL_PROGRAM_OUTPUT &&
+                       res->StageReferences & 1 << MESA_SHADER_TESS_CTRL));
 
    /* Transform feedback varyings have array index already appended
     * in their names.
diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c
index c4ebf4201fb..040e9fd6e3c 100644
--- a/src/mesa/main/shaderimage.c
+++ b/src/mesa/main/shaderimage.c
@@ -738,8 +738,10 @@ _mesa_MemoryBarrierByRegion(GLbitfield barriers)
        * That is, if barriers is the special value GL_ALL_BARRIER_BITS, then all
        * barriers allowed by glMemoryBarrierByRegion should be activated."
        */
-      if (barriers == GL_ALL_BARRIER_BITS)
-         return ctx->Driver.MemoryBarrier(ctx, all_allowed_bits);
+      if (barriers == GL_ALL_BARRIER_BITS) {
+         ctx->Driver.MemoryBarrier(ctx, all_allowed_bits);
+         return;
+      }
 
       /* From section 7.11.2 of the OpenGL ES 3.1 specification:
        *
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index d288b1dbe94..7610bcbd701 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -1844,6 +1844,10 @@ const struct function gl_core_functions_possible[] = {
    { "glGetQueryBufferObjecti64v", 45, -1 },
    { "glGetQueryBufferObjectui64v", 45, -1 },
 
+   /* GL_ARB_indirect_parameters */
+   { "glMultiDrawArraysIndirectCountARB", 31, -1 },
+   { "glMultiDrawElementsIndirectCountARB", 31, -1 },
+
    { NULL, 0, -1 }
 };
 
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 73b3318e948..50141be8693 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -2247,6 +2247,22 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
                      _mesa_enum_to_string(internalFormat));
          return GL_TRUE;
       }
+   } else {
+      /*
+       * Section 8.6 (Alternate Texture Image Specification Commands) of the
+       * OpenGL 4.5 (Compatibility Profile) spec says:
+       *
+       *     "Parameters level, internalformat, and border are specified using
+       *     the same values, with the same meanings, as the corresponding
+       *     arguments of TexImage2D, except that internalformat may not be
+       *     specified as 1, 2, 3, or 4."
+       */
+      if (internalFormat >= 1 && internalFormat <= 4) {
+         _mesa_error(ctx, GL_INVALID_ENUM,
+                     "glCopyTexImage%dD(internalFormat=%d)", dimensions,
+                     internalFormat);
+         return GL_TRUE;
+      }
    }
 
    baseFormat = _mesa_base_tex_format(ctx, internalFormat);
diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c
index 758ca2456df..47f80ce2001 100644
--- a/src/mesa/main/uniforms.c
+++ b/src/mesa/main/uniforms.c
@@ -1002,10 +1002,10 @@ _mesa_UniformBlockBinding(GLuint program,
    if (!shProg)
       return;
 
-   if (uniformBlockIndex >= shProg->NumBufferInterfaceBlocks) {
+   if (uniformBlockIndex >= shProg->NumUniformBlocks) {
       _mesa_error(ctx, GL_INVALID_VALUE,
 		  "glUniformBlockBinding(block index %u >= %u)",
-		  uniformBlockIndex, shProg->NumBufferInterfaceBlocks);
+		  uniformBlockIndex, shProg->NumUniformBlocks);
       return;
    }
 
@@ -1016,17 +1016,22 @@ _mesa_UniformBlockBinding(GLuint program,
       return;
    }
 
-   if (shProg->BufferInterfaceBlocks[uniformBlockIndex].Binding !=
+   if (shProg->UniformBlocks[uniformBlockIndex]->Binding !=
        uniformBlockBinding) {
       int i;
 
       FLUSH_VERTICES(ctx, 0);
       ctx->NewDriverState |= ctx->DriverFlags.NewUniformBuffer;
 
-      shProg->BufferInterfaceBlocks[uniformBlockIndex].Binding = uniformBlockBinding;
+      const int interface_block_index =
+         shProg->UboInterfaceBlockIndex[uniformBlockIndex];
+
+      shProg->BufferInterfaceBlocks[interface_block_index].Binding =
+         uniformBlockBinding;
 
       for (i = 0; i < MESA_SHADER_STAGES; i++) {
-	 int stage_index = shProg->InterfaceBlockStageIndex[i][uniformBlockIndex];
+	 int stage_index =
+            shProg->InterfaceBlockStageIndex[i][interface_block_index];
 
 	 if (stage_index != -1) {
 	    struct gl_shader *sh = shProg->_LinkedShaders[i];
@@ -1054,10 +1059,10 @@ _mesa_ShaderStorageBlockBinding(GLuint program,
    if (!shProg)
       return;
 
-   if (shaderStorageBlockIndex >= shProg->NumBufferInterfaceBlocks) {
+   if (shaderStorageBlockIndex >= shProg->NumShaderStorageBlocks) {
       _mesa_error(ctx, GL_INVALID_VALUE,
 		  "glShaderStorageBlockBinding(block index %u >= %u)",
-		  shaderStorageBlockIndex, shProg->NumBufferInterfaceBlocks);
+		  shaderStorageBlockIndex, shProg->NumShaderStorageBlocks);
       return;
    }
 
@@ -1069,17 +1074,22 @@ _mesa_ShaderStorageBlockBinding(GLuint program,
       return;
    }
 
-   if (shProg->BufferInterfaceBlocks[shaderStorageBlockIndex].Binding !=
+   if (shProg->ShaderStorageBlocks[shaderStorageBlockIndex]->Binding !=
        shaderStorageBlockBinding) {
       int i;
 
       FLUSH_VERTICES(ctx, 0);
       ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
 
-      shProg->BufferInterfaceBlocks[shaderStorageBlockIndex].Binding = shaderStorageBlockBinding;
+      const int interface_block_index =
+         shProg->SsboInterfaceBlockIndex[shaderStorageBlockIndex];
+
+      shProg->BufferInterfaceBlocks[interface_block_index].Binding =
+         shaderStorageBlockBinding;
 
       for (i = 0; i < MESA_SHADER_STAGES; i++) {
-	 int stage_index = shProg->InterfaceBlockStageIndex[i][shaderStorageBlockIndex];
+	 int stage_index =
+            shProg->InterfaceBlockStageIndex[i][interface_block_index];
 
 	 if (stage_index != -1) {
 	    struct gl_shader *sh = shProg->_LinkedShaders[i];
diff --git a/src/mesa/math/m_matrix.c b/src/mesa/math/m_matrix.c
index 6522200b345..b3cfcd26a14 100644
--- a/src/mesa/math/m_matrix.c
+++ b/src/mesa/math/m_matrix.c
@@ -654,7 +654,7 @@ static GLboolean invert_matrix_3d_no_rot( GLmatrix *mat )
    if (MAT(in,0,0) == 0 || MAT(in,1,1) == 0 || MAT(in,2,2) == 0 )
       return GL_FALSE;
 
-   memcpy( out, Identity, 16 * sizeof(GLfloat) );
+   memcpy( out, Identity, sizeof(Identity) );
    MAT(out,0,0) = 1.0F / MAT(in,0,0);
    MAT(out,1,1) = 1.0F / MAT(in,1,1);
    MAT(out,2,2) = 1.0F / MAT(in,2,2);
@@ -687,7 +687,7 @@ static GLboolean invert_matrix_2d_no_rot( GLmatrix *mat )
    if (MAT(in,0,0) == 0 || MAT(in,1,1) == 0)
       return GL_FALSE;
 
-   memcpy( out, Identity, 16 * sizeof(GLfloat) );
+   memcpy( out, Identity, sizeof(Identity) );
    MAT(out,0,0) = 1.0F / MAT(in,0,0);
    MAT(out,1,1) = 1.0F / MAT(in,1,1);
 
@@ -709,7 +709,7 @@ static GLboolean invert_matrix_perspective( GLmatrix *mat )
    if (MAT(in,2,3) == 0)
       return GL_FALSE;
 
-   memcpy( out, Identity, 16 * sizeof(GLfloat) );
+   memcpy( out, Identity, sizeof(Identity) );
 
    MAT(out,0,0) = 1.0F / MAT(in,0,0);
    MAT(out,1,1) = 1.0F / MAT(in,1,1);
@@ -802,7 +802,7 @@ _math_matrix_rotate( GLmatrix *mat,
    s = sinf( angle * M_PI / 180.0 );
    c = cosf( angle * M_PI / 180.0 );
 
-   memcpy(m, Identity, sizeof(GLfloat)*16);
+   memcpy(m, Identity, sizeof(Identity));
    optimized = GL_FALSE;
 
 #define M(row,col)  m[col*4+row]
@@ -1136,8 +1136,8 @@ _math_matrix_viewport(GLmatrix *m, const float scale[3],
 void
 _math_matrix_set_identity( GLmatrix *mat )
 {
-   memcpy( mat->m, Identity, 16*sizeof(GLfloat) );
-   memcpy( mat->inv, Identity, 16*sizeof(GLfloat) );
+   memcpy( mat->m, Identity, sizeof(Identity) );
+   memcpy( mat->inv, Identity, sizeof(Identity) );
 
    mat->type = MATRIX_IDENTITY;
    mat->flags &= ~(MAT_DIRTY_FLAGS|
@@ -1437,7 +1437,7 @@ _math_matrix_is_dirty( const GLmatrix *m )
 void
 _math_matrix_copy( GLmatrix *to, const GLmatrix *from )
 {
-   memcpy( to->m, from->m, sizeof(Identity) );
+   memcpy(to->m, from->m, 16 * sizeof(GLfloat));
    memcpy(to->inv, from->inv, 16 * sizeof(GLfloat));
    to->flags = from->flags;
    to->type = from->type;
diff --git a/src/mesa/program/prog_parameter.c b/src/mesa/program/prog_parameter.c
index 53e9813e6fd..e98946b9387 100644
--- a/src/mesa/program/prog_parameter.c
+++ b/src/mesa/program/prog_parameter.c
@@ -89,6 +89,37 @@ _mesa_free_parameter_list(struct gl_program_parameter_list *paramList)
 
 
 /**
+ * Make sure there are enough unused parameter slots. Reallocate the list
+ * if needed.
+ *
+ * \param paramList        where to reserve parameter slots
+ * \param reserve_slots    number of slots to reserve
+ */
+void
+_mesa_reserve_parameter_storage(struct gl_program_parameter_list *paramList,
+                                unsigned reserve_slots)
+{
+   const GLuint oldNum = paramList->NumParameters;
+
+   if (oldNum + reserve_slots > paramList->Size) {
+      /* Need to grow the parameter list array (alloc some extra) */
+      paramList->Size = paramList->Size + 4 * reserve_slots;
+
+      /* realloc arrays */
+      paramList->Parameters =
+         realloc(paramList->Parameters,
+                 paramList->Size * sizeof(struct gl_program_parameter));
+
+      paramList->ParameterValues = (gl_constant_value (*)[4])
+         _mesa_align_realloc(paramList->ParameterValues,         /* old buf */
+                             oldNum * 4 * sizeof(gl_constant_value),/* old sz */
+                             paramList->Size*4*sizeof(gl_constant_value),/*new*/
+                             16);
+   }
+}
+
+
+/**
  * Add a new parameter to a parameter list.
  * Note that parameter values are usually 4-element GLfloat vectors.
  * When size > 4 we'll allocate a sequential block of parameters to
@@ -115,21 +146,7 @@ _mesa_add_parameter(struct gl_program_parameter_list *paramList,
 
    assert(size > 0);
 
-   if (oldNum + sz4 > paramList->Size) {
-      /* Need to grow the parameter list array (alloc some extra) */
-      paramList->Size = paramList->Size + 4 * sz4;
-
-      /* realloc arrays */
-      paramList->Parameters =
-         realloc(paramList->Parameters,
-                 paramList->Size * sizeof(struct gl_program_parameter));
-
-      paramList->ParameterValues = (gl_constant_value (*)[4])
-         _mesa_align_realloc(paramList->ParameterValues,         /* old buf */
-                             oldNum * 4 * sizeof(gl_constant_value),/* old sz */
-                             paramList->Size*4*sizeof(gl_constant_value),/*new*/
-                             16);
-   }
+   _mesa_reserve_parameter_storage(paramList, sz4);
 
    if (!paramList->Parameters ||
        !paramList->ParameterValues) {
diff --git a/src/mesa/program/prog_parameter.h b/src/mesa/program/prog_parameter.h
index 74a5fd91804..44700b710d7 100644
--- a/src/mesa/program/prog_parameter.h
+++ b/src/mesa/program/prog_parameter.h
@@ -112,6 +112,10 @@ _mesa_num_parameters(const struct gl_program_parameter_list *list)
    return list ? list->NumParameters : 0;
 }
 
+extern void
+_mesa_reserve_parameter_storage(struct gl_program_parameter_list *paramList,
+                                unsigned reserve_slots);
+
 extern GLint
 _mesa_add_parameter(struct gl_program_parameter_list *paramList,
                     gl_register_file type, const char *name,
diff --git a/src/mesa/program/programopt.c b/src/mesa/program/programopt.c
index af78150d594..24dde57725e 100644
--- a/src/mesa/program/programopt.c
+++ b/src/mesa/program/programopt.c
@@ -589,3 +589,30 @@ _mesa_remove_output_reads(struct gl_program *prog, gl_register_file type)
       }
    }
 }
+
+void
+_mesa_program_fragment_position_to_sysval(struct gl_program *prog)
+{
+   GLuint i;
+
+   if (prog->Target != GL_FRAGMENT_PROGRAM_ARB ||
+       !(prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_POS)))
+      return;
+
+   prog->InputsRead &= ~BITFIELD64_BIT(VARYING_SLOT_POS);
+   prog->SystemValuesRead |= 1 << SYSTEM_VALUE_FRAG_COORD;
+
+   for (i = 0; i < prog->NumInstructions; i++) {
+      struct prog_instruction *inst = prog->Instructions + i;
+      const GLuint numSrc = _mesa_num_inst_src_regs(inst->Opcode);
+      GLuint j;
+
+      for (j = 0; j < numSrc; j++) {
+         if (inst->SrcReg[j].File == PROGRAM_INPUT &&
+             inst->SrcReg[j].Index == VARYING_SLOT_POS) {
+            inst->SrcReg[j].File = PROGRAM_SYSTEM_VALUE;
+            inst->SrcReg[j].Index = SYSTEM_VALUE_FRAG_COORD;
+         }
+      }
+   }
+}
diff --git a/src/mesa/program/programopt.h b/src/mesa/program/programopt.h
index 757421edfe1..1520d161ea8 100644
--- a/src/mesa/program/programopt.h
+++ b/src/mesa/program/programopt.h
@@ -51,6 +51,8 @@ _mesa_count_texture_instructions(struct gl_program *prog);
 extern void
 _mesa_remove_output_reads(struct gl_program *prog, gl_register_file type);
 
+extern void
+_mesa_program_fragment_position_to_sysval(struct gl_program *prog);
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/state_tracker/st_atom.c b/src/mesa/state_tracker/st_atom.c
index 43dbadd4a7e..03097225bb2 100644
--- a/src/mesa/state_tracker/st_atom.c
+++ b/src/mesa/state_tracker/st_atom.c
@@ -33,7 +33,6 @@
 #include "pipe/p_defines.h"
 #include "st_context.h"
 #include "st_atom.h"
-#include "st_cb_bitmap.h"
 #include "st_program.h"
 #include "st_manager.h"
 
@@ -96,27 +95,26 @@ void st_destroy_atoms( struct st_context *st )
 }
 
 
-/***********************************************************************
- */
 
-static GLboolean check_state( const struct st_state_flags *a,
-			      const struct st_state_flags *b )
+static bool
+check_state(const struct st_state_flags *a, const struct st_state_flags *b)
 {
-   return ((a->mesa & b->mesa) ||
-	   (a->st & b->st));
+   return (a->mesa & b->mesa) || (a->st & b->st);
 }
 
-static void accumulate_state( struct st_state_flags *a,
-			      const struct st_state_flags *b )
+
+static void
+accumulate_state(struct st_state_flags *a, const struct st_state_flags *b)
 {
    a->mesa |= b->mesa;
    a->st |= b->st;
 }
 
 
-static void xor_states( struct st_state_flags *result,
-			     const struct st_state_flags *a,
-			      const struct st_state_flags *b )
+static void
+xor_states(struct st_state_flags *result,
+           const struct st_state_flags *a,
+           const struct st_state_flags *b)
 {
    result->mesa = a->mesa ^ b->mesa;
    result->st = a->st ^ b->st;
@@ -181,14 +179,11 @@ void st_validate_state( struct st_context *st )
 
    check_attrib_edgeflag(st);
 
-   if (state->mesa)
-      st_flush_bitmap_cache(st);
-
    check_program_state( st );
 
    st_manager_validate_framebuffers(st);
 
-   if (state->st == 0)
+   if (state->st == 0 && state->mesa == 0)
       return;
 
    /*printf("%s %x/%x\n", __func__, state->mesa, state->st);*/
@@ -245,6 +240,3 @@ void st_validate_state( struct st_context *st )
 
    memset(state, 0, sizeof(*state));
 }
-
-
-
diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c
index 20f8b3df99d..66811d29c29 100644
--- a/src/mesa/state_tracker/st_atom_constbuf.c
+++ b/src/mesa/state_tracker/st_atom_constbuf.c
@@ -84,6 +84,7 @@ void st_upload_constants( struct st_context *st,
          cb.buffer = NULL;
          cb.user_buffer = NULL;
          u_upload_data(st->constbuf_uploader, 0, paramBytes,
+                       st->ctx->Const.UniformBufferOffsetAlignment,
                        params->ParameterValues, &cb.buffer_offset, &cb.buffer);
          u_upload_unmap(st->constbuf_uploader);
       } else {
diff --git a/src/mesa/state_tracker/st_atom_rasterizer.c b/src/mesa/state_tracker/st_atom_rasterizer.c
index 55d5e66243c..c20cadf508f 100644
--- a/src/mesa/state_tracker/st_atom_rasterizer.c
+++ b/src/mesa/state_tracker/st_atom_rasterizer.c
@@ -220,13 +220,13 @@ static void update_raster_state( struct st_context *st )
    raster->line_smooth = ctx->Line.SmoothFlag;
    if (ctx->Line.SmoothFlag) {
       raster->line_width = CLAMP(ctx->Line.Width,
-                                ctx->Const.MinLineWidthAA,
-                                ctx->Const.MaxLineWidthAA);
+                                 ctx->Const.MinLineWidthAA,
+                                 ctx->Const.MaxLineWidthAA);
    }
    else {
       raster->line_width = CLAMP(ctx->Line.Width,
-                                ctx->Const.MinLineWidth,
-                                ctx->Const.MaxLineWidth);
+                                 ctx->Const.MinLineWidth,
+                                 ctx->Const.MaxLineWidth);
    }
 
    raster->line_stipple_enable = ctx->Line.StippleFlag;
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index cbc6845d771..d8c3dbdd793 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -204,7 +204,7 @@ setup_bitmap_vertex_data(struct st_context *st, bool normalized,
       tBot = (GLfloat) height;
    }
 
-   u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]),
+   u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]), 4,
                   vbuf_offset, vbuf, (void **) &vertices);
    if (!*vbuf) {
       return;
@@ -287,7 +287,8 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
       GLfloat colorSave[4];
       COPY_4V(colorSave, ctx->Current.Attrib[VERT_ATTRIB_COLOR0]);
       COPY_4V(ctx->Current.Attrib[VERT_ATTRIB_COLOR0], color);
-      st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT);
+      st_upload_constants(st, st->fp->Base.Base.Parameters,
+                          PIPE_SHADER_FRAGMENT);
       COPY_4V(ctx->Current.Attrib[VERT_ATTRIB_COLOR0], colorSave);
    }
 
@@ -404,6 +405,9 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    cso_restore_stream_outputs(cso);
 
    pipe_resource_reference(&vbuf, NULL);
+
+   /* We uploaded modified constants, need to invalidate them. */
+   st->dirty.mesa |= _NEW_PROGRAM_CONSTANTS;
 }
 
 
@@ -490,15 +494,15 @@ st_flush_bitmap_cache(struct st_context *st)
 
       assert(cache->xmin <= cache->xmax);
 
-/*    printf("flush size %d x %d  at %d, %d\n",
-             cache->xmax - cache->xmin,
-             cache->ymax - cache->ymin,
-             cache->xpos, cache->ypos);
-*/
+      if (0)
+         printf("flush bitmap, size %d x %d  at %d, %d\n",
+                cache->xmax - cache->xmin,
+                cache->ymax - cache->ymin,
+                cache->xpos, cache->ypos);
 
       /* The texture transfer has been mapped until now.
-          * So unmap and release the texture transfer before drawing.
-          */
+       * So unmap and release the texture transfer before drawing.
+       */
       if (cache->trans && cache->buffer) {
          if (0)
             print_cache(cache);
@@ -615,10 +619,17 @@ st_Bitmap(struct gl_context *ctx, GLint x, GLint y,
    struct st_context *st = st_context(ctx);
    struct pipe_resource *pt;
 
-   if (width == 0 || height == 0)
-      return;
+   assert(width > 0);
+   assert(height > 0);
 
-   st_validate_state(st);
+   /* We only need to validate state of the st dirty flags are set or
+    * any non-_NEW_PROGRAM_CONSTANTS mesa flags are set.  The VS we use
+    * for bitmap drawing uses no constants and the FS constants are
+    * explicitly uploaded in the draw_bitmap_quad() function.
+    */
+   if ((st->dirty.mesa & ~_NEW_PROGRAM_CONSTANTS) || st->dirty.st) {
+      st_validate_state(st);
+   }
 
    if (!st->bitmap.vs) {
       /* create pass-through vertex shader now */
diff --git a/src/mesa/state_tracker/st_cb_bufferobjects.c b/src/mesa/state_tracker/st_cb_bufferobjects.c
index 5d20b26d26e..8ca7e4c379b 100644
--- a/src/mesa/state_tracker/st_cb_bufferobjects.c
+++ b/src/mesa/state_tracker/st_cb_bufferobjects.c
@@ -83,9 +83,7 @@ st_bufferobj_free(struct gl_context *ctx, struct gl_buffer_object *obj)
    if (st_obj->buffer)
       pipe_resource_reference(&st_obj->buffer, NULL);
 
-   mtx_destroy(&st_obj->Base.Mutex);
-   free(st_obj->Base.Label);
-   free(st_obj);
+   _mesa_delete_buffer_object(ctx, obj);
 }
 
 
@@ -230,6 +228,7 @@ st_bufferobj_data(struct gl_context *ctx,
       bind = PIPE_BIND_CONSTANT_BUFFER;
       break;
    case GL_DRAW_INDIRECT_BUFFER:
+   case GL_PARAMETER_BUFFER_ARB:
       bind = PIPE_BIND_COMMAND_ARGS_BUFFER;
       break;
    default:
diff --git a/src/mesa/state_tracker/st_cb_clear.c b/src/mesa/state_tracker/st_cb_clear.c
index 18efd14a57c..7b6d10e76b1 100644
--- a/src/mesa/state_tracker/st_cb_clear.c
+++ b/src/mesa/state_tracker/st_cb_clear.c
@@ -41,6 +41,7 @@
 #include "program/prog_instruction.h"
 #include "st_context.h"
 #include "st_atom.h"
+#include "st_cb_bitmap.h"
 #include "st_cb_clear.h"
 #include "st_cb_fbo.h"
 #include "st_format.h"
@@ -184,7 +185,7 @@ draw_quad(struct st_context *st,
 
    vb.stride = 8 * sizeof(float);
 
-   u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]),
+   u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]), 4,
                   &vb.buffer_offset, &vb.buffer,
                   (void **) &vertices);
    if (!vb.buffer) {
@@ -466,6 +467,8 @@ st_Clear(struct gl_context *ctx, GLbitfield mask)
    GLbitfield clear_buffers = 0x0;
    GLuint i;
 
+   st_flush_bitmap_cache(st);
+
    /* This makes sure the pipe has the latest scissor, etc values */
    st_validate_state( st );
 
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 262ad809c58..7ed52dd2600 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -50,6 +50,7 @@
 
 #include "st_atom.h"
 #include "st_atom_constbuf.h"
+#include "st_cb_bitmap.h"
 #include "st_cb_drawpixels.h"
 #include "st_cb_readpixels.h"
 #include "st_cb_fbo.h"
@@ -457,7 +458,7 @@ draw_quad(struct gl_context *ctx, GLfloat x0, GLfloat y0, GLfloat z,
    struct pipe_resource *buf = NULL;
    unsigned offset;
 
-   u_upload_alloc(st->uploader, 0, 4 * sizeof(verts[0]), &offset,
+   u_upload_alloc(st->uploader, 0, 4 * sizeof(verts[0]), 4, &offset,
                   &buf, (void **) &verts);
    if (!buf) {
       return;
@@ -1063,6 +1064,8 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y,
    /* Mesa state should be up to date by now */
    assert(ctx->NewState == 0x0);
 
+   st_flush_bitmap_cache(st);
+
    st_validate_state(st);
 
    /* Limit the size of the glDrawPixels to the max texture size.
@@ -1110,8 +1113,11 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y,
          num_sampler_view++;
       }
 
-      /* update fragment program constants */
-      st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT);
+      /* compiling a new fragment shader variant added new state constants
+       * into the constant buffer, we need to update them
+       */
+      st_upload_constants(st, st->fp->Base.Base.Parameters,
+                          PIPE_SHADER_FRAGMENT);
    }
 
    /* Put glDrawPixels image into a texture */
@@ -1419,6 +1425,8 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy,
    GLint readX, readY, readW, readH;
    struct gl_pixelstore_attrib pack = ctx->DefaultPacking;
 
+   st_flush_bitmap_cache(st);
+
    st_validate_state(st);
 
    if (type == GL_DEPTH_STENCIL) {
@@ -1463,8 +1471,11 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy,
          num_sampler_view++;
       }
 
-      /* update fragment program constants */
-      st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT);
+      /* compiling a new fragment shader variant added new state constants
+       * into the constant buffer, we need to update them
+       */
+      st_upload_constants(st, st->fp->Base.Base.Parameters,
+                          PIPE_SHADER_FRAGMENT);
    }
    else {
       assert(type == GL_DEPTH);
diff --git a/src/mesa/state_tracker/st_cb_drawtex.c b/src/mesa/state_tracker/st_cb_drawtex.c
index 2634b09d777..e6ab77fb521 100644
--- a/src/mesa/state_tracker/st_cb_drawtex.c
+++ b/src/mesa/state_tracker/st_cb_drawtex.c
@@ -21,6 +21,7 @@
 
 #include "st_context.h"
 #include "st_atom.h"
+#include "st_cb_bitmap.h"
 #include "st_cb_drawtex.h"
 
 #include "pipe/p_context.h"
@@ -113,6 +114,8 @@ st_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z,
    struct pipe_vertex_element velements[2 + MAX_TEXTURE_UNITS];
    unsigned offset;
 
+   st_flush_bitmap_cache(st);
+
    st_validate_state(st);
 
    /* determine if we need vertex color */
@@ -150,7 +153,7 @@ st_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z,
       GLuint attr;
 
       u_upload_alloc(st->uploader, 0,
-                     numAttribs * 4 * 4 * sizeof(GLfloat),
+                     numAttribs * 4 * 4 * sizeof(GLfloat), 4,
                      &offset, &vbuffer, (void **) &vbuf);
       if (!vbuffer) {
          return;
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 62f149aa0fb..f8b367989e7 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -388,6 +388,57 @@ guess_base_level_size(GLenum target,
 
 
 /**
+ * Try to determine whether we should allocate memory for a full texture
+ * mipmap.  The problem is when we get a glTexImage(level=0) call, we
+ * can't immediately know if other mipmap levels are coming next.  Here
+ * we try to guess whether to allocate memory for a mipmap or just the
+ * 0th level.
+ *
+ * If we guess incorrectly here we'll later reallocate the right amount of
+ * memory either in st_AllocTextureImageBuffer() or st_finalize_texture().
+ *
+ * \param stObj  the texture object we're going to allocate memory for.
+ * \param stImage  describes the incoming image which we need to store.
+ */
+static boolean
+allocate_full_mipmap(const struct st_texture_object *stObj,
+                     const struct st_texture_image *stImage)
+{
+   switch (stObj->base.Target) {
+   case GL_TEXTURE_RECTANGLE_NV:
+   case GL_TEXTURE_BUFFER:
+   case GL_TEXTURE_EXTERNAL_OES:
+   case GL_TEXTURE_2D_MULTISAMPLE:
+   case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
+      /* these texture types cannot be mipmapped */
+      return FALSE;
+   }
+
+   if (stImage->base.Level > 0 || stObj->base.GenerateMipmap)
+      return TRUE;
+
+   if (stImage->base._BaseFormat == GL_DEPTH_COMPONENT ||
+       stImage->base._BaseFormat == GL_DEPTH_STENCIL_EXT)
+      /* depth/stencil textures are seldom mipmapped */
+      return FALSE;
+
+   if (stObj->base.BaseLevel == 0 && stObj->base.MaxLevel == 0)
+      return FALSE;
+
+   if (stObj->base.Sampler.MinFilter == GL_NEAREST ||
+       stObj->base.Sampler.MinFilter == GL_LINEAR)
+      /* not a mipmap minification filter */
+      return FALSE;
+
+   if (stObj->base.Target == GL_TEXTURE_3D)
+      /* 3D textures are seldom mipmapped */
+      return FALSE;
+
+   return TRUE;
+}
+
+
+/**
  * Try to allocate a pipe_resource object for the given st_texture_object.
  *
  * We use the given st_texture_image as a clue to determine the size of the
@@ -431,22 +482,15 @@ guess_and_alloc_texture(struct st_context *st,
     * to re-allocating a texture buffer with space for more (or fewer)
     * mipmap levels later.
     */
-   if ((stObj->base.Sampler.MinFilter == GL_NEAREST ||
-        stObj->base.Sampler.MinFilter == GL_LINEAR ||
-        (stObj->base.BaseLevel == 0 &&
-         stObj->base.MaxLevel == 0) ||
-        stImage->base._BaseFormat == GL_DEPTH_COMPONENT ||
-        stImage->base._BaseFormat == GL_DEPTH_STENCIL_EXT) &&
-       !stObj->base.GenerateMipmap &&
-       stImage->base.Level == 0) {
-      /* only alloc space for a single mipmap level */
-      lastLevel = 0;
-   }
-   else {
+   if (allocate_full_mipmap(stObj, stImage)) {
       /* alloc space for a full mipmap */
       lastLevel = _mesa_get_tex_max_num_levels(stObj->base.Target,
                                                width, height, depth) - 1;
    }
+   else {
+      /* only alloc space for a single mipmap level */
+      lastLevel = 0;
+   }
 
    /* Save the level=0 dimensions */
    stObj->width0 = width;
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 1459f258f94..87193a9d478 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -80,9 +80,26 @@ DEBUG_GET_ONCE_BOOL_OPTION(mesa_mvp_dp4, "MESA_MVP_DP4", FALSE)
 
 
 /**
+ * Called via ctx->Driver.Enable()
+ */
+static void st_Enable(struct gl_context * ctx, GLenum cap, GLboolean state)
+{
+   struct st_context *st = st_context(ctx);
+
+   switch (cap) {
+   case GL_DEBUG_OUTPUT:
+      st_enable_debug_output(st, state);
+      break;
+   default:
+      break;
+   }
+}
+
+
+/**
  * Called via ctx->Driver.UpdateState()
  */
-void st_invalidate_state(struct gl_context * ctx, GLuint new_state)
+void st_invalidate_state(struct gl_context * ctx, GLbitfield new_state)
 {
    struct st_context *st = st_context(ctx);
 
@@ -172,20 +189,19 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
    /* Create upload manager for vertex data for glBitmap, glDrawPixels,
     * glClear, etc.
     */
-   st->uploader = u_upload_create(st->pipe, 65536, 4, PIPE_BIND_VERTEX_BUFFER);
+   st->uploader = u_upload_create(st->pipe, 65536, PIPE_BIND_VERTEX_BUFFER,
+                                  PIPE_USAGE_STREAM);
 
    if (!screen->get_param(screen, PIPE_CAP_USER_INDEX_BUFFERS)) {
-      st->indexbuf_uploader = u_upload_create(st->pipe, 128 * 1024, 4,
-                                              PIPE_BIND_INDEX_BUFFER);
+      st->indexbuf_uploader = u_upload_create(st->pipe, 128 * 1024,
+                                              PIPE_BIND_INDEX_BUFFER,
+                                              PIPE_USAGE_STREAM);
    }
 
-   if (!screen->get_param(screen, PIPE_CAP_USER_CONSTANT_BUFFERS)) {
-      unsigned alignment =
-         screen->get_param(screen, PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT);
-
-      st->constbuf_uploader = u_upload_create(pipe, 128 * 1024, alignment,
-                                              PIPE_BIND_CONSTANT_BUFFER);
-   }
+   if (!screen->get_param(screen, PIPE_CAP_USER_CONSTANT_BUFFERS))
+      st->constbuf_uploader = u_upload_create(pipe, 128 * 1024,
+                                              PIPE_BIND_CONSTANT_BUFFER,
+                                              PIPE_USAGE_STREAM);
 
    st->cso_context = cso_create_context(pipe);
 
@@ -249,6 +265,10 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
           PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600));
    st->has_time_elapsed =
       screen->get_param(screen, PIPE_CAP_QUERY_TIME_ELAPSED);
+   st->has_half_float_packing =
+      screen->get_param(screen, PIPE_CAP_TGSI_PACK_HALF_FLOAT);
+   st->has_multi_draw_indirect =
+      screen->get_param(screen, PIPE_CAP_MULTI_DRAW_INDIRECT);
 
    /* GL limits and extensions */
    st_init_limits(st->pipe->screen, &ctx->Const, &ctx->Extensions);
@@ -456,5 +476,6 @@ void st_init_driver_functions(struct pipe_screen *screen,
 
    st_init_vdpau_functions(functions);
 
+   functions->Enable = st_Enable;
    functions->UpdateState = st_invalidate_state;
 }
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 60a9a4bb0d5..9db5f11beb5 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -65,8 +65,8 @@ struct u_upload_mgr;
 
 
 struct st_state_flags {
-   GLuint mesa;
-   uint64_t st;
+   GLbitfield mesa;  /**< Mask of _NEW_x flags */
+   uint64_t st;      /**< Mask of ST_NEW_x flags */
 };
 
 struct st_tracked_state {
@@ -101,6 +101,8 @@ struct st_context
    boolean prefer_blit_based_texture_transfer;
    boolean force_persample_in_shader;
    boolean has_shareable_shaders;
+   boolean has_half_float_packing;
+   boolean has_multi_draw_indirect;
 
    /**
     * If a shader can be created when we get its source.
@@ -251,7 +253,7 @@ struct st_framebuffer
 extern void st_init_driver_functions(struct pipe_screen *screen,
                                      struct dd_function_table *functions);
 
-void st_invalidate_state(struct gl_context * ctx, GLuint new_state);
+void st_invalidate_state(struct gl_context * ctx, GLbitfield new_state);
 
 
 
diff --git a/src/mesa/state_tracker/st_debug.c b/src/mesa/state_tracker/st_debug.c
index 6d859c6ab5b..134366db09d 100644
--- a/src/mesa/state_tracker/st_debug.c
+++ b/src/mesa/state_tracker/st_debug.c
@@ -104,3 +104,75 @@ st_print_current(void)
 }
 
 
+/**
+ * Installed as pipe_debug_callback when GL_DEBUG_OUTPUT is enabled.
+ */
+static void
+st_debug_message(void *data,
+                 unsigned *id,
+                 enum pipe_debug_type ptype,
+                 const char *fmt,
+                 va_list args)
+{
+   struct st_context *st = data;
+   enum mesa_debug_source source;
+   enum mesa_debug_type type;
+   enum mesa_debug_severity severity;
+
+   switch (ptype) {
+   case PIPE_DEBUG_TYPE_OUT_OF_MEMORY:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_ERROR;
+      severity = MESA_DEBUG_SEVERITY_MEDIUM;
+      break;
+   case PIPE_DEBUG_TYPE_ERROR:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_ERROR;
+      severity = MESA_DEBUG_SEVERITY_MEDIUM;
+      break;
+   case PIPE_DEBUG_TYPE_SHADER_INFO:
+      source = MESA_DEBUG_SOURCE_SHADER_COMPILER;
+      type = MESA_DEBUG_TYPE_OTHER;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   case PIPE_DEBUG_TYPE_PERF_INFO:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_PERFORMANCE;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   case PIPE_DEBUG_TYPE_INFO:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_OTHER;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   case PIPE_DEBUG_TYPE_FALLBACK:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_PERFORMANCE;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   case PIPE_DEBUG_TYPE_CONFORMANCE:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_OTHER;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   default:
+      unreachable("invalid debug type");
+   }
+   _mesa_gl_vdebug(st->ctx, id, source, type, severity, fmt, args);
+}
+
+void
+st_enable_debug_output(struct st_context *st, boolean enable)
+{
+   struct pipe_context *pipe = st->pipe;
+
+   if (!pipe->set_debug_callback)
+      return;
+
+   if (enable) {
+      struct pipe_debug_callback cb = { st_debug_message, st };
+      pipe->set_debug_callback(pipe, &cb);
+   } else {
+      pipe->set_debug_callback(pipe, NULL);
+   }
+}
diff --git a/src/mesa/state_tracker/st_debug.h b/src/mesa/state_tracker/st_debug.h
index 288eccf9f9c..ed3ead82914 100644
--- a/src/mesa/state_tracker/st_debug.h
+++ b/src/mesa/state_tracker/st_debug.h
@@ -32,6 +32,8 @@
 #include "pipe/p_compiler.h"
 #include "util/u_debug.h"
 
+struct st_context;
+
 extern void
 st_print_current(void);
 
@@ -59,6 +61,8 @@ extern int ST_DEBUG;
 
 void st_debug_init( void );
 
+void st_enable_debug_output(struct st_context *st, boolean enable);
+
 static inline void
 ST_DBG( unsigned flag, const char *fmt, ... )
 {
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index f4b273bf93f..03788f33468 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -48,6 +48,7 @@
 
 #include "st_context.h"
 #include "st_atom.h"
+#include "st_cb_bitmap.h"
 #include "st_cb_bufferobjects.h"
 #include "st_cb_xformfb.h"
 #include "st_debug.h"
@@ -107,7 +108,7 @@ setup_index_buffer(struct st_context *st,
    else if (st->indexbuf_uploader) {
       /* upload indexes from user memory into a real buffer */
       u_upload_data(st->indexbuf_uploader, 0,
-                    ib->count * ibuffer->index_size, ib->ptr,
+                    ib->count * ibuffer->index_size, 4, ib->ptr,
                     &ibuffer->offset, &ibuffer->buffer);
       if (!ibuffer->buffer) {
          /* out of memory */
@@ -197,6 +198,8 @@ st_draw_vbo(struct gl_context *ctx,
    /* Mesa core state should have been validated already */
    assert(ctx->NewState == 0x0);
 
+   st_flush_bitmap_cache(st);
+
    /* Validate state. */
    if (st->dirty.st || ctx->NewDriverState) {
       st_validate_state(st);
@@ -249,13 +252,7 @@ st_draw_vbo(struct gl_context *ctx,
       }
    }
 
-   if (indirect) {
-      info.indirect = st_buffer_object(indirect)->buffer;
-
-      /* Primitive restart is not handled by the VBO module in this case. */
-      info.primitive_restart = ctx->Array._PrimitiveRestart;
-      info.restart_index = ctx->Array.RestartIndex;
-   }
+   assert(!indirect);
 
    /* do actual drawing */
    for (i = 0; i < nr_prims; i++) {
@@ -266,11 +263,11 @@ st_draw_vbo(struct gl_context *ctx,
       info.instance_count = prims[i].num_instances;
       info.vertices_per_patch = ctx->TessCtrlProgram.patch_vertices;
       info.index_bias = prims[i].basevertex;
+      info.drawid = prims[i].draw_id;
       if (!ib) {
          info.min_index = info.start;
          info.max_index = info.start + info.count - 1;
       }
-      info.indirect_offset = prims[i].indirect_offset;
 
       if (ST_DEBUG & DEBUG_DRAW) {
          debug_printf("st/draw: mode %s  start %u  count %u  indexed %d\n",
@@ -280,7 +277,7 @@ st_draw_vbo(struct gl_context *ctx,
                       info.indexed);
       }
 
-      if (info.count_from_stream_output || info.indirect) {
+      if (info.count_from_stream_output) {
          cso_draw_vbo(st->cso_context, &info);
       }
       else if (info.primitive_restart) {
@@ -297,6 +294,84 @@ st_draw_vbo(struct gl_context *ctx,
    }
 }
 
+static void
+st_indirect_draw_vbo(struct gl_context *ctx,
+                     GLuint mode,
+                     struct gl_buffer_object *indirect_data,
+                     GLsizeiptr indirect_offset,
+                     unsigned draw_count,
+                     unsigned stride,
+                     struct gl_buffer_object *indirect_params,
+                     GLsizeiptr indirect_params_offset,
+                     const struct _mesa_index_buffer *ib)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_index_buffer ibuffer = {0};
+   struct pipe_draw_info info;
+
+   /* Mesa core state should have been validated already */
+   assert(ctx->NewState == 0x0);
+   assert(stride);
+
+   /* Validate state. */
+   if (st->dirty.st || ctx->NewDriverState) {
+      st_validate_state(st);
+   }
+
+   if (st->vertex_array_out_of_memory) {
+      return;
+   }
+
+   util_draw_init_info(&info);
+
+   if (ib) {
+      if (!setup_index_buffer(st, ib, &ibuffer)) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "gl%sDrawElementsIndirect%s",
+                     (draw_count > 1) ? "Multi" : "",
+                     indirect_params ? "CountARB" : "");
+         return;
+      }
+
+      info.indexed = TRUE;
+   }
+
+   info.mode = translate_prim(ctx, mode);
+   info.vertices_per_patch = ctx->TessCtrlProgram.patch_vertices;
+   info.indirect = st_buffer_object(indirect_data)->buffer;
+   info.indirect_offset = indirect_offset;
+
+   /* Primitive restart is not handled by the VBO module in this case. */
+   info.primitive_restart = ctx->Array._PrimitiveRestart;
+   info.restart_index = ctx->Array.RestartIndex;
+
+   if (ST_DEBUG & DEBUG_DRAW) {
+      debug_printf("st/draw indirect: mode %s drawcount %d indexed %d\n",
+                   u_prim_name(info.mode),
+                   draw_count,
+                   info.indexed);
+   }
+
+   if (!st->has_multi_draw_indirect) {
+      int i;
+
+      assert(!indirect_params);
+      info.indirect_count = 1;
+      for (i = 0; i < draw_count; i++) {
+         info.drawid = i;
+         cso_draw_vbo(st->cso_context, &info);
+         info.indirect_offset += stride;
+      }
+   } else {
+      info.indirect_count = draw_count;
+      info.indirect_stride = stride;
+      if (indirect_params) {
+         info.indirect_params = st_buffer_object(indirect_params)->buffer;
+         info.indirect_params_offset = indirect_params_offset;
+      }
+      cso_draw_vbo(st->cso_context, &info);
+   }
+}
+
 
 void
 st_init_draw(struct st_context *st)
@@ -304,6 +379,7 @@ st_init_draw(struct st_context *st)
    struct gl_context *ctx = st->ctx;
 
    vbo_set_draw_func(ctx, st_draw_vbo);
+   vbo_set_indirect_draw_func(ctx, st_indirect_draw_vbo);
 
    st->draw = draw_create(st->pipe); /* for selection/feedback */
 
diff --git a/src/mesa/state_tracker/st_draw_feedback.c b/src/mesa/state_tracker/st_draw_feedback.c
index 88c10a8f150..b6e6dea5b27 100644
--- a/src/mesa/state_tracker/st_draw_feedback.c
+++ b/src/mesa/state_tracker/st_draw_feedback.c
@@ -33,6 +33,7 @@
 
 #include "st_context.h"
 #include "st_atom.h"
+#include "st_cb_bitmap.h"
 #include "st_cb_bufferobjects.h"
 #include "st_draw.h"
 #include "st_program.h"
@@ -137,6 +138,8 @@ st_feedback_draw_vbo(struct gl_context *ctx,
 
    assert(draw);
 
+   st_flush_bitmap_cache(st);
+
    st_validate_state(st);
 
    if (!index_bounds_valid)
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index a2418e28a91..2a3e52362e4 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -314,10 +314,11 @@ void st_init_limits(struct pipe_screen *screen,
    c->GLSLSkipStrictMaxUniformLimitCheck =
       screen->get_param(screen, PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS);
 
+   c->UniformBufferOffsetAlignment =
+      screen->get_param(screen, PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT);
+
    if (can_ubo) {
       extensions->ARB_uniform_buffer_object = GL_TRUE;
-      c->UniformBufferOffsetAlignment =
-         screen->get_param(screen, PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT);
       c->MaxCombinedUniformBlocks = c->MaxUniformBufferBindings =
          c->Program[MESA_SHADER_VERTEX].MaxUniformBlocks +
          c->Program[MESA_SHADER_TESS_CTRL].MaxUniformBlocks +
@@ -326,6 +327,11 @@ void st_init_limits(struct pipe_screen *screen,
          c->Program[MESA_SHADER_FRAGMENT].MaxUniformBlocks;
       assert(c->MaxCombinedUniformBlocks <= MAX_COMBINED_UNIFORM_BUFFERS);
    }
+
+   c->GLSLFragCoordIsSysVal =
+      screen->get_param(screen, PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL);
+   c->GLSLFrontFacingIsSysVal =
+      screen->get_param(screen, PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL);
 }
 
 
@@ -440,34 +446,47 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(ARB_base_instance),                PIPE_CAP_START_INSTANCE                   },
       { o(ARB_buffer_storage),               PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT   },
       { o(ARB_clear_texture),                PIPE_CAP_CLEAR_TEXTURE                    },
+      { o(ARB_clip_control),                 PIPE_CAP_CLIP_HALFZ                       },
       { o(ARB_color_buffer_float),           PIPE_CAP_VERTEX_COLOR_UNCLAMPED           },
+      { o(ARB_conditional_render_inverted),  PIPE_CAP_CONDITIONAL_RENDER_INVERTED      },
       { o(ARB_copy_image),                   PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS },
       { o(ARB_depth_clamp),                  PIPE_CAP_DEPTH_CLIP_DISABLE               },
       { o(ARB_depth_texture),                PIPE_CAP_TEXTURE_SHADOW_MAP               },
+      { o(ARB_derivative_control),           PIPE_CAP_TGSI_FS_FINE_DERIVATIVE          },
       { o(ARB_draw_buffers_blend),           PIPE_CAP_INDEP_BLEND_FUNC                 },
+      { o(ARB_draw_indirect),                PIPE_CAP_DRAW_INDIRECT                    },
       { o(ARB_draw_instanced),               PIPE_CAP_TGSI_INSTANCEID                  },
       { o(ARB_fragment_program_shadow),      PIPE_CAP_TEXTURE_SHADOW_MAP               },
       { o(ARB_framebuffer_object),           PIPE_CAP_MIXED_FRAMEBUFFER_SIZES          },
+      { o(ARB_indirect_parameters),          PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS       },
       { o(ARB_instanced_arrays),             PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR  },
       { o(ARB_occlusion_query),              PIPE_CAP_OCCLUSION_QUERY                  },
       { o(ARB_occlusion_query2),             PIPE_CAP_OCCLUSION_QUERY                  },
       { o(ARB_pipeline_statistics_query),    PIPE_CAP_QUERY_PIPELINE_STATISTICS        },
       { o(ARB_point_sprite),                 PIPE_CAP_POINT_SPRITE                     },
+      { o(ARB_sample_shading),               PIPE_CAP_SAMPLE_SHADING                   },
       { o(ARB_seamless_cube_map),            PIPE_CAP_SEAMLESS_CUBE_MAP                },
+      { o(ARB_shader_draw_parameters),       PIPE_CAP_DRAW_PARAMETERS                  },
       { o(ARB_shader_stencil_export),        PIPE_CAP_SHADER_STENCIL_EXPORT            },
       { o(ARB_shader_texture_image_samples), PIPE_CAP_TGSI_TXQS                        },
       { o(ARB_shader_texture_lod),           PIPE_CAP_SM3                              },
       { o(ARB_shadow),                       PIPE_CAP_TEXTURE_SHADOW_MAP               },
       { o(ARB_texture_buffer_object),        PIPE_CAP_TEXTURE_BUFFER_OBJECTS           },
+      { o(ARB_texture_cube_map_array),       PIPE_CAP_CUBE_MAP_ARRAY                   },
       { o(ARB_texture_gather),               PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS    },
       { o(ARB_texture_mirror_clamp_to_edge), PIPE_CAP_TEXTURE_MIRROR_CLAMP             },
+      { o(ARB_texture_multisample),          PIPE_CAP_TEXTURE_MULTISAMPLE              },
       { o(ARB_texture_non_power_of_two),     PIPE_CAP_NPOT_TEXTURES                    },
+      { o(ARB_texture_query_lod),            PIPE_CAP_TEXTURE_QUERY_LOD                },
+      { o(ARB_texture_view),                 PIPE_CAP_SAMPLER_VIEW_TARGET              },
       { o(ARB_timer_query),                  PIPE_CAP_QUERY_TIMESTAMP                  },
       { o(ARB_transform_feedback2),          PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME       },
       { o(ARB_transform_feedback3),          PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME       },
 
       { o(EXT_blend_equation_separate),      PIPE_CAP_BLEND_EQUATION_SEPARATE          },
+      { o(EXT_depth_bounds_test),            PIPE_CAP_DEPTH_BOUNDS_TEST                },
       { o(EXT_draw_buffers2),                PIPE_CAP_INDEP_BLEND_ENABLE               },
+      { o(EXT_polygon_offset_clamp),         PIPE_CAP_POLYGON_OFFSET_CLAMP             },
       { o(EXT_stencil_two_side),             PIPE_CAP_TWO_SIDED_STENCIL                },
       { o(EXT_texture_array),                PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS         },
       { o(EXT_texture_filter_anisotropic),   PIPE_CAP_ANISOTROPIC_FILTER               },
@@ -488,17 +507,6 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(OES_standard_derivatives),         PIPE_CAP_SM3                              },
       { o(OES_texture_float_linear),         PIPE_CAP_TEXTURE_FLOAT_LINEAR             },
       { o(OES_texture_half_float_linear),    PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR        },
-      { o(ARB_texture_cube_map_array),       PIPE_CAP_CUBE_MAP_ARRAY                   },
-      { o(ARB_texture_multisample),          PIPE_CAP_TEXTURE_MULTISAMPLE              },
-      { o(ARB_texture_query_lod),            PIPE_CAP_TEXTURE_QUERY_LOD                },
-      { o(ARB_sample_shading),               PIPE_CAP_SAMPLE_SHADING                   },
-      { o(ARB_draw_indirect),                PIPE_CAP_DRAW_INDIRECT                    },
-      { o(ARB_derivative_control),           PIPE_CAP_TGSI_FS_FINE_DERIVATIVE          },
-      { o(ARB_conditional_render_inverted),  PIPE_CAP_CONDITIONAL_RENDER_INVERTED      },
-      { o(ARB_texture_view),                 PIPE_CAP_SAMPLER_VIEW_TARGET              },
-      { o(ARB_clip_control),                 PIPE_CAP_CLIP_HALFZ                       },
-      { o(EXT_polygon_offset_clamp),         PIPE_CAP_POLYGON_OFFSET_CLAMP             },
-      { o(EXT_depth_bounds_test),            PIPE_CAP_DEPTH_BOUNDS_TEST                },
    };
 
    /* Required: render target and sampler support */
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 5a6be08185f..27a0a4f51e1 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -85,6 +85,7 @@ public:
       this->has_index2 = false;
       this->double_reg2 = false;
       this->array_id = 0;
+      this->is_double_vertex_input = false;
    }
 
    st_src_reg(gl_register_file file, int index, int type)
@@ -100,6 +101,7 @@ public:
       this->has_index2 = false;
       this->double_reg2 = false;
       this->array_id = 0;
+      this->is_double_vertex_input = false;
    }
 
    st_src_reg(gl_register_file file, int index, int type, int index2D)
@@ -115,6 +117,7 @@ public:
       this->has_index2 = false;
       this->double_reg2 = false;
       this->array_id = 0;
+      this->is_double_vertex_input = false;
    }
 
    st_src_reg()
@@ -130,6 +133,7 @@ public:
       this->has_index2 = false;
       this->double_reg2 = false;
       this->array_id = 0;
+      this->is_double_vertex_input = false;
    }
 
    explicit st_src_reg(st_dst_reg reg);
@@ -150,6 +154,7 @@ public:
     */
    bool double_reg2;
    unsigned array_id;
+   bool is_double_vertex_input;
 };
 
 class st_dst_reg {
@@ -224,6 +229,7 @@ st_src_reg::st_src_reg(st_dst_reg reg)
    this->has_index2 = reg.has_index2;
    this->double_reg2 = false;
    this->array_id = reg.array_id;
+   this->is_double_vertex_input = false;
 }
 
 st_dst_reg::st_dst_reg(st_src_reg reg)
@@ -334,8 +340,24 @@ struct array_decl {
    unsigned mesa_index;
    unsigned array_id;
    unsigned array_size;
+   unsigned array_type;
 };
 
+static unsigned
+find_array_type(struct array_decl *arrays, unsigned count, unsigned array_id)
+{
+   unsigned i;
+
+   for (i = 0; i < count; i++) {
+      struct array_decl *decl = &arrays[i];
+
+      if (array_id == decl->array_id) {
+         return decl->array_type;
+      }
+   }
+   return GLSL_TYPE_ERROR;
+}
+
 struct rename_reg_pair {
    int old_reg;
    int new_reg;
@@ -555,6 +577,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
 {
    glsl_to_tgsi_instruction *inst = new(mem_ctx) glsl_to_tgsi_instruction();
    int num_reladdr = 0, i, j;
+   bool dst_is_double[2];
 
    op = get_opcode(ir, op, dst, src0, src1);
 
@@ -658,7 +681,18 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
     * GLSL [0].z -> TGSI [1].xy
     * GLSL [0].w -> TGSI [1].zw
     */
-   if (inst->dst[0].type == GLSL_TYPE_DOUBLE || inst->dst[1].type == GLSL_TYPE_DOUBLE ||
+   for (j = 0; j < 2; j++) {
+      dst_is_double[j] = false;
+      if (inst->dst[j].type == GLSL_TYPE_DOUBLE)
+         dst_is_double[j] = true;
+      else if (inst->dst[j].file == PROGRAM_OUTPUT && inst->dst[j].type == GLSL_TYPE_ARRAY) {
+         unsigned type = find_array_type(this->output_arrays, this->num_output_arrays, inst->dst[j].array_id);
+         if (type == GLSL_TYPE_DOUBLE)
+            dst_is_double[j] = true;
+      }
+   }
+
+   if (dst_is_double[0] || dst_is_double[1] ||
        inst->src[0].type == GLSL_TYPE_DOUBLE) {
       glsl_to_tgsi_instruction *dinst = NULL;
       int initial_src_swz[4], initial_src_idx[4];
@@ -699,7 +733,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
 
          /* modify the destination if we are splitting */
          for (j = 0; j < 2; j++) {
-            if (dinst->dst[j].type == GLSL_TYPE_DOUBLE) {
+            if (dst_is_double[j]) {
                dinst->dst[j].writemask = (i & 1) ? WRITEMASK_ZW : WRITEMASK_XY;
                dinst->dst[j].index = initial_dst_idx[j];
                if (i > 1)
@@ -732,7 +766,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
                   - F2D is a float src0, DLDEXP is integer src1 */
                if (op == TGSI_OPCODE_F2D ||
                    op == TGSI_OPCODE_DLDEXP ||
-                   (op == TGSI_OPCODE_UCMP && dinst->dst[0].type == GLSL_TYPE_DOUBLE)) {
+                   (op == TGSI_OPCODE_UCMP && dst_is_double[0])) {
                   dinst->src[j].swizzle = MAKE_SWIZZLE4(swz, swz, swz, swz);
                }
             }
@@ -1057,7 +1091,7 @@ glsl_to_tgsi_visitor::st_src_reg_for_type(int type, int val)
 }
 
 static int
-type_size(const struct glsl_type *type)
+attrib_type_size(const struct glsl_type *type, bool is_vs_input)
 {
    unsigned int i;
    int size;
@@ -1080,7 +1114,7 @@ type_size(const struct glsl_type *type)
       break;
    case GLSL_TYPE_DOUBLE:
       if (type->is_matrix()) {
-         if (type->vector_elements <= 2)
+         if (type->vector_elements <= 2 || is_vs_input)
             return type->matrix_columns;
          else
             return type->matrix_columns * 2;
@@ -1088,7 +1122,7 @@ type_size(const struct glsl_type *type)
          /* For doubles if we have a double or dvec2 they fit in one
           * vec4, else they need 2 vec4s.
           */
-         if (type->vector_elements <= 2)
+         if (type->vector_elements <= 2 || is_vs_input)
             return 1;
          else
             return 2;
@@ -1096,11 +1130,11 @@ type_size(const struct glsl_type *type)
       break;
    case GLSL_TYPE_ARRAY:
       assert(type->length > 0);
-      return type_size(type->fields.array) * type->length;
+      return attrib_type_size(type->fields.array, is_vs_input) * type->length;
    case GLSL_TYPE_STRUCT:
       size = 0;
       for (i = 0; i < type->length; i++) {
-         size += type_size(type->fields.structure[i].type);
+         size += attrib_type_size(type->fields.structure[i].type, is_vs_input);
       }
       return size;
    case GLSL_TYPE_SAMPLER:
@@ -1120,6 +1154,11 @@ type_size(const struct glsl_type *type)
    return 0;
 }
 
+static int
+type_size(const struct glsl_type *type)
+{
+  return attrib_type_size(type, false);
+}
 
 /**
  * If the given GLSL type is an array or matrix or a structure containing
@@ -1413,7 +1452,7 @@ glsl_to_tgsi_visitor::reladdr_to_temp(ir_instruction *ir,
    if (reg->reladdr2) emit_arl(ir, address_reg2, *reg->reladdr2);
 
    if (*num_reladdr != 1) {
-      st_src_reg temp = get_temp(glsl_type::vec4_type);
+      st_src_reg temp = get_temp(reg->type == GLSL_TYPE_DOUBLE ? glsl_type::dvec4_type : glsl_type::vec4_type);
 
       emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), *reg);
       *reg = temp;
@@ -2124,15 +2163,20 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       }
       break;
 
+   case ir_unop_pack_half_2x16:
+      emit_asm(ir, TGSI_OPCODE_PK2H, result_dst, op[0]);
+      break;
+   case ir_unop_unpack_half_2x16:
+      emit_asm(ir, TGSI_OPCODE_UP2H, result_dst, op[0]);
+      break;
+
    case ir_unop_pack_snorm_2x16:
    case ir_unop_pack_unorm_2x16:
-   case ir_unop_pack_half_2x16:
    case ir_unop_pack_snorm_4x8:
    case ir_unop_pack_unorm_4x8:
 
    case ir_unop_unpack_snorm_2x16:
    case ir_unop_unpack_unorm_2x16:
-   case ir_unop_unpack_half_2x16:
    case ir_unop_unpack_half_2x16_split_x:
    case ir_unop_unpack_half_2x16_split_y:
    case ir_unop_unpack_snorm_4x8:
@@ -2263,10 +2307,13 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 
             decl->mesa_index = var->data.location;
             decl->array_id = num_input_arrays + 1;
-            if (is_2d)
+            if (is_2d) {
                decl->array_size = type_size(var->type->fields.array);
-            else
+               decl->array_type = var->type->fields.array->without_array()->base_type;
+            } else {
                decl->array_size = type_size(var->type);
+               decl->array_type = var->type->without_array()->base_type;
+            }
             num_input_arrays++;
 
             entry = new(mem_ctx) variable_storage(var,
@@ -2289,10 +2336,13 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 
             decl->mesa_index = var->data.location;
             decl->array_id = num_output_arrays + 1;
-            if (is_2d)
+            if (is_2d) {
                decl->array_size = type_size(var->type->fields.array);
-            else
+               decl->array_type = var->type->fields.array->without_array()->base_type;
+            } else {
                decl->array_size = type_size(var->type);
+               decl->array_type = var->type->without_array()->base_type;
+            }
             num_output_arrays++;
 
             entry = new(mem_ctx) variable_storage(var,
@@ -2331,6 +2381,8 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 
    this->result = st_src_reg(entry->file, entry->index, var->type);
    this->result.array_id = entry->array_id;
+   if (this->shader->Stage == MESA_SHADER_VERTEX && var->data.mode == ir_var_shader_in && var->type->is_double())
+      this->result.is_double_vertex_input = true;
    if (!native_integers)
       this->result.type = GLSL_TYPE_FLOAT;
 }
@@ -2338,6 +2390,7 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 static void
 shrink_array_declarations(struct array_decl *arrays, unsigned count,
                           GLbitfield64 usage_mask,
+                          GLbitfield64 double_usage_mask,
                           GLbitfield patch_usage_mask)
 {
    unsigned i, j;
@@ -2358,6 +2411,8 @@ shrink_array_declarations(struct array_decl *arrays, unsigned count,
          else {
             if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
                break;
+            if (double_usage_mask & BITFIELD64_BIT(decl->mesa_index+j-1))
+               break;
          }
 
          decl->mesa_index++;
@@ -2375,6 +2430,8 @@ shrink_array_declarations(struct array_decl *arrays, unsigned count,
          else {
             if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
                break;
+            if (double_usage_mask & BITFIELD64_BIT(decl->mesa_index+j-1))
+               break;
          }
 
          decl->array_size--;
@@ -2415,6 +2472,10 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
       element_size = 1;
 
    if (index) {
+
+      if (this->prog->Target == GL_VERTEX_PROGRAM_ARB &&
+	  src.file == PROGRAM_INPUT)
+	 element_size = attrib_type_size(ir->type, true);
       if (is_2D) {
          src.index2D = index->value.i[0];
          src.has_index2 = true;
@@ -2666,7 +2727,7 @@ glsl_to_tgsi_visitor::emit_block_mov(ir_assignment *ir, const struct glsl_type *
    if (type->is_matrix()) {
       const struct glsl_type *vec_type;
 
-      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
+      vec_type = glsl_type::get_instance(type->is_double() ? GLSL_TYPE_DOUBLE : GLSL_TYPE_FLOAT,
                                          type->vector_elements, 1);
 
       for (int i = 0; i < type->matrix_columns; i++) {
@@ -2696,6 +2757,11 @@ glsl_to_tgsi_visitor::emit_block_mov(ir_assignment *ir, const struct glsl_type *
    }
    l->index++;
    r->index++;
+   if (type->is_dual_slot_double()) {
+      l->index++;
+      if (r->is_double_vertex_input == false)
+	 r->index++;
+   }
 }
 
 void
@@ -2715,7 +2781,26 @@ glsl_to_tgsi_visitor::visit(ir_assignment *ir)
     */
    if (ir->write_mask == 0) {
       assert(!ir->lhs->type->is_scalar() && !ir->lhs->type->is_vector());
-      l.writemask = WRITEMASK_XYZW;
+
+      if (ir->lhs->type->is_array() || ir->lhs->type->without_array()->is_matrix()) {
+         if (ir->lhs->type->without_array()->is_double()) {
+            switch (ir->lhs->type->without_array()->vector_elements) {
+            case 1:
+               l.writemask = WRITEMASK_X;
+               break;
+            case 2:
+               l.writemask = WRITEMASK_XY;
+               break;
+            case 3:
+               l.writemask = WRITEMASK_XYZ;
+               break;
+            case 4:
+               l.writemask = WRITEMASK_XYZW;
+               break;
+            }
+         } else
+            l.writemask = WRITEMASK_XYZW;
+      }
    } else if (ir->lhs->type->is_scalar() &&
               !ir->lhs->type->is_double() &&
               ir->lhs->variable_referenced()->data.mode == ir_var_shader_out) {
@@ -2849,20 +2934,57 @@ glsl_to_tgsi_visitor::visit(ir_constant *ir)
       st_dst_reg mat_column = st_dst_reg(mat);
 
       for (i = 0; i < ir->type->matrix_columns; i++) {
-         assert(ir->type->base_type == GLSL_TYPE_FLOAT);
-         values = (gl_constant_value *) &ir->value.f[i * ir->type->vector_elements];
-
-         src = st_src_reg(file, -1, ir->type->base_type);
-         src.index = add_constant(file,
-                                  values,
-                                  ir->type->vector_elements,
-                                  GL_FLOAT,
-                                  &src.swizzle);
-         emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+         switch (ir->type->base_type) {
+         case GLSL_TYPE_FLOAT:
+            values = (gl_constant_value *) &ir->value.f[i * ir->type->vector_elements];
 
+            src = st_src_reg(file, -1, ir->type->base_type);
+            src.index = add_constant(file,
+                                     values,
+                                     ir->type->vector_elements,
+                                     GL_FLOAT,
+                                     &src.swizzle);
+            emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+            break;
+         case GLSL_TYPE_DOUBLE:
+            values = (gl_constant_value *) &ir->value.d[i * ir->type->vector_elements];
+            src = st_src_reg(file, -1, ir->type->base_type);
+            src.index = add_constant(file,
+                                     values,
+                                     ir->type->vector_elements,
+                                     GL_DOUBLE,
+                                     &src.swizzle);
+            if (ir->type->vector_elements >= 2) {
+               mat_column.writemask = WRITEMASK_XY;
+               src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
+               emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+            } else {
+               mat_column.writemask = WRITEMASK_X;
+               src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X);
+               emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+            }
+            src.index++;
+            if (ir->type->vector_elements > 2) {
+               if (ir->type->vector_elements == 4) {
+                  mat_column.writemask = WRITEMASK_ZW;
+                  src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
+                  emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+               } else {
+                  mat_column.writemask = WRITEMASK_Z;
+                  src.swizzle = MAKE_SWIZZLE4(SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y);
+                  emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+                  mat_column.writemask = WRITEMASK_XYZW;
+                  src.swizzle = SWIZZLE_XYZW;
+               }
+               mat_column.index++;
+            }
+            break;
+         default:
+            unreachable("Illegal matrix constant type.\n");
+            break;
+         }
          mat_column.index++;
       }
-
       this->result = mat;
       return;
    }
@@ -4328,8 +4450,8 @@ const unsigned _mesa_sysval_to_semantic[SYSTEM_VALUE_MAX] = {
    TGSI_SEMANTIC_INSTANCEID,
    TGSI_SEMANTIC_VERTEXID_NOBASE,
    TGSI_SEMANTIC_BASEVERTEX,
-   0, /* SYSTEM_VALUE_BASE_INSTANCE */
-   0, /* SYSTEM_VALUE_DRAW_ID */
+   TGSI_SEMANTIC_BASEINSTANCE,
+   TGSI_SEMANTIC_DRAWID,
 
    /* Geometry shader
     */
@@ -4337,6 +4459,7 @@ const unsigned _mesa_sysval_to_semantic[SYSTEM_VALUE_MAX] = {
 
    /* Fragment shader
     */
+   TGSI_SEMANTIC_POSITION,
    TGSI_SEMANTIC_FACE,
    TGSI_SEMANTIC_SAMPLEID,
    TGSI_SEMANTIC_SAMPLEPOS,
@@ -4545,7 +4668,7 @@ src_register(struct st_translate *t, const st_src_reg *reg)
       if (!reg->array_id) {
          assert(t->inputMapping[index] < ARRAY_SIZE(t->inputs));
          assert(t->inputs[t->inputMapping[index]].File != TGSI_FILE_NULL);
-         return t->inputs[t->inputMapping[index]];
+         return t->inputs[t->inputMapping[index] + double_reg2];
       }
       else {
          struct array_decl *decl = &t->input_arrays[reg->array_id-1];
@@ -4554,7 +4677,7 @@ src_register(struct st_translate *t, const st_src_reg *reg)
 
          assert(slot != -1 && t->inputs[slot].File == TGSI_FILE_INPUT);
          assert(t->inputs[slot].ArrayID == reg->array_id);
-         return ureg_src_array_offset(t->inputs[slot], index - mesa_index);
+         return ureg_src_array_offset(t->inputs[slot], index + double_reg2 - mesa_index);
       }
 
    case PROGRAM_ADDRESS:
@@ -4783,10 +4906,11 @@ compile_tgsi_instruction(struct st_translate *t,
  * a FBO is bound (STATE_FB_WPOS_Y_TRANSFORM).
  */
 static void
-emit_wpos_adjustment( struct st_translate *t,
-                      int wpos_transform_const,
-                      boolean invert,
-                      GLfloat adjX, GLfloat adjY[2])
+emit_wpos_adjustment(struct gl_context *ctx,
+                     struct st_translate *t,
+                     int wpos_transform_const,
+                     boolean invert,
+                     GLfloat adjX, GLfloat adjY[2])
 {
    struct ureg_program *ureg = t->ureg;
 
@@ -4798,7 +4922,11 @@ emit_wpos_adjustment( struct st_translate *t,
     */
    struct ureg_src wpostrans = ureg_DECL_constant(ureg, wpos_transform_const);
    struct ureg_dst wpos_temp = ureg_DECL_temporary( ureg );
-   struct ureg_src wpos_input = t->inputs[t->inputMapping[VARYING_SLOT_POS]];
+   struct ureg_src *wpos =
+      ctx->Const.GLSLFragCoordIsSysVal ?
+         &t->systemValues[SYSTEM_VALUE_FRAG_COORD] :
+         &t->inputs[t->inputMapping[VARYING_SLOT_POS]];
+   struct ureg_src wpos_input = *wpos;
 
    /* First, apply the coordinate shift: */
    if (adjX || adjY[0] || adjY[1]) {
@@ -4849,7 +4977,7 @@ emit_wpos_adjustment( struct st_translate *t,
 
    /* Use wpos_temp as position input from here on:
     */
-   t->inputs[t->inputMapping[VARYING_SLOT_POS]] = ureg_src(wpos_temp);
+   *wpos = ureg_src(wpos_temp);
 }
 
 
@@ -4958,7 +5086,7 @@ emit_wpos(struct st_context *st,
 
    /* we invert after adjustment so that we avoid the MOV to temporary,
     * and reuse the adjustment ADD instead */
-   emit_wpos_adjustment(t, wpos_transform_const, invert, adjX, adjY);
+   emit_wpos_adjustment(st->ctx, t, wpos_transform_const, invert, adjX, adjY);
 }
 
 /**
@@ -5248,11 +5376,13 @@ st_translate_program(
     */
    {
       GLbitfield sysInputs = proginfo->SystemValuesRead;
-      unsigned numSys = 0;
+
       for (i = 0; sysInputs; i++) {
          if (sysInputs & (1 << i)) {
             unsigned semName = _mesa_sysval_to_semantic[i];
-            t->systemValues[i] = ureg_DECL_system_value(ureg, numSys, semName, 0);
+
+            t->systemValues[i] = ureg_DECL_system_value(ureg, semName, 0);
+
             if (semName == TGSI_SEMANTIC_INSTANCEID ||
                 semName == TGSI_SEMANTIC_VERTEXID) {
                /* From Gallium perspective, these system values are always
@@ -5273,7 +5403,12 @@ st_translate_program(
                   t->systemValues[i] = ureg_scalar(ureg_src(temp), 0);
                }
             }
-            numSys++;
+
+            if (procType == TGSI_PROCESSOR_FRAGMENT &&
+                semName == TGSI_SEMANTIC_POSITION)
+               emit_wpos(st_context(ctx), t, proginfo, ureg,
+                         program->wpos_transform_const);
+
             sysInputs &= ~(1 << i);
          }
       }
@@ -5553,14 +5688,15 @@ get_mesa_program(struct gl_context *ctx,
 
    do_set_program_inouts(shader->ir, prog, shader->Stage);
    shrink_array_declarations(v->input_arrays, v->num_input_arrays,
-                             prog->InputsRead, prog->PatchInputsRead);
+                             prog->InputsRead, prog->DoubleInputsRead, prog->PatchInputsRead);
    shrink_array_declarations(v->output_arrays, v->num_output_arrays,
-                             prog->OutputsWritten, prog->PatchOutputsWritten);
+                             prog->OutputsWritten, 0ULL, prog->PatchOutputsWritten);
    count_resources(v, prog);
 
    /* This must be done before the uniform storage is associated. */
    if (shader->Type == GL_FRAGMENT_SHADER &&
-       prog->InputsRead & VARYING_BIT_POS){
+       (prog->InputsRead & VARYING_BIT_POS ||
+        prog->SystemValuesRead & (1 << SYSTEM_VALUE_FRAG_COORD))) {
       static const gl_state_index wposTransformState[STATE_LENGTH] = {
          STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM
       };
@@ -5571,6 +5707,12 @@ get_mesa_program(struct gl_context *ctx,
 
    _mesa_reference_program(ctx, &shader->Program, prog);
 
+   /* Avoid reallocation of the program parameter list, because the uniform
+    * storage is only associated with the original parameter list.
+    * This should be enough for Bitmap and DrawPixels constants.
+    */
+   _mesa_reserve_parameter_storage(prog->Parameters, 8);
+
    /* This has to be done last.  Any operation the can cause
     * prog->ParameterValues to get reallocated (e.g., anything that adds a
     * program constant) has to happen before creating this linkage.
@@ -5730,13 +5872,14 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
                                LOWER_PACK_SNORM_4x8 |
                                LOWER_UNPACK_SNORM_4x8 |
                                LOWER_UNPACK_UNORM_4x8 |
-                               LOWER_PACK_UNORM_4x8 |
-                               LOWER_PACK_HALF_2x16 |
-                               LOWER_UNPACK_HALF_2x16;
+                               LOWER_PACK_UNORM_4x8;
 
          if (ctx->Extensions.ARB_gpu_shader5)
             lower_inst |= LOWER_PACK_USE_BFI |
                           LOWER_PACK_USE_BFE;
+         if (!ctx->st->has_half_float_packing)
+            lower_inst |= LOWER_PACK_HALF_2x16 |
+                          LOWER_UNPACK_HALF_2x16;
 
          lower_packing_builtins(ir, lower_inst);
       }
diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c
index d0d261f4fde..385e26b946e 100644
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -39,6 +39,7 @@
 #include "st_texture.h"
 
 #include "st_context.h"
+#include "st_debug.h"
 #include "st_extensions.h"
 #include "st_format.h"
 #include "st_cb_fbo.h"
@@ -623,58 +624,6 @@ st_context_destroy(struct st_context_iface *stctxi)
    st_destroy_context(st);
 }
 
-static void
-st_debug_message(void *data,
-                 unsigned *id,
-                 enum pipe_debug_type ptype,
-                 const char *fmt,
-                 va_list args)
-{
-   struct st_context *st = data;
-   enum mesa_debug_source source;
-   enum mesa_debug_type type;
-   enum mesa_debug_severity severity;
-
-   switch (ptype) {
-   case PIPE_DEBUG_TYPE_OUT_OF_MEMORY:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_ERROR;
-      severity = MESA_DEBUG_SEVERITY_MEDIUM;
-      break;
-   case PIPE_DEBUG_TYPE_ERROR:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_ERROR;
-      severity = MESA_DEBUG_SEVERITY_MEDIUM;
-      break;
-   case PIPE_DEBUG_TYPE_SHADER_INFO:
-      source = MESA_DEBUG_SOURCE_SHADER_COMPILER;
-      type = MESA_DEBUG_TYPE_OTHER;
-      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
-      break;
-   case PIPE_DEBUG_TYPE_PERF_INFO:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_PERFORMANCE;
-      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
-      break;
-   case PIPE_DEBUG_TYPE_INFO:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_OTHER;
-      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
-      break;
-   case PIPE_DEBUG_TYPE_FALLBACK:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_PERFORMANCE;
-      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
-      break;
-   case PIPE_DEBUG_TYPE_CONFORMANCE:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_OTHER;
-      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
-      break;
-   }
-   _mesa_gl_vdebug(st->ctx, id, source, type, severity, fmt, args);
-}
-
 static struct st_context_iface *
 st_api_create_context(struct st_api *stapi, struct st_manager *smapi,
                       const struct st_context_attribs *attribs,
@@ -723,17 +672,15 @@ st_api_create_context(struct st_api *stapi, struct st_manager *smapi,
       return NULL;
    }
 
-   if (attribs->flags & ST_CONTEXT_FLAG_DEBUG){
+   if (attribs->flags & ST_CONTEXT_FLAG_DEBUG) {
       if (!_mesa_set_debug_state_int(st->ctx, GL_DEBUG_OUTPUT, GL_TRUE)) {
          *error = ST_CONTEXT_ERROR_NO_MEMORY;
          return NULL;
       }
+
       st->ctx->Const.ContextFlags |= GL_CONTEXT_FLAG_DEBUG_BIT;
 
-      if (pipe->set_debug_callback) {
-         struct pipe_debug_callback cb = { st_debug_message, st };
-         pipe->set_debug_callback(pipe, &cb);
-      }
+      st_enable_debug_output(st, TRUE);
    }
 
    if (attribs->flags & ST_CONTEXT_FLAG_FORWARD_COMPATIBLE)
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index 4b9dc994ea5..be47823a048 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -475,24 +475,6 @@ static void emit_swz( struct st_translate *t,
 }
 
 
-/**
- * Negate the value of DDY to match GL semantics where (0,0) is the
- * lower-left corner of the window.
- * Note that the GL_ARB_fragment_coord_conventions extension will
- * effect this someday.
- */
-static void emit_ddy( struct st_translate *t,
-                      struct ureg_dst dst,
-                      const struct prog_src_register *SrcReg )
-{
-   struct ureg_program *ureg = t->ureg;
-   struct ureg_src src = translate_src( t, SrcReg );
-   src = ureg_negate( src );
-   ureg_DDY( ureg, dst, src );
-}
-
-
-
 static unsigned
 translate_opcode( unsigned op )
 {
@@ -714,10 +696,6 @@ compile_instruction(
        */
       ureg_MOV( ureg, dst[0], ureg_imm1f(ureg, 0.5) );
       break;
-		 
-   case OPCODE_DDY:
-      emit_ddy( t, dst[0], &inst->SrcReg[0] );
-      break;
 
    case OPCODE_RSQ:
       ureg_RSQ( ureg, dst[0], ureg_abs(src[0]) );
@@ -739,10 +717,11 @@ compile_instruction(
  * a FBO is bound (STATE_FB_WPOS_Y_TRANSFORM).
  */
 static void
-emit_wpos_adjustment( struct st_translate *t,
-                      const struct gl_program *program,
-                      boolean invert,
-                      GLfloat adjX, GLfloat adjY[2])
+emit_wpos_adjustment(struct gl_context *ctx,
+                     struct st_translate *t,
+                     const struct gl_program *program,
+                     boolean invert,
+                     GLfloat adjX, GLfloat adjY[2])
 {
    struct ureg_program *ureg = t->ureg;
 
@@ -762,7 +741,11 @@ emit_wpos_adjustment( struct st_translate *t,
 
    struct ureg_src wpostrans = ureg_DECL_constant( ureg, wposTransConst );
    struct ureg_dst wpos_temp = ureg_DECL_temporary( ureg );
-   struct ureg_src wpos_input = t->inputs[t->inputMapping[VARYING_SLOT_POS]];
+   struct ureg_src *wpos =
+      ctx->Const.GLSLFragCoordIsSysVal ?
+         &t->systemValues[SYSTEM_VALUE_FRAG_COORD] :
+         &t->inputs[t->inputMapping[VARYING_SLOT_POS]];
+   struct ureg_src wpos_input = *wpos;
 
    /* First, apply the coordinate shift: */
    if (adjX || adjY[0] || adjY[1]) {
@@ -813,7 +796,7 @@ emit_wpos_adjustment( struct st_translate *t,
 
    /* Use wpos_temp as position input from here on:
     */
-   t->inputs[t->inputMapping[VARYING_SLOT_POS]] = ureg_src(wpos_temp);
+   *wpos = ureg_src(wpos_temp);
 }
 
 
@@ -921,32 +904,7 @@ emit_wpos(struct st_context *st,
 
    /* we invert after adjustment so that we avoid the MOV to temporary,
     * and reuse the adjustment ADD instead */
-   emit_wpos_adjustment(t, program, invert, adjX, adjY);
-}
-
-
-/**
- * OpenGL's fragment gl_FrontFace input is 1 for front-facing, 0 for back.
- * TGSI uses +1 for front, -1 for back.
- * This function converts the TGSI value to the GL value.  Simply clamping/
- * saturating the value to [0,1] does the job.
- */
-static void
-emit_face_var( struct st_translate *t,
-               const struct gl_program *program )
-{
-   struct ureg_program *ureg = t->ureg;
-   struct ureg_dst face_temp = ureg_DECL_temporary( ureg );
-   struct ureg_src face_input = t->inputs[t->inputMapping[VARYING_SLOT_FACE]];
-
-   /* MOV_SAT face_temp, input[face]
-    */
-   face_temp = ureg_saturate( face_temp );
-   ureg_MOV( ureg, face_temp, face_input );
-
-   /* Use face_temp as face input from here on:
-    */
-   t->inputs[t->inputMapping[VARYING_SLOT_FACE]] = ureg_src(face_temp);
+   emit_wpos_adjustment(st->ctx, t, program, invert, adjX, adjY);
 }
 
 
@@ -1020,10 +978,6 @@ st_translate_mesa_program(
          emit_wpos(st_context(ctx), t, program, ureg);
       }
 
-      if (program->InputsRead & VARYING_BIT_FACE) {
-         emit_face_var( t, program );
-      }
-
       /*
        * Declare output attributes.
        */
@@ -1100,11 +1054,13 @@ st_translate_mesa_program(
     */
    {
       GLbitfield sysInputs = program->SystemValuesRead;
-      unsigned numSys = 0;
+
       for (i = 0; sysInputs; i++) {
          if (sysInputs & (1 << i)) {
             unsigned semName = _mesa_sysval_to_semantic[i];
-            t->systemValues[i] = ureg_DECL_system_value(ureg, numSys, semName, 0);
+
+            t->systemValues[i] = ureg_DECL_system_value(ureg, semName, 0);
+
             if (semName == TGSI_SEMANTIC_INSTANCEID ||
                 semName == TGSI_SEMANTIC_VERTEXID) {
                /* From Gallium perspective, these system values are always
@@ -1125,7 +1081,11 @@ st_translate_mesa_program(
                   t->systemValues[i] = ureg_scalar(ureg_src(temp), 0);
                }
             }
-            numSys++;
+
+            if (procType == TGSI_PROCESSOR_FRAGMENT &&
+                semName == TGSI_SEMANTIC_POSITION)
+               emit_wpos(st_context(ctx), t, program, ureg);
+
             sysInputs &= ~(1 << i);
          }
       }
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 75ccaf2f26b..b3954547418 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -112,8 +112,6 @@ delete_fp_variant(struct st_context *st, struct st_fp_variant *fpv)
 {
    if (fpv->driver_shader) 
       cso_delete_fragment_shader(st->cso_context, fpv->driver_shader);
-   if (fpv->parameters)
-      _mesa_free_parameter_list(fpv->parameters);
    free(fpv);
 }
 
@@ -583,8 +581,11 @@ st_translate_fragment_program(struct st_context *st,
 
    memset(inputSlotToAttr, ~0, sizeof(inputSlotToAttr));
 
-   if (!stfp->glsl_to_tgsi)
+   if (!stfp->glsl_to_tgsi) {
       _mesa_remove_output_reads(&stfp->Base.Base, PROGRAM_OUTPUT);
+      if (st->ctx->Const.GLSLFragCoordIsSysVal)
+         _mesa_program_fragment_position_to_sysval(&stfp->Base.Base);
+   }
 
    /*
     * Convert Mesa program inputs to TGSI input register semantics.
@@ -914,8 +915,6 @@ st_create_fp_variant(struct st_context *st,
          if (tgsi.tokens != stfp->tgsi.tokens)
             tgsi_free_tokens(tgsi.tokens);
          tgsi.tokens = tokens;
-         variant->parameters =
-            _mesa_clone_parameter_list(stfp->Base.Base.Parameters);
       } else
          fprintf(stderr, "mesa: cannot create a shader for glBitmap\n");
    }
@@ -924,6 +923,7 @@ st_create_fp_variant(struct st_context *st,
    if (key->drawpixels) {
       const struct tgsi_token *tokens;
       unsigned scale_const = 0, bias_const = 0, texcoord_const = 0;
+      struct gl_program_parameter_list *params = stfp->Base.Base.Parameters;
 
       /* Find the first unused slot. */
       variant->drawpix_sampler = ffs(~stfp->Base.Base.SamplersUsed) - 1;
@@ -935,27 +935,21 @@ st_create_fp_variant(struct st_context *st,
          variant->pixelmap_sampler = ffs(~samplers_used) - 1;
       }
 
-      variant->parameters =
-         _mesa_clone_parameter_list(stfp->Base.Base.Parameters);
-
       if (key->scaleAndBias) {
          static const gl_state_index scale_state[STATE_LENGTH] =
             { STATE_INTERNAL, STATE_PT_SCALE };
          static const gl_state_index bias_state[STATE_LENGTH] =
             { STATE_INTERNAL, STATE_PT_BIAS };
 
-         scale_const = _mesa_add_state_reference(variant->parameters,
-                                                 scale_state);
-         bias_const = _mesa_add_state_reference(variant->parameters,
-                                                bias_state);
+         scale_const = _mesa_add_state_reference(params, scale_state);
+         bias_const = _mesa_add_state_reference(params, bias_state);
       }
 
       {
          static const gl_state_index state[STATE_LENGTH] =
             { STATE_INTERNAL, STATE_CURRENT_ATTRIB, VERT_ATTRIB_TEX0 };
 
-         texcoord_const = _mesa_add_state_reference(variant->parameters,
-                                                    state);
+         texcoord_const = _mesa_add_state_reference(params, state);
       }
 
       tokens = st_get_drawpix_shader(tgsi.tokens,
diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h
index d9b53ac008c..a74531581b4 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -80,7 +80,6 @@ struct st_fp_variant
    void *driver_shader;
 
    /** For glBitmap variants */
-   struct gl_program_parameter_list *parameters;
    uint bitmap_sampler;
 
    /** For glDrawPixels variants */
@@ -406,12 +405,12 @@ st_get_gp_variant(struct st_context *st,
 
 extern struct st_tcp_variant *
 st_get_tcp_variant(struct st_context *st,
-                   struct st_tessctrl_program *stgp,
+                   struct st_tessctrl_program *sttcp,
                    const struct st_tcp_variant_key *key);
 
 extern struct st_tep_variant *
 st_get_tep_variant(struct st_context *st,
-                   struct st_tesseval_program *stgp,
+                   struct st_tesseval_program *sttep,
                    const struct st_tep_variant_key *key);
 
 extern void
@@ -428,11 +427,11 @@ st_release_gp_variants(struct st_context *st,
 
 extern void
 st_release_tcp_variants(struct st_context *st,
-                        struct st_tessctrl_program *stgp);
+                        struct st_tessctrl_program *sttcp);
 
 extern void
 st_release_tep_variants(struct st_context *st,
-                        struct st_tesseval_program *stgp);
+                        struct st_tesseval_program *sttep);
 
 extern void
 st_destroy_program_variants(struct st_context *st);
diff --git a/src/mesa/vbo/vbo.h b/src/mesa/vbo/vbo.h
index cef3b8cd792..0b8b6a9de56 100644
--- a/src/mesa/vbo/vbo.h
+++ b/src/mesa/vbo/vbo.h
@@ -78,7 +78,7 @@ struct _mesa_index_buffer {
 
 GLboolean _vbo_CreateContext( struct gl_context *ctx );
 void _vbo_DestroyContext( struct gl_context *ctx );
-void _vbo_InvalidateState( struct gl_context *ctx, GLuint new_state );
+void _vbo_InvalidateState( struct gl_context *ctx, GLbitfield new_state );
 
 
 void
@@ -110,6 +110,18 @@ typedef void (*vbo_draw_func)( struct gl_context *ctx,
 			       struct gl_buffer_object *indirect);
 
 
+typedef void (*vbo_indirect_draw_func)(
+   struct gl_context *ctx,
+   GLuint mode,
+   struct gl_buffer_object *indirect_data,
+   GLsizeiptr indirect_offset,
+   unsigned draw_count,
+   unsigned stride,
+   struct gl_buffer_object *indirect_params,
+   GLsizeiptr indirect_params_offset,
+   const struct _mesa_index_buffer *ib);
+
+
 
 
 /* Utility function to cope with various constraints on tnl modules or
@@ -179,6 +191,9 @@ void vbo_always_unmap_buffers(struct gl_context *ctx);
 
 void vbo_set_draw_func(struct gl_context *ctx, vbo_draw_func func);
 
+void vbo_set_indirect_draw_func(struct gl_context *ctx,
+                                vbo_indirect_draw_func func);
+
 void vbo_check_buffers_are_unmapped(struct gl_context *ctx);
 
 void vbo_bind_arrays(struct gl_context *ctx);
diff --git a/src/mesa/vbo/vbo_context.c b/src/mesa/vbo/vbo_context.c
index 5e1a760eb2c..9f807a17512 100644
--- a/src/mesa/vbo/vbo_context.c
+++ b/src/mesa/vbo/vbo_context.c
@@ -135,6 +135,48 @@ static void init_mat_currval(struct gl_context *ctx)
    }
 }
 
+static void
+vbo_draw_indirect_prims(struct gl_context *ctx,
+                        GLuint mode,
+                        struct gl_buffer_object *indirect_data,
+                        GLsizeiptr indirect_offset,
+                        unsigned draw_count,
+                        unsigned stride,
+                        struct gl_buffer_object *indirect_params,
+                        GLsizeiptr indirect_params_offset,
+                        const struct _mesa_index_buffer *ib)
+{
+   struct vbo_context *vbo = vbo_context(ctx);
+   struct _mesa_prim *prim;
+   GLsizei i;
+
+   prim = calloc(draw_count, sizeof(*prim));
+   if (prim == NULL) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "gl%sDraw%sIndirect%s",
+                  (draw_count > 1) ? "Multi" : "",
+                  ib ? "Elements" : "Arrays",
+                  indirect_params ? "CountARB" : "");
+      return;
+   }
+
+   prim[0].begin = 1;
+   prim[draw_count - 1].end = 1;
+   for (i = 0; i < draw_count; ++i, indirect_offset += stride) {
+      prim[i].mode = mode;
+      prim[i].indexed = !!ib;
+      prim[i].indirect_offset = indirect_offset;
+      prim[i].is_indirect = 1;
+      prim[i].draw_id = i;
+   }
+
+   vbo->draw_prims(ctx, prim, draw_count,
+                   ib, GL_TRUE, 0, ~0,
+                   NULL, 0,
+                   ctx->DrawIndirectBuffer);
+
+   free(prim);
+}
+
 
 GLboolean _vbo_CreateContext( struct gl_context *ctx )
 {
@@ -152,6 +194,7 @@ GLboolean _vbo_CreateContext( struct gl_context *ctx )
    init_legacy_currval( ctx );
    init_generic_currval( ctx );
    init_mat_currval( ctx );
+   vbo_set_indirect_draw_func(ctx, vbo_draw_indirect_prims);
 
    /* Build mappings from VERT_ATTRIB -> VBO_ATTRIB depending on type
     * of vertex program active.
@@ -186,7 +229,7 @@ GLboolean _vbo_CreateContext( struct gl_context *ctx )
 }
 
 
-void _vbo_InvalidateState( struct gl_context *ctx, GLuint new_state )
+void _vbo_InvalidateState( struct gl_context *ctx, GLbitfield new_state )
 {
    vbo_exec_invalidate_state(ctx, new_state);
 }
@@ -223,3 +266,10 @@ void vbo_set_draw_func(struct gl_context *ctx, vbo_draw_func func)
    vbo->draw_prims = func;
 }
 
+
+void vbo_set_indirect_draw_func(struct gl_context *ctx,
+                                vbo_indirect_draw_func func)
+{
+   struct vbo_context *vbo = vbo_context(ctx);
+   vbo->draw_indirect_prims = func;
+}
diff --git a/src/mesa/vbo/vbo_context.h b/src/mesa/vbo/vbo_context.h
index 6293a8b9edc..11f9b17c7c4 100644
--- a/src/mesa/vbo/vbo_context.h
+++ b/src/mesa/vbo/vbo_context.h
@@ -76,6 +76,12 @@ struct vbo_context {
     * is responsible for initiating any fallback actions required:
     */
    vbo_draw_func draw_prims;
+
+   /* Optional callback for indirect draws. This allows multidraws to not be
+    * broken up, as well as for the actual count to be passed in as a separate
+    * indirect parameter.
+    */
+   vbo_indirect_draw_func draw_indirect_prims;
 };
 
 
diff --git a/src/mesa/vbo/vbo_exec.c b/src/mesa/vbo/vbo_exec.c
index a301c6c9a22..4db4f4088b9 100644
--- a/src/mesa/vbo/vbo_exec.c
+++ b/src/mesa/vbo/vbo_exec.c
@@ -73,7 +73,7 @@ void vbo_exec_destroy( struct gl_context *ctx )
  * invoked according to the state flags.  That will have to wait for a
  * mesa rework:
  */ 
-void vbo_exec_invalidate_state( struct gl_context *ctx, GLuint new_state )
+void vbo_exec_invalidate_state( struct gl_context *ctx, GLbitfield new_state )
 {
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
diff --git a/src/mesa/vbo/vbo_exec.h b/src/mesa/vbo/vbo_exec.h
index a80b2c908d1..27bff4a2aa9 100644
--- a/src/mesa/vbo/vbo_exec.h
+++ b/src/mesa/vbo/vbo_exec.h
@@ -146,7 +146,7 @@ struct vbo_exec_context
  */
 void vbo_exec_init( struct gl_context *ctx );
 void vbo_exec_destroy( struct gl_context *ctx );
-void vbo_exec_invalidate_state( struct gl_context *ctx, GLuint new_state );
+void vbo_exec_invalidate_state( struct gl_context *ctx, GLbitfield new_state );
 
 
 /* Internal functions:
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index 502b2885892..02139ef881f 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -1546,27 +1546,14 @@ vbo_validated_drawarraysindirect(struct gl_context *ctx,
 {
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
-   struct _mesa_prim prim[1];
 
    vbo_bind_arrays(ctx);
 
-   memset(prim, 0, sizeof(prim));
-   prim[0].begin = 1;
-   prim[0].end = 1;
-   prim[0].mode = mode;
-   prim[0].is_indirect = 1;
-   prim[0].indirect_offset = (GLsizeiptr)indirect;
-
-   /* NOTE: We do NOT want to handle primitive restart here, nor perform any
-    * other checks that require knowledge of the values in the command buffer.
-    * That would defeat the whole purpose of this function.
-    */
-
    check_buffers_are_unmapped(exec->array.inputs);
-   vbo->draw_prims(ctx, prim, 1,
-                   NULL, GL_TRUE, 0, ~0,
-                   NULL, 0,
-                   ctx->DrawIndirectBuffer);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, (GLsizeiptr)indirect,
+                            1 /* draw_count */, 16 /* stride */,
+                            NULL, 0, NULL);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
       _mesa_flush(ctx);
@@ -1580,36 +1567,18 @@ vbo_validated_multidrawarraysindirect(struct gl_context *ctx,
 {
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
-   struct _mesa_prim *prim;
-   GLsizei i;
    GLsizeiptr offset = (GLsizeiptr)indirect;
 
    if (primcount == 0)
       return;
-   prim = calloc(primcount, sizeof(*prim));
-   if (prim == NULL) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glMultiDrawArraysIndirect");
-      return;
-   }
 
    vbo_bind_arrays(ctx);
 
-   prim[0].begin = 1;
-   prim[primcount - 1].end = 1;
-   for (i = 0; i < primcount; ++i, offset += stride) {
-      prim[i].mode = mode;
-      prim[i].indirect_offset = offset;
-      prim[i].is_indirect = 1;
-      prim[i].draw_id = i;
-   }
-
    check_buffers_are_unmapped(exec->array.inputs);
-   vbo->draw_prims(ctx, prim, primcount,
-                   NULL, GL_TRUE, 0, ~0,
-                   NULL, 0,
-                   ctx->DrawIndirectBuffer);
-
-   free(prim);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, offset,
+                            primcount, stride,
+                            NULL, 0, NULL);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
       _mesa_flush(ctx);
@@ -1623,7 +1592,6 @@ vbo_validated_drawelementsindirect(struct gl_context *ctx,
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
    struct _mesa_index_buffer ib;
-   struct _mesa_prim prim[1];
 
    vbo_bind_arrays(ctx);
 
@@ -1632,19 +1600,12 @@ vbo_validated_drawelementsindirect(struct gl_context *ctx,
    ib.obj = ctx->Array.VAO->IndexBufferObj;
    ib.ptr = NULL;
 
-   memset(prim, 0, sizeof(prim));
-   prim[0].begin = 1;
-   prim[0].end = 1;
-   prim[0].mode = mode;
-   prim[0].indexed = 1;
-   prim[0].indirect_offset = (GLsizeiptr)indirect;
-   prim[0].is_indirect = 1;
-
    check_buffers_are_unmapped(exec->array.inputs);
-   vbo->draw_prims(ctx, prim, 1,
-                   &ib, GL_TRUE, 0, ~0,
-                   NULL, 0,
-                   ctx->DrawIndirectBuffer);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, (GLsizeiptr)indirect,
+                            1 /* draw_count */, 20 /* stride */,
+                            NULL, 0,
+                            &ib);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
       _mesa_flush(ctx);
@@ -1659,17 +1620,10 @@ vbo_validated_multidrawelementsindirect(struct gl_context *ctx,
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
    struct _mesa_index_buffer ib;
-   struct _mesa_prim *prim;
-   GLsizei i;
    GLsizeiptr offset = (GLsizeiptr)indirect;
 
    if (primcount == 0)
       return;
-   prim = calloc(primcount, sizeof(*prim));
-   if (prim == NULL) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glMultiDrawElementsIndirect");
-      return;
-   }
 
    vbo_bind_arrays(ctx);
 
@@ -1680,23 +1634,12 @@ vbo_validated_multidrawelementsindirect(struct gl_context *ctx,
    ib.obj = ctx->Array.VAO->IndexBufferObj;
    ib.ptr = NULL;
 
-   prim[0].begin = 1;
-   prim[primcount - 1].end = 1;
-   for (i = 0; i < primcount; ++i, offset += stride) {
-      prim[i].mode = mode;
-      prim[i].indexed = 1;
-      prim[i].indirect_offset = offset;
-      prim[i].is_indirect = 1;
-      prim[i].draw_id = i;
-   }
-
    check_buffers_are_unmapped(exec->array.inputs);
-   vbo->draw_prims(ctx, prim, primcount,
-                   &ib, GL_TRUE, 0, ~0,
-                   NULL, 0,
-                   ctx->DrawIndirectBuffer);
-
-   free(prim);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, offset,
+                            primcount, stride,
+                            NULL, 0,
+                            &ib);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
       _mesa_flush(ctx);
@@ -1789,6 +1732,128 @@ vbo_exec_MultiDrawElementsIndirect(GLenum mode, GLenum type,
                                            primcount, stride);
 }
 
+static void
+vbo_validated_multidrawarraysindirectcount(struct gl_context *ctx,
+                                           GLenum mode,
+                                           GLintptr indirect,
+                                           GLintptr drawcount,
+                                           GLsizei maxdrawcount,
+                                           GLsizei stride)
+{
+   struct vbo_context *vbo = vbo_context(ctx);
+   struct vbo_exec_context *exec = &vbo->exec;
+   GLsizeiptr offset = indirect;
+
+   if (maxdrawcount == 0)
+      return;
+
+   vbo_bind_arrays(ctx);
+
+   check_buffers_are_unmapped(exec->array.inputs);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, offset,
+                            maxdrawcount, stride,
+                            ctx->ParameterBuffer, drawcount,
+                            NULL);
+
+   if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
+      _mesa_flush(ctx);
+}
+
+static void
+vbo_validated_multidrawelementsindirectcount(struct gl_context *ctx,
+                                             GLenum mode, GLenum type,
+                                             GLintptr indirect,
+                                             GLintptr drawcount,
+                                             GLsizei maxdrawcount,
+                                             GLsizei stride)
+{
+   struct vbo_context *vbo = vbo_context(ctx);
+   struct vbo_exec_context *exec = &vbo->exec;
+   struct _mesa_index_buffer ib;
+   GLsizeiptr offset = (GLsizeiptr)indirect;
+
+   if (maxdrawcount == 0)
+      return;
+
+   vbo_bind_arrays(ctx);
+
+   /* NOTE: IndexBufferObj is guaranteed to be a VBO. */
+
+   ib.count = 0; /* unknown */
+   ib.type = type;
+   ib.obj = ctx->Array.VAO->IndexBufferObj;
+   ib.ptr = NULL;
+
+   check_buffers_are_unmapped(exec->array.inputs);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, offset,
+                            maxdrawcount, stride,
+                            ctx->ParameterBuffer, drawcount,
+                            &ib);
+
+   if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
+      _mesa_flush(ctx);
+}
+
+static void GLAPIENTRY
+vbo_exec_MultiDrawArraysIndirectCount(GLenum mode,
+                                      GLintptr indirect,
+                                      GLintptr drawcount,
+                                      GLsizei maxdrawcount, GLsizei stride)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_DRAW)
+      _mesa_debug(ctx, "glMultiDrawArraysIndirectCountARB"
+                  "(%s, %lx, %lx, %i, %i)\n",
+                  _mesa_enum_to_string(mode), indirect,
+                  drawcount, maxdrawcount, stride);
+
+   /* If <stride> is zero, the array elements are treated as tightly packed. */
+   if (stride == 0)
+      stride = 4 * sizeof(GLuint); /* sizeof(DrawArraysIndirectCommand) */
+
+   if (!_mesa_validate_MultiDrawArraysIndirectCount(ctx, mode,
+                                                    indirect, drawcount,
+                                                    maxdrawcount, stride))
+      return;
+
+   vbo_validated_multidrawarraysindirectcount(ctx, mode,
+                                              indirect, drawcount,
+                                              maxdrawcount, stride);
+}
+
+static void GLAPIENTRY
+vbo_exec_MultiDrawElementsIndirectCount(GLenum mode, GLenum type,
+                                        GLintptr indirect,
+                                        GLintptr drawcount,
+                                        GLsizei maxdrawcount, GLsizei stride)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_DRAW)
+      _mesa_debug(ctx, "glMultiDrawElementsIndirectCountARB"
+                  "(%s, %s, %lx, %lx, %i, %i)\n",
+                  _mesa_enum_to_string(mode),
+                  _mesa_enum_to_string(type), indirect,
+                  drawcount, maxdrawcount, stride);
+
+   /* If <stride> is zero, the array elements are treated as tightly packed. */
+   if (stride == 0)
+      stride = 5 * sizeof(GLuint); /* sizeof(DrawElementsIndirectCommand) */
+
+   if (!_mesa_validate_MultiDrawElementsIndirectCount(ctx, mode, type,
+                                                      indirect, drawcount,
+                                                      maxdrawcount, stride))
+      return;
+
+   vbo_validated_multidrawelementsindirectcount(ctx, mode, type,
+                                                indirect, drawcount,
+                                                maxdrawcount, stride);
+}
+
+
 /**
  * Initialize the dispatch table with the VBO functions for drawing.
  */
@@ -1836,6 +1901,8 @@ vbo_initialize_exec_dispatch(const struct gl_context *ctx,
    if (ctx->API == API_OPENGL_CORE) {
       SET_MultiDrawArraysIndirect(exec, vbo_exec_MultiDrawArraysIndirect);
       SET_MultiDrawElementsIndirect(exec, vbo_exec_MultiDrawElementsIndirect);
+      SET_MultiDrawArraysIndirectCountARB(exec, vbo_exec_MultiDrawArraysIndirectCount);
+      SET_MultiDrawElementsIndirectCountARB(exec, vbo_exec_MultiDrawElementsIndirectCount);
    }
 
    if (_mesa_is_desktop_gl(ctx) || _mesa_is_gles3(ctx)) {