Diffstat (limited to 'src/mesa')
373 files changed, 14809 insertions, 6163 deletions
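Editor's note: the largest meta.c change in the diff below reworks _mesa_meta_setup_texture_coords() to take an xoffset/yoffset and a sub-region size alongside the full mip dimensions, so that glGetTexSubImage can decompress only part of an image. A minimal standalone sketch of the normalization those hunks perform for 2D-style targets (function and parameter names here are illustrative, not the exact Mesa code):

/* Map a sub-rectangle of a total_width x total_height image to [0,1]
 * texture coordinates, as done for GL_TEXTURE_2D-like targets.
 * GL_TEXTURE_RECTANGLE instead uses unnormalized texel coordinates,
 * i.e. xoffset .. xoffset + width directly. */
static void
sub_region_texcoords(int xoffset, int yoffset, int width, int height,
                     int total_width, int total_height,
                     float st[4][2])
{
   const float s0 = (float) xoffset / (float) total_width;
   const float s1 = (float) (xoffset + width) / (float) total_width;
   const float t0 = (float) yoffset / (float) total_height;
   const float t1 = (float) (yoffset + height) / (float) total_height;

   /* Quad corners in counter-clockwise order:
    * (s0,t0), (s1,t0), (s1,t1), (s0,t1). */
   st[0][0] = s0; st[0][1] = t0;
   st[1][0] = s1; st[1][1] = t0;
   st[2][0] = s1; st[2][1] = t1;
   st[3][0] = s0; st[3][1] = t1;
}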
diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am index 4ba5b2fac29..eb4a3da3c84 100644 --- a/src/mesa/Makefile.am +++ b/src/mesa/Makefile.am @@ -19,8 +19,6 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS # IN THE SOFTWARE. -AUTOMAKE_OPTIONS = subdir-objects - SUBDIRS = . main/tests if HAVE_X11_DRIVER diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources index 83f500fbf20..ed9848c5454 100644 --- a/src/mesa/Makefile.sources +++ b/src/mesa/Makefile.sources @@ -407,6 +407,7 @@ STATETRACKER_FILES = \ state_tracker/st_atom_shader.c \ state_tracker/st_atom_shader.h \ state_tracker/st_atom_stipple.c \ + state_tracker/st_atom_tess.c \ state_tracker/st_atom_texture.c \ state_tracker/st_atom_viewport.c \ state_tracker/st_cache.h \ diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c index 71c1a763912..6fe42b1775c 100644 --- a/src/mesa/drivers/common/driverfuncs.c +++ b/src/mesa/drivers/common/driverfuncs.c @@ -94,14 +94,14 @@ _mesa_init_driver_functions(struct dd_function_table *driver) driver->QuerySamplesForFormat = _mesa_query_samples_for_format; driver->TexImage = _mesa_store_teximage; driver->TexSubImage = _mesa_store_texsubimage; - driver->GetTexImage = _mesa_meta_GetTexImage; + driver->GetTexSubImage = _mesa_meta_GetTexSubImage; driver->ClearTexSubImage = _mesa_meta_ClearTexSubImage; driver->CopyTexSubImage = _mesa_meta_CopyTexSubImage; driver->GenerateMipmap = _mesa_meta_GenerateMipmap; driver->TestProxyTexImage = _mesa_test_proxy_teximage; driver->CompressedTexImage = _mesa_store_compressed_teximage; driver->CompressedTexSubImage = _mesa_store_compressed_texsubimage; - driver->GetCompressedTexImage = _mesa_GetCompressedTexImage_sw; + driver->GetCompressedTexSubImage = _mesa_GetCompressedTexSubImage_sw; driver->BindTexture = NULL; driver->NewTextureObject = _mesa_new_texture_object; driver->DeleteTexture = _mesa_delete_texture_object; diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c index 214a68a9129..bde544ef490 100644 --- a/src/mesa/drivers/common/meta.c +++ b/src/mesa/drivers/common/meta.c @@ -728,7 +728,7 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state) save->DepthNear = ctx->ViewportArray[0].Near; save->DepthFar = ctx->ViewportArray[0].Far; /* set depth range to default */ - _mesa_DepthRange(0.0, 1.0); + _mesa_set_depth_range(ctx, 0, 0.0, 1.0); } if (state & MESA_META_CLAMP_FRAGMENT_COLOR) { @@ -945,6 +945,8 @@ _mesa_meta_end(struct gl_context *ctx) if (state & MESA_META_SHADER) { static const GLenum targets[] = { GL_VERTEX_SHADER, + GL_TESS_CONTROL_SHADER, + GL_TESS_EVALUATION_SHADER, GL_GEOMETRY_SHADER, GL_FRAGMENT_SHADER, }; @@ -1129,7 +1131,7 @@ _mesa_meta_end(struct gl_context *ctx) _mesa_set_viewport(ctx, 0, save->ViewportX, save->ViewportY, save->ViewportW, save->ViewportH); } - _mesa_DepthRange(save->DepthNear, save->DepthFar); + _mesa_set_depth_range(ctx, 0, save->DepthNear, save->DepthFar); } if (state & MESA_META_CLAMP_FRAGMENT_COLOR && @@ -2449,30 +2451,53 @@ _mesa_meta_Bitmap(struct gl_context *ctx, /** * Compute the texture coordinates for the four vertices of a quad for - * drawing a 2D texture image or slice of a cube/3D texture. + * drawing a 2D texture image or slice of a cube/3D texture. The offset + * and width, height specify a sub-region of the 2D image. 
+ * * \param faceTarget GL_TEXTURE_1D/2D/3D or cube face name * \param slice slice of a 1D/2D array texture or 3D texture - * \param width width of the texture image - * \param height height of the texture image + * \param xoffset X position of sub texture + * \param yoffset Y position of sub texture + * \param width width of the sub texture image + * \param height height of the sub texture image + * \param total_width total width of the texture image + * \param total_height total height of the texture image + * \param total_depth total depth of the texture image * \param coords0/1/2/3 returns the computed texcoords */ void _mesa_meta_setup_texture_coords(GLenum faceTarget, GLint slice, + GLint xoffset, + GLint yoffset, GLint width, GLint height, - GLint depth, + GLint total_width, + GLint total_height, + GLint total_depth, GLfloat coords0[4], GLfloat coords1[4], GLfloat coords2[4], GLfloat coords3[4]) { - static const GLfloat st[4][2] = { - {0.0f, 0.0f}, {1.0f, 0.0f}, {1.0f, 1.0f}, {0.0f, 1.0f} - }; + float st[4][2]; GLuint i; + const float s0 = (float) xoffset / (float) total_width; + const float s1 = (float) (xoffset + width) / (float) total_width; + const float t0 = (float) yoffset / (float) total_height; + const float t1 = (float) (yoffset + height) / (float) total_height; GLfloat r; + /* setup the reference texcoords */ + st[0][0] = s0; + st[0][1] = t0; + st[1][0] = s1; + st[1][1] = t0; + st[2][0] = s1; + st[2][1] = t1; + st[3][0] = s0; + st[3][1] = t1; + if (faceTarget == GL_TEXTURE_CUBE_MAP_ARRAY) faceTarget = GL_TEXTURE_CUBE_MAP_POSITIVE_X + slice % 6; @@ -2489,52 +2514,52 @@ _mesa_meta_setup_texture_coords(GLenum faceTarget, case GL_TEXTURE_3D: case GL_TEXTURE_2D_ARRAY: if (faceTarget == GL_TEXTURE_3D) { - assert(slice < depth); - assert(depth >= 1); - r = (slice + 0.5f) / depth; + assert(slice < total_depth); + assert(total_depth >= 1); + r = (slice + 0.5f) / total_depth; } else if (faceTarget == GL_TEXTURE_2D_ARRAY) r = (float) slice; else r = 0.0F; - coords0[0] = 0.0F; /* s */ - coords0[1] = 0.0F; /* t */ + coords0[0] = st[0][0]; /* s */ + coords0[1] = st[0][1]; /* t */ coords0[2] = r; /* r */ - coords1[0] = 1.0F; - coords1[1] = 0.0F; + coords1[0] = st[1][0]; + coords1[1] = st[1][1]; coords1[2] = r; - coords2[0] = 1.0F; - coords2[1] = 1.0F; + coords2[0] = st[2][0]; + coords2[1] = st[2][1]; coords2[2] = r; - coords3[0] = 0.0F; - coords3[1] = 1.0F; + coords3[0] = st[3][0]; + coords3[1] = st[3][1]; coords3[2] = r; break; case GL_TEXTURE_RECTANGLE_ARB: - coords0[0] = 0.0F; /* s */ - coords0[1] = 0.0F; /* t */ + coords0[0] = (float) xoffset; /* s */ + coords0[1] = (float) yoffset; /* t */ coords0[2] = 0.0F; /* r */ - coords1[0] = (float) width; - coords1[1] = 0.0F; + coords1[0] = (float) (xoffset + width); + coords1[1] = (float) yoffset; coords1[2] = 0.0F; - coords2[0] = (float) width; - coords2[1] = (float) height; + coords2[0] = (float) (xoffset + width); + coords2[1] = (float) (yoffset + height); coords2[2] = 0.0F; - coords3[0] = 0.0F; - coords3[1] = (float) height; + coords3[0] = (float) xoffset; + coords3[1] = (float) (yoffset + height); coords3[2] = 0.0F; break; case GL_TEXTURE_1D_ARRAY: - coords0[0] = 0.0F; /* s */ + coords0[0] = st[0][0]; /* s */ coords0[1] = (float) slice; /* t */ coords0[2] = 0.0F; /* r */ - coords1[0] = 1.0f; + coords1[0] = st[1][0]; coords1[1] = (float) slice; coords1[2] = 0.0F; - coords2[0] = 1.0F; + coords2[0] = st[2][0]; coords2[1] = (float) slice; coords2[2] = 0.0F; - coords3[0] = 0.0F; + coords3[0] = st[3][0]; coords3[1] = (float) slice; coords3[2] 
= 0.0F; break; @@ -2943,15 +2968,14 @@ static bool decompress_texture_image(struct gl_context *ctx, struct gl_texture_image *texImage, GLuint slice, + GLint xoffset, GLint yoffset, + GLsizei width, GLsizei height, GLenum destFormat, GLenum destType, GLvoid *dest) { struct decompress_state *decompress = &ctx->Meta->Decompress; struct decompress_fbo_state *decompress_fbo; struct gl_texture_object *texObj = texImage->TexObject; - const GLint width = texImage->Width; - const GLint height = texImage->Height; - const GLint depth = texImage->Height; const GLenum target = texObj->Target; GLenum rbFormat; GLenum faceTarget; @@ -3069,7 +3093,10 @@ decompress_texture_image(struct gl_context *ctx, /* Silence valgrind warnings about reading uninitialized stack. */ memset(verts, 0, sizeof(verts)); - _mesa_meta_setup_texture_coords(faceTarget, slice, width, height, depth, + _mesa_meta_setup_texture_coords(faceTarget, slice, + xoffset, yoffset, width, height, + texImage->Width, texImage->Height, + texImage->Depth, verts[0].tex, verts[1].tex, verts[2].tex, @@ -3123,7 +3150,7 @@ decompress_texture_image(struct gl_context *ctx, /* read pixels from renderbuffer */ { GLenum baseTexFormat = texImage->_BaseFormat; - GLenum destBaseFormat = _mesa_base_tex_format(ctx, destFormat); + GLenum destBaseFormat = _mesa_unpack_format_to_base_format(destFormat); /* The pixel transfer state will be set to default values at this point * (see MESA_META_PIXEL_TRANSFER) so pixel transfer ops are effectively @@ -3132,19 +3159,13 @@ decompress_texture_image(struct gl_context *ctx, * returned as red and two-channel texture values are returned as * red/alpha. */ - if ((baseTexFormat == GL_LUMINANCE || - baseTexFormat == GL_LUMINANCE_ALPHA || - baseTexFormat == GL_INTENSITY) || + if (_mesa_need_luminance_to_rgb_conversion(baseTexFormat, + destBaseFormat) || /* If we're reading back an RGB(A) texture (using glGetTexImage) as * luminance then we need to return L=tex(R). */ - ((baseTexFormat == GL_RGBA || - baseTexFormat == GL_RGB || - baseTexFormat == GL_RG) && - (destBaseFormat == GL_LUMINANCE || - destBaseFormat == GL_LUMINANCE_ALPHA || - destBaseFormat == GL_LUMINANCE_INTEGER_EXT || - destBaseFormat == GL_LUMINANCE_ALPHA_INTEGER_EXT))) { + _mesa_need_rgb_to_luminance_conversion(baseTexFormat, + destBaseFormat)) { /* Green and blue must be zero */ _mesa_PixelTransferf(GL_GREEN_SCALE, 0.0f); _mesa_PixelTransferf(GL_BLUE_SCALE, 0.0f); @@ -3171,15 +3192,17 @@ decompress_texture_image(struct gl_context *ctx, * from core Mesa. 
*/ void -_mesa_meta_GetTexImage(struct gl_context *ctx, - GLenum format, GLenum type, GLvoid *pixels, - struct gl_texture_image *texImage) +_mesa_meta_GetTexSubImage(struct gl_context *ctx, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLsizei depth, + GLenum format, GLenum type, GLvoid *pixels, + struct gl_texture_image *texImage) { if (_mesa_is_format_compressed(texImage->TexFormat)) { GLuint slice; bool result = true; - for (slice = 0; slice < texImage->Depth; slice++) { + for (slice = 0; slice < depth; slice++) { void *dst; if (texImage->TexObject->Target == GL_TEXTURE_2D_ARRAY || texImage->TexObject->Target == GL_TEXTURE_CUBE_MAP_ARRAY) { @@ -3191,14 +3214,14 @@ _mesa_meta_GetTexImage(struct gl_context *ctx, struct gl_pixelstore_attrib packing = ctx->Pack; packing.SkipPixels = 0; packing.SkipRows = 0; - dst = _mesa_image_address3d(&packing, pixels, texImage->Width, - texImage->Height, format, type, - slice, 0, 0); + dst = _mesa_image_address3d(&packing, pixels, width, height, + format, type, slice, 0, 0); } else { dst = pixels; } result = decompress_texture_image(ctx, texImage, slice, + xoffset, yoffset, width, height, format, type, dst); if (!result) break; @@ -3208,7 +3231,8 @@ _mesa_meta_GetTexImage(struct gl_context *ctx, return; } - _mesa_GetTexImage_sw(ctx, format, type, pixels, texImage); + _mesa_GetTexSubImage_sw(ctx, xoffset, yoffset, zoffset, + width, height, depth, format, type, pixels, texImage); } diff --git a/src/mesa/drivers/common/meta.h b/src/mesa/drivers/common/meta.h index e7d894df1d7..fe439153aa0 100644 --- a/src/mesa/drivers/common/meta.h +++ b/src/mesa/drivers/common/meta.h @@ -560,9 +560,11 @@ _mesa_meta_ClearTexSubImage(struct gl_context *ctx, const GLvoid *clearValue); extern void -_mesa_meta_GetTexImage(struct gl_context *ctx, - GLenum format, GLenum type, GLvoid *pixels, - struct gl_texture_image *texImage); +_mesa_meta_GetTexSubImage(struct gl_context *ctx, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLsizei depth, + GLenum format, GLenum type, GLvoid *pixels, + struct gl_texture_image *texImage); extern void _mesa_meta_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z, @@ -594,9 +596,13 @@ _mesa_meta_alloc_texture(struct temp_texture *tex, void _mesa_meta_setup_texture_coords(GLenum faceTarget, GLint slice, + GLint xoffset, + GLint yoffset, GLint width, GLint height, - GLint depth, + GLint total_width, + GLint total_height, + GLint total_depth, GLfloat coords0[4], GLfloat coords1[4], GLfloat coords2[4], diff --git a/src/mesa/drivers/common/meta_blit.c b/src/mesa/drivers/common/meta_blit.c index 9cace2b245a..71d18de87db 100644 --- a/src/mesa/drivers/common/meta_blit.c +++ b/src/mesa/drivers/common/meta_blit.c @@ -82,7 +82,7 @@ setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx, y_scale = samples * 0.5; /* We expect only power of 2 samples in source multisample buffer. */ - assert(samples > 0 && is_power_of_two(samples)); + assert(samples > 0 && _mesa_is_pow_two(samples)); while (samples >> (shader_offset + 1)) { shader_offset++; } @@ -263,7 +263,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx, } /* We expect only power of 2 samples in source multisample buffer. 
*/ - assert(samples > 0 && is_power_of_two(samples)); + assert(samples > 0 && _mesa_is_pow_two(samples)); while (samples >> (shader_offset + 1)) { shader_offset++; } @@ -312,7 +312,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx, break; default: _mesa_problem(ctx, "Unkown texture target %s\n", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); shader_index = BLIT_2X_MSAA_SHADER_2D_MULTISAMPLE_RESOLVE; } @@ -434,7 +434,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx, * (so the floating point exponent just gets increased), rather than * doing a naive sum and dividing. */ - assert(is_power_of_two(samples)); + assert(_mesa_is_pow_two(samples)); /* Fetch each individual sample. */ sample_resolve = rzalloc_size(mem_ctx, 1); for (i = 0; i < samples; i++) { diff --git a/src/mesa/drivers/common/meta_copy_image.c b/src/mesa/drivers/common/meta_copy_image.c index 1729766f78d..149ed18503c 100644 --- a/src/mesa/drivers/common/meta_copy_image.c +++ b/src/mesa/drivers/common/meta_copy_image.c @@ -138,8 +138,8 @@ _mesa_meta_CopyImageSubData_uncompressed(struct gl_context *ctx, goto cleanup; } - /* We really only need to stash the bound framebuffers. */ - _mesa_meta_begin(ctx, 0); + /* We really only need to stash the bound framebuffers and scissor. */ + _mesa_meta_begin(ctx, MESA_META_SCISSOR); _mesa_GenFramebuffers(2, fbos); _mesa_BindFramebuffer(GL_READ_FRAMEBUFFER, fbos[0]); diff --git a/src/mesa/drivers/common/meta_generate_mipmap.c b/src/mesa/drivers/common/meta_generate_mipmap.c index c1b6d3c1f86..0655f052219 100644 --- a/src/mesa/drivers/common/meta_generate_mipmap.c +++ b/src/mesa/drivers/common/meta_generate_mipmap.c @@ -66,7 +66,7 @@ fallback_required(struct gl_context *ctx, GLenum target, if (target == GL_TEXTURE_3D) { _mesa_perf_debug(ctx, MESA_DEBUG_SEVERITY_HIGH, "glGenerateMipmap() to %s target\n", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return true; } @@ -317,7 +317,9 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target, /* Setup texture coordinates */ _mesa_meta_setup_texture_coords(faceTarget, layer, - 0, 0, 1, /* width, height never used here */ + 0, 0, /* xoffset, yoffset */ + srcWidth, srcHeight, /* img size */ + srcWidth, srcHeight, srcDepth, verts[0].tex, verts[1].tex, verts[2].tex, diff --git a/src/mesa/drivers/common/meta_tex_subimage.c b/src/mesa/drivers/common/meta_tex_subimage.c index d2474f52718..16d8f5d4747 100644 --- a/src/mesa/drivers/common/meta_tex_subimage.c +++ b/src/mesa/drivers/common/meta_tex_subimage.c @@ -25,8 +25,10 @@ * Jason Ekstrand <[email protected]> */ +#include "blend.h" #include "bufferobj.h" #include "buffers.h" +#include "clear.h" #include "fbobject.h" #include "glformats.h" #include "glheader.h" @@ -248,6 +250,24 @@ fail: return success; } +static bool +need_signed_unsigned_int_conversion(mesa_format rbFormat, + GLenum format, GLenum type) +{ + const GLenum srcType = _mesa_get_format_datatype(rbFormat); + const bool is_dst_format_integer = _mesa_is_enum_format_integer(format); + return (srcType == GL_INT && + is_dst_format_integer && + (type == GL_UNSIGNED_INT || + type == GL_UNSIGNED_SHORT || + type == GL_UNSIGNED_BYTE)) || + (srcType == GL_UNSIGNED_INT && + is_dst_format_integer && + (type == GL_INT || + type == GL_SHORT || + type == GL_BYTE)); +} + bool _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims, struct gl_texture_image *tex_image, @@ -260,8 +280,10 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims, int full_height, image_height; 
struct gl_texture_image *pbo_tex_image; struct gl_renderbuffer *rb = NULL; - GLenum status; - bool success = false; + GLenum dstBaseFormat = _mesa_unpack_format_to_base_format(format); + GLenum status, src_base_format; + bool success = false, clear_channels_to_zero = false; + float save_clear_color[4]; int z; if (!_mesa_is_bufferobj(packing->BufferObj)) @@ -273,13 +295,27 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims, format == GL_COLOR_INDEX) return false; - if (ctx->_ImageTransferState) - return false; - - + /* Don't use meta path for readpixels in below conditions. */ if (!tex_image) { rb = ctx->ReadBuffer->_ColorReadBuffer; - if (_mesa_need_rgb_to_luminance_conversion(rb->Format, format)) + + /* _mesa_get_readpixels_transfer_ops() includes the cases of read + * color clamping along with the ctx->_ImageTransferState. + */ + if (_mesa_get_readpixels_transfer_ops(ctx, rb->Format, format, + type, GL_FALSE)) + return false; + + if (_mesa_need_rgb_to_luminance_conversion(rb->_BaseFormat, + dstBaseFormat)) + return false; + + /* This function rely on BlitFramebuffer to fill in the pixel data for + * ReadPixels. But, BlitFrameBuffer doesn't support signed to unsigned + * or unsigned to signed integer conversions. OpenGL spec expects an + * invalid operation in that case. + */ + if (need_signed_unsigned_int_conversion(rb->Format, format, type)) return false; } @@ -300,6 +336,10 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims, _mesa_meta_begin(ctx, ~(MESA_META_PIXEL_TRANSFER | MESA_META_PIXEL_STORE)); + /* GL_CLAMP_FRAGMENT_COLOR doesn't affect ReadPixels and GettexImage */ + if (ctx->Extensions.ARB_color_buffer_float) + _mesa_ClampColor(GL_CLAMP_FRAGMENT_COLOR, GL_FALSE); + _mesa_GenFramebuffers(2, fbos); if (tex_image && tex_image->TexObject->Target == GL_TEXTURE_1D_ARRAY) { @@ -345,6 +385,27 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims, GL_COLOR_BUFFER_BIT, GL_NEAREST)) goto fail; + src_base_format = tex_image ? + tex_image->_BaseFormat : + ctx->ReadBuffer->_ColorReadBuffer->_BaseFormat; + + /* Depending on the base formats involved we might need to rebase some + * values. For example if we download from a Luminance format to RGBA + * format, we want G=0 and B=0. + */ + clear_channels_to_zero = + _mesa_need_luminance_to_rgb_conversion(src_base_format, + pbo_tex_image->_BaseFormat); + + if (clear_channels_to_zero) { + memcpy(save_clear_color, ctx->Color.ClearColor.f, 4 * sizeof(float)); + /* Clear the Green, Blue channels. */ + _mesa_ColorMask(GL_FALSE, GL_TRUE, GL_TRUE, + src_base_format != GL_LUMINANCE_ALPHA); + _mesa_ClearColor(0.0, 0.0, 0.0, 1.0); + _mesa_Clear(GL_COLOR_BUFFER_BIT); + } + for (z = 1; z < depth; z++) { _mesa_meta_bind_fbo_image(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, tex_image, zoffset + z); @@ -357,6 +418,15 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims, 0, z * image_height, width, z * image_height + height, GL_COLOR_BUFFER_BIT, GL_NEAREST); + if (clear_channels_to_zero) + _mesa_Clear(GL_COLOR_BUFFER_BIT); + } + + /* Unmask the color channels and restore the saved clear color values. 
*/ + if (clear_channels_to_zero) { + _mesa_ColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); + _mesa_ClearColor(save_clear_color[0], save_clear_color[1], + save_clear_color[2], save_clear_color[3]); } success = true; diff --git a/src/mesa/drivers/dri/common/Android.mk b/src/mesa/drivers/dri/common/Android.mk index 6986f5e8cb4..f1a733011b9 100644 --- a/src/mesa/drivers/dri/common/Android.mk +++ b/src/mesa/drivers/dri/common/Android.mk @@ -43,13 +43,6 @@ LOCAL_EXPORT_C_INCLUDE_DIRS := \ $(LOCAL_PATH) \ $(intermediates) -# swrast only -ifeq ($(MESA_GPU_DRIVERS),swrast) -LOCAL_CFLAGS := -D__NOT_HAVE_DRM_H -else -LOCAL_SHARED_LIBRARIES := libdrm -endif - LOCAL_SRC_FILES := \ $(DRI_COMMON_FILES) \ $(XMLCONFIG_FILES) @@ -110,13 +103,6 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_C_INCLUDES := \ $(MESA_DRI_C_INCLUDES) -# swrast only -ifeq ($(MESA_GPU_DRIVERS),swrast) -LOCAL_CFLAGS := -D__NOT_HAVE_DRM_H -else -LOCAL_SHARED_LIBRARIES := libdrm -endif - LOCAL_SRC_FILES := $(megadriver_stub_FILES) include $(MESA_COMMON_MK) diff --git a/src/mesa/drivers/dri/common/Makefile.am b/src/mesa/drivers/dri/common/Makefile.am index ae19fcb3565..b307f10f56b 100644 --- a/src/mesa/drivers/dri/common/Makefile.am +++ b/src/mesa/drivers/dri/common/Makefile.am @@ -32,6 +32,7 @@ AM_CFLAGS = \ -I$(top_srcdir)/src/mesa/ \ -I$(top_srcdir)/src/gallium/include \ -I$(top_srcdir)/src/gallium/auxiliary \ + $(LIBDRM_CFLAGS) \ $(DEFINES) \ $(VISIBILITY_CFLAGS) @@ -53,10 +54,3 @@ libdri_test_stubs_la_CFLAGS = $(AM_CFLAGS) -DNO_MAIN libmegadriver_stub_la_SOURCES = $(megadriver_stub_FILES) sysconf_DATA = drirc - -if DRICOMMON_NEED_LIBDRM -AM_CFLAGS += $(LIBDRM_CFLAGS) -libdricommon_la_LIBADD = $(LIBDRM_LIBS) -else -AM_CFLAGS += -D__NOT_HAVE_DRM_H -endif diff --git a/src/mesa/drivers/dri/common/SConscript b/src/mesa/drivers/dri/common/SConscript index b402736db69..52d201f8913 100644 --- a/src/mesa/drivers/dri/common/SConscript +++ b/src/mesa/drivers/dri/common/SConscript @@ -32,11 +32,6 @@ drienv.AppendUnique(LIBS = [ 'expat', ]) -# if HAVE_DRI2 -drienv.PkgUseModules('DRM') -# else -#env.Append(CPPDEFINES = ['__NOT_HAVE_DRM_H']) - sources = drienv.ParseSourceList('Makefile.sources', ['DRI_COMMON_FILES', 'XMLCONFIG_FILES' ]) dri_common = drienv.ConvenienceLibrary( @@ -57,7 +52,6 @@ env.Append(CPPPATH = [ ]) env.Append(CPPDEFINES = [ - '__NOT_HAVE_DRM_H', 'HAVE_DLADDR', ]) diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c index e7ababe0b67..d35ac263a45 100644 --- a/src/mesa/drivers/dri/common/dri_util.c +++ b/src/mesa/drivers/dri/common/dri_util.c @@ -40,13 +40,9 @@ #include <stdbool.h> -#ifndef __NOT_HAVE_DRM_H -#include <xf86drm.h> -#endif #include "dri_util.h" #include "utils.h" #include "xmlpool.h" -#include "../glsl/glsl_parser_extras.h" #include "main/mtypes.h" #include "main/version.h" #include "main/errors.h" @@ -138,18 +134,6 @@ driCreateNewScreen2(int scrn, int fd, setupLoaderExtensions(psp, extensions); -#ifndef __NOT_HAVE_DRM_H - if (fd != -1) { - drmVersionPtr version = drmGetVersion(fd); - if (version) { - psp->drm_version.major = version->version_major; - psp->drm_version.minor = version->version_minor; - psp->drm_version.patch = version->version_patchlevel; - drmFreeVersion(version); - } - } -#endif - psp->loaderPrivate = data; psp->extensions = emptyExtensionList; @@ -179,7 +163,9 @@ driCreateNewScreen2(int scrn, int fd, } } - psp->api_mask = (1 << __DRI_API_OPENGL); + psp->api_mask = 0; + if (psp->max_gl_compat_version > 0) + psp->api_mask |= (1 << __DRI_API_OPENGL); if 
(psp->max_gl_core_version > 0) psp->api_mask |= (1 << __DRI_API_OPENGL_CORE); if (psp->max_gl_es1_version > 0) @@ -238,8 +224,6 @@ static void driDestroyScreen(__DRIscreen *psp) * stream open to the X-server anymore. */ - _mesa_destroy_shader_compiler(); - psp->driver->DestroyScreen(psp); driDestroyOptionCache(&psp->optionCache); diff --git a/src/mesa/drivers/dri/common/dri_util.h b/src/mesa/drivers/dri/common/dri_util.h index 1138bf106de..6987f555e66 100644 --- a/src/mesa/drivers/dri/common/dri_util.h +++ b/src/mesa/drivers/dri/common/dri_util.h @@ -149,11 +149,6 @@ struct __DRIscreenRec { int fd; /** - * DRM (kernel module) version information. - */ - __DRIversion drm_version; - - /** * Device-dependent private information (not stored in the SAREA). * * This pointer is never touched by the DRI layer. diff --git a/src/mesa/drivers/dri/common/drirc b/src/mesa/drivers/dri/common/drirc index 145e707a64c..97d961b6597 100644 --- a/src/mesa/drivers/dri/common/drirc +++ b/src/mesa/drivers/dri/common/drirc @@ -4,24 +4,15 @@ Application bugs worked around in this file: ============================================ +* Unigine Heaven 3.0 and older contain too many bugs and can't be supported + by drivers that want to be compliant. + * Various Unigine products don't use the #version and #extension GLSL directives, meaning they only get GLSL 1.10 and no extensions for their shaders. Enabling all extensions for Unigine fixes most issues, but the GLSL version is still 1.10. -* Unigine Heaven 3.0 with ARB_texture_multisample uses a "ivec4 * vec4" - expression, which is illegal in GLSL 1.10. - Adding "#version 130" fixes this. - -* Unigine Heaven 3.0 with ARB_shader_bit_encoding uses the uint keyword, which - is illegal in GLSL 1.10. - Adding "#version 130" fixes this. - -* Unigine Heaven 3.0 with ARB_shader_bit_encoding uses a "uint & int" - expression, which is illegal in any GLSL version. - Disabling ARB_shader_bit_encoding fixes this. - * If ARB_sample_shading is supported, Unigine Heaven 4.0 and Valley 1.0 uses an #extension directive in the middle of its shaders, which is illegal in GLSL. @@ -45,18 +36,10 @@ TODO: document the other workarounds. 
</application> <application name="Unigine Heaven (32-bit)" executable="heaven_x86"> - <option name="force_glsl_extensions_warn" value="true" /> - <option name="disable_blend_func_extended" value="true" /> - <option name="force_glsl_version" value="130" /> - <option name="disable_shader_bit_encoding" value="true" /> <option name="allow_glsl_extension_directive_midshader" value="true" /> </application> <application name="Unigine Heaven (64-bit)" executable="heaven_x64"> - <option name="force_glsl_extensions_warn" value="true" /> - <option name="disable_blend_func_extended" value="true" /> - <option name="force_glsl_version" value="130" /> - <option name="disable_shader_bit_encoding" value="true" /> <option name="allow_glsl_extension_directive_midshader" value="true" /> </application> diff --git a/src/mesa/drivers/dri/common/utils.c b/src/mesa/drivers/dri/common/utils.c index 70d34e8ce55..b51b263fe46 100644 --- a/src/mesa/drivers/dri/common/utils.c +++ b/src/mesa/drivers/dri/common/utils.c @@ -213,6 +213,7 @@ driCreateConfigs(mesa_format format, masks = masks_table[0]; break; case MESA_FORMAT_B8G8R8X8_UNORM: + case MESA_FORMAT_B8G8R8X8_SRGB: masks = masks_table[1]; break; case MESA_FORMAT_B8G8R8A8_UNORM: diff --git a/src/mesa/drivers/dri/i915/i830_state.c b/src/mesa/drivers/dri/i915/i830_state.c index ea54e2b25b1..906e942b020 100644 --- a/src/mesa/drivers/dri/i915/i830_state.c +++ b/src/mesa/drivers/dri/i915/i830_state.c @@ -57,7 +57,7 @@ i830StencilFuncSeparate(struct gl_context * ctx, GLenum face, GLenum func, GLint mask = mask & 0xff; DBG("%s : func: %s, ref : 0x%x, mask: 0x%x\n", __func__, - _mesa_lookup_enum_by_nr(func), ref, mask); + _mesa_enum_to_string(func), ref, mask); I830_STATECHANGE(i830, I830_UPLOAD_CTX); @@ -95,9 +95,9 @@ i830StencilOpSeparate(struct gl_context * ctx, GLenum face, GLenum fail, GLenum int fop, dfop, dpop; DBG("%s: fail : %s, zfail: %s, zpass : %s\n", __func__, - _mesa_lookup_enum_by_nr(fail), - _mesa_lookup_enum_by_nr(zfail), - _mesa_lookup_enum_by_nr(zpass)); + _mesa_enum_to_string(fail), + _mesa_enum_to_string(zfail), + _mesa_enum_to_string(zpass)); fop = 0; dfop = 0; @@ -389,8 +389,8 @@ static void i830BlendEquationSeparate(struct gl_context * ctx, GLenum modeRGB, GLenum modeA) { DBG("%s -> %s, %s\n", __func__, - _mesa_lookup_enum_by_nr(modeRGB), - _mesa_lookup_enum_by_nr(modeA)); + _mesa_enum_to_string(modeRGB), + _mesa_enum_to_string(modeA)); (void) modeRGB; (void) modeA; @@ -403,10 +403,10 @@ i830BlendFuncSeparate(struct gl_context * ctx, GLenum sfactorRGB, GLenum dfactorRGB, GLenum sfactorA, GLenum dfactorA) { DBG("%s -> RGB(%s, %s) A(%s, %s)\n", __func__, - _mesa_lookup_enum_by_nr(sfactorRGB), - _mesa_lookup_enum_by_nr(dfactorRGB), - _mesa_lookup_enum_by_nr(sfactorA), - _mesa_lookup_enum_by_nr(dfactorA)); + _mesa_enum_to_string(sfactorRGB), + _mesa_enum_to_string(dfactorRGB), + _mesa_enum_to_string(sfactorA), + _mesa_enum_to_string(dfactorA)); (void) sfactorRGB; (void) dfactorRGB; diff --git a/src/mesa/drivers/dri/i915/i915_context.c b/src/mesa/drivers/dri/i915/i915_context.c index 42ea54e087d..57b033c07ea 100644 --- a/src/mesa/drivers/dri/i915/i915_context.c +++ b/src/mesa/drivers/dri/i915/i915_context.c @@ -255,6 +255,8 @@ i915CreateContext(int api, * FINISHME: vertex shaders? 
*/ ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitCondCodes = true; + ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoIndirectSampler = + true; struct gl_shader_compiler_options *const fs_options = & ctx->Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT]; @@ -266,6 +268,7 @@ i915CreateContext(int api, fs_options->EmitNoIndirectOutput = true; fs_options->EmitNoIndirectUniform = true; fs_options->EmitNoIndirectTemp = true; + fs_options->EmitNoIndirectSampler = true; ctx->Const.MaxDrawBuffers = 1; ctx->Const.QueryCounterBits.SamplesPassed = 0; diff --git a/src/mesa/drivers/dri/i915/i915_state.c b/src/mesa/drivers/dri/i915/i915_state.c index 5f10b840b1a..4c83073e692 100644 --- a/src/mesa/drivers/dri/i915/i915_state.c +++ b/src/mesa/drivers/dri/i915/i915_state.c @@ -402,7 +402,7 @@ void intelCalcViewport(struct gl_context * ctx) { struct intel_context *intel = intel_context(ctx); - double scale[3], translate[3]; + float scale[3], translate[3]; _mesa_get_viewport_xform(ctx, 0, scale, translate); diff --git a/src/mesa/drivers/dri/i915/i915_texstate.c b/src/mesa/drivers/dri/i915/i915_texstate.c index aef5ff99eb2..f653f441ad8 100644 --- a/src/mesa/drivers/dri/i915/i915_texstate.c +++ b/src/mesa/drivers/dri/i915/i915_texstate.c @@ -342,7 +342,7 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3) * Thus, I guess we need do this for other platforms as well. */ if (tObj->Target == GL_TEXTURE_CUBE_MAP_ARB && - !is_power_of_two(firstImage->Height)) + !_mesa_is_pow_two(firstImage->Height)) return false; state[I915_TEXREG_SS3] = ss3; /* SS3_NORMALIZED_COORDS */ diff --git a/src/mesa/drivers/dri/i915/intel_context.c b/src/mesa/drivers/dri/i915/intel_context.c index 5618dcd8358..c780103228f 100644 --- a/src/mesa/drivers/dri/i915/intel_context.c +++ b/src/mesa/drivers/dri/i915/intel_context.c @@ -428,7 +428,6 @@ intelInitContext(struct intel_context *intel, driContextPriv->driverPrivate = intel; intel->driContext = driContextPriv; - intel->driFd = sPriv->fd; intel->gen = intelScreen->gen; diff --git a/src/mesa/drivers/dri/i915/intel_context.h b/src/mesa/drivers/dri/i915/intel_context.h index 350d35d9033..4ec4015d453 100644 --- a/src/mesa/drivers/dri/i915/intel_context.h +++ b/src/mesa/drivers/dri/i915/intel_context.h @@ -273,8 +273,6 @@ struct intel_context bool use_early_z; - int driFd; - __DRIcontext *driContext; struct intel_screen *intelScreen; diff --git a/src/mesa/drivers/dri/i915/intel_fbo.c b/src/mesa/drivers/dri/i915/intel_fbo.c index a5d5c5832fb..67013666377 100644 --- a/src/mesa/drivers/dri/i915/intel_fbo.c +++ b/src/mesa/drivers/dri/i915/intel_fbo.c @@ -216,7 +216,7 @@ intel_alloc_private_renderbuffer_storage(struct gl_context * ctx, struct gl_rend intel_miptree_release(&irb->mt); DBG("%s: %s: %s (%dx%d)\n", __func__, - _mesa_lookup_enum_by_nr(internalFormat), + _mesa_enum_to_string(internalFormat), _mesa_get_format_name(rb->Format), width, height); if (width == 0 || height == 0) diff --git a/src/mesa/drivers/dri/i915/intel_mipmap_tree.c b/src/mesa/drivers/dri/i915/intel_mipmap_tree.c index e56b9859377..1aa06c18f15 100644 --- a/src/mesa/drivers/dri/i915/intel_mipmap_tree.c +++ b/src/mesa/drivers/dri/i915/intel_mipmap_tree.c @@ -81,7 +81,7 @@ intel_miptree_create_layout(struct intel_context *intel, return NULL; DBG("%s target %s format %s level %d..%d <-- %p\n", __func__, - _mesa_lookup_enum_by_nr(target), + _mesa_enum_to_string(target), _mesa_get_format_name(format), first_level, last_level, mt); diff --git a/src/mesa/drivers/dri/i915/intel_render.c 
b/src/mesa/drivers/dri/i915/intel_render.c index 0b0d48e1663..5962dad7d11 100644 --- a/src/mesa/drivers/dri/i915/intel_render.c +++ b/src/mesa/drivers/dri/i915/intel_render.c @@ -113,7 +113,7 @@ static void intelDmaPrimitive(struct intel_context *intel, GLenum prim) { if (0) - fprintf(stderr, "%s %s\n", __func__, _mesa_lookup_enum_by_nr(prim)); + fprintf(stderr, "%s %s\n", __func__, _mesa_enum_to_string(prim)); INTEL_FIREVERTICES(intel); intel->vtbl.reduced_primitive_state(intel, reduced_prim[prim]); intel_set_prim(intel, hw_prim[prim]); diff --git a/src/mesa/drivers/dri/i915/intel_tex_image.c b/src/mesa/drivers/dri/i915/intel_tex_image.c index 01de966a134..0a213e9f614 100644 --- a/src/mesa/drivers/dri/i915/intel_tex_image.c +++ b/src/mesa/drivers/dri/i915/intel_tex_image.c @@ -189,7 +189,7 @@ intelTexImage(struct gl_context * ctx, const struct gl_pixelstore_attrib *unpack) { DBG("%s target %s level %d %dx%dx%d\n", __func__, - _mesa_lookup_enum_by_nr(texImage->TexObject->Target), + _mesa_enum_to_string(texImage->TexObject->Target), texImage->Level, texImage->Width, texImage->Height, texImage->Depth); /* Attempt to use the blitter for PBO image uploads. diff --git a/src/mesa/drivers/dri/i915/intel_tex_subimage.c b/src/mesa/drivers/dri/i915/intel_tex_subimage.c index 2e02d50f13f..f11ef2ea329 100644 --- a/src/mesa/drivers/dri/i915/intel_tex_subimage.c +++ b/src/mesa/drivers/dri/i915/intel_tex_subimage.c @@ -72,7 +72,7 @@ intel_blit_texsubimage(struct gl_context * ctx, DBG("BLT subimage %s target %s level %d offset %d,%d %dx%d\n", __func__, - _mesa_lookup_enum_by_nr(texImage->TexObject->Target), + _mesa_enum_to_string(texImage->TexObject->Target), texImage->Level, xoffset, yoffset, width, height); pixels = _mesa_validate_pbo_teximage(ctx, 2, width, height, 1, diff --git a/src/mesa/drivers/dri/i915/intel_tris.c b/src/mesa/drivers/dri/i915/intel_tris.c index 144f0fc911a..ae62a800fb7 100644 --- a/src/mesa/drivers/dri/i915/intel_tris.c +++ b/src/mesa/drivers/dri/i915/intel_tris.c @@ -1134,7 +1134,7 @@ intelRasterPrimitive(struct gl_context * ctx, GLenum rprim, GLuint hwprim) if (0) fprintf(stderr, "%s %s %x\n", __func__, - _mesa_lookup_enum_by_nr(rprim), hwprim); + _mesa_enum_to_string(rprim), hwprim); intel->vtbl.reduced_primitive_state(intel, rprim); @@ -1158,7 +1158,7 @@ intelRenderPrimitive(struct gl_context * ctx, GLenum prim) ctx->Polygon.BackMode != GL_FILL); if (0) - fprintf(stderr, "%s %s\n", __func__, _mesa_lookup_enum_by_nr(prim)); + fprintf(stderr, "%s %s\n", __func__, _mesa_enum_to_string(prim)); /* Let some clipping routines know which primitive they're dealing * with. 
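Editor's note: the i965 changes that follow add hardware-generated binding tables via the resource streamer. The pool allocator they introduce (reserve_hw_bt_space in brw_binding_tables.c) is a bump allocator with two quirks taken from the quoted PRM text: the last two 64-byte cache lines of the pool are never handed out (WaStateBindingTableOverfetch) and returned offsets are 64-byte aligned, relative to the pool base. A simplified standalone model, with a hypothetical struct bt_pool standing in for brw->hw_bt_pool:

#include <stdint.h>

struct bt_pool {
   uint32_t next_offset;   /* bump pointer into the pool buffer */
   uint32_t size;          /* size of the pool buffer in bytes */
};

#define BT_ALIGN(x, a) (((x) + (a) - 1) & ~((uint32_t)(a) - 1))

static uint32_t
reserve_bt_space(struct bt_pool *pool, unsigned bytes)
{
   /* Keep two 64-byte cache lines free at the end of the pool: the
    * hardware over-fetches binding table indices. */
   if (pool->next_offset + bytes >= pool->size - 128)
      pool->next_offset = 0;   /* wrap and reuse the pool */

   uint32_t offset = pool->next_offset;

   /* Offsets programmed via 3DSTATE_BINDING_TABLE_POINTERS_xS are
    * relative to the pool base and must be 64-byte aligned. */
   pool->next_offset += BT_ALIGN(bytes, 64);
   return offset;
}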
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index 981fe79b132..dfdad75329d 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -60,6 +60,8 @@ i965_FILES = \ brw_fs_register_coalesce.cpp \ brw_fs_saturate_propagation.cpp \ brw_fs_sel_peephole.cpp \ + brw_fs_surface_builder.cpp \ + brw_fs_surface_builder.h \ brw_fs_vector_splitting.cpp \ brw_fs_visitor.cpp \ brw_gs.c \ @@ -86,6 +88,7 @@ i965_FILES = \ brw_object_purgeable.c \ brw_packed_float.c \ brw_performance_monitor.c \ + brw_pipe_control.c \ brw_primitive_restart.c \ brw_program.c \ brw_program.h \ @@ -122,6 +125,8 @@ i965_FILES = \ brw_vec4.h \ brw_vec4_live_variables.cpp \ brw_vec4_live_variables.h \ + brw_vec4_nir.cpp \ + brw_vec4_gs_nir.cpp \ brw_vec4_reg_allocate.cpp \ brw_vec4_visitor.cpp \ brw_vec4_vp.cpp \ diff --git a/src/mesa/drivers/dri/i965/brw_binding_tables.c b/src/mesa/drivers/dri/i965/brw_binding_tables.c index 98ff0ddcd58..b188fc7de57 100644 --- a/src/mesa/drivers/dri/i965/brw_binding_tables.c +++ b/src/mesa/drivers/dri/i965/brw_binding_tables.c @@ -44,6 +44,41 @@ #include "brw_state.h" #include "intel_batchbuffer.h" +static const GLuint stage_to_bt_edit[] = { + [MESA_SHADER_VERTEX] = _3DSTATE_BINDING_TABLE_EDIT_VS, + [MESA_SHADER_GEOMETRY] = _3DSTATE_BINDING_TABLE_EDIT_GS, + [MESA_SHADER_FRAGMENT] = _3DSTATE_BINDING_TABLE_EDIT_PS, +}; + +static uint32_t +reserve_hw_bt_space(struct brw_context *brw, unsigned bytes) +{ + /* From the Broadwell PRM, Volume 16, "Workarounds", + * WaStateBindingTableOverfetch: + * "HW over-fetches two cache lines of binding table indices. When + * using the resource streamer, SW needs to pad binding table pointer + * updates with an additional two cache lines." + * + * Cache lines are 64 bytes, so we subtract 128 bytes from the size of + * the binding table pool buffer. + */ + if (brw->hw_bt_pool.next_offset + bytes >= brw->hw_bt_pool.bo->size - 128) { + gen7_reset_hw_bt_pool_offsets(brw); + } + + uint32_t offset = brw->hw_bt_pool.next_offset; + + /* From the Haswell PRM, Volume 2b: Command Reference: Instructions, + * 3DSTATE_BINDING_TABLE_POINTERS_xS: + * + * "If HW Binding Table is enabled, the offset is relative to the + * Binding Table Pool Base Address and the alignment is 64 bytes." + */ + brw->hw_bt_pool.next_offset += ALIGN(bytes, 64); + + return offset; +} + /** * Upload a shader stage's binding table as indirect state. * @@ -72,22 +107,41 @@ brw_upload_binding_table(struct brw_context *brw, brw->shader_time.bo, 0, BRW_SURFACEFORMAT_RAW, brw->shader_time.bo->size, 1, true); } + /* When RS is enabled use hw-binding table uploads, otherwise fallback to + * software-uploads. 
+ */ + if (brw->use_resource_streamer) { + gen7_update_binding_table_from_array(brw, stage_state->stage, + stage_state->surf_offset, + prog_data->binding_table + .size_bytes / 4); + } else { + uint32_t *bind = brw_state_batch(brw, AUB_TRACE_BINDING_TABLE, + prog_data->binding_table.size_bytes, + 32, + &stage_state->bind_bo_offset); - uint32_t *bind = brw_state_batch(brw, AUB_TRACE_BINDING_TABLE, - prog_data->binding_table.size_bytes, 32, - &stage_state->bind_bo_offset); - - /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */ - memcpy(bind, stage_state->surf_offset, - prog_data->binding_table.size_bytes); + /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */ + memcpy(bind, stage_state->surf_offset, + prog_data->binding_table.size_bytes); + } } brw->ctx.NewDriverState |= brw_new_binding_table; if (brw->gen >= 7) { + if (brw->use_resource_streamer) { + stage_state->bind_bo_offset = + reserve_hw_bt_space(brw, prog_data->binding_table.size_bytes); + } BEGIN_BATCH(2); OUT_BATCH(packet_name << 16 | (2 - 2)); - OUT_BATCH(stage_state->bind_bo_offset); + /* Align SurfaceStateOffset[16:6] format to [15:5] PS Binding Table field + * when hw-generated binding table is enabled. + */ + OUT_BATCH(brw->use_resource_streamer ? + (stage_state->bind_bo_offset >> 1) : + stage_state->bind_bo_offset); ADVANCE_BATCH(); } } @@ -170,6 +224,158 @@ const struct brw_tracked_state brw_gs_binding_table = { .emit = brw_gs_upload_binding_table, }; +/** + * Edit a single entry in a hardware-generated binding table + */ +void +gen7_edit_hw_binding_table_entry(struct brw_context *brw, + gl_shader_stage stage, + uint32_t index, + uint32_t surf_offset) +{ + assert(stage < ARRAY_SIZE(stage_to_bt_edit)); + assert(stage_to_bt_edit[stage]); + + uint32_t dw2 = SET_FIELD(index, BRW_BINDING_TABLE_INDEX) | + (brw->gen >= 8 ? GEN8_SURFACE_STATE_EDIT(surf_offset) : + HSW_SURFACE_STATE_EDIT(surf_offset)); + + BEGIN_BATCH(3); + OUT_BATCH(stage_to_bt_edit[stage] << 16 | (3 - 2)); + OUT_BATCH(BRW_BINDING_TABLE_EDIT_TARGET_ALL); + OUT_BATCH(dw2); + ADVANCE_BATCH(); +} + +/** + * Upload a whole hardware binding table for the given stage. + * + * Takes an array of surface offsets and the number of binding table + * entries. + */ +void +gen7_update_binding_table_from_array(struct brw_context *brw, + gl_shader_stage stage, + const uint32_t* binding_table, + int num_surfaces) +{ + uint32_t dw2 = 0; + + assert(stage < ARRAY_SIZE(stage_to_bt_edit)); + assert(stage_to_bt_edit[stage]); + + BEGIN_BATCH(num_surfaces + 2); + OUT_BATCH(stage_to_bt_edit[stage] << 16 | num_surfaces); + OUT_BATCH(BRW_BINDING_TABLE_EDIT_TARGET_ALL); + for (int i = 0; i < num_surfaces; i++) { + dw2 = SET_FIELD(i, BRW_BINDING_TABLE_INDEX) | + (brw->gen >= 8 ? GEN8_SURFACE_STATE_EDIT(binding_table[i]) : + HSW_SURFACE_STATE_EDIT(binding_table[i])); + OUT_BATCH(dw2); + } + ADVANCE_BATCH(); +} + +/** + * Disable hardware binding table support, falling back to the + * older software-generated binding table mechanism. + */ +void +gen7_disable_hw_binding_tables(struct brw_context *brw) +{ + if (!brw->use_resource_streamer) + return; + /* From the Haswell PRM, Volume 7: 3D Media GPGPU, + * 3DSTATE_BINDING_TABLE_POOL_ALLOC > Programming Note: + * + * "When switching between HW and SW binding table generation, SW must + * issue a state cache invalidate." + */ + brw_emit_pipe_control_flush(brw, PIPE_CONTROL_STATE_CACHE_INVALIDATE); + + int pkt_len = brw->gen >= 8 ? 
4 : 3; + + BEGIN_BATCH(pkt_len); + OUT_BATCH(_3DSTATE_BINDING_TABLE_POOL_ALLOC << 16 | (pkt_len - 2)); + if (brw->gen >= 8) { + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + } else { + OUT_BATCH(HSW_BT_POOL_ALLOC_MUST_BE_ONE); + OUT_BATCH(0); + } + ADVANCE_BATCH(); +} + +/** + * Enable hardware binding tables and set up the binding table pool. + */ +void +gen7_enable_hw_binding_tables(struct brw_context *brw) +{ + if (!brw->use_resource_streamer) + return; + + if (!brw->hw_bt_pool.bo) { + /* We use a single re-usable buffer object for the lifetime of the + * context and size it to maximum allowed binding tables that can be + * programmed per batch: + * + * From the Haswell PRM, Volume 7: 3D Media GPGPU, + * 3DSTATE_BINDING_TABLE_POOL_ALLOC > Programming Note: + * "A maximum of 16,383 Binding tables are allowed in any batch buffer" + */ + static const int max_size = 16383 * 4; + brw->hw_bt_pool.bo = drm_intel_bo_alloc(brw->bufmgr, "hw_bt", + max_size, 64); + brw->hw_bt_pool.next_offset = 0; + } + + /* From the Haswell PRM, Volume 7: 3D Media GPGPU, + * 3DSTATE_BINDING_TABLE_POOL_ALLOC > Programming Note: + * + * "When switching between HW and SW binding table generation, SW must + * issue a state cache invalidate." + */ + brw_emit_pipe_control_flush(brw, PIPE_CONTROL_STATE_CACHE_INVALIDATE); + + int pkt_len = brw->gen >= 8 ? 4 : 3; + uint32_t dw1 = BRW_HW_BINDING_TABLE_ENABLE; + if (brw->is_haswell) { + dw1 |= SET_FIELD(GEN7_MOCS_L3, GEN7_HW_BT_POOL_MOCS) | + HSW_BT_POOL_ALLOC_MUST_BE_ONE; + } else if (brw->gen >= 8) { + dw1 |= BDW_MOCS_WB; + } + + BEGIN_BATCH(pkt_len); + OUT_BATCH(_3DSTATE_BINDING_TABLE_POOL_ALLOC << 16 | (pkt_len - 2)); + if (brw->gen >= 8) { + OUT_RELOC64(brw->hw_bt_pool.bo, I915_GEM_DOMAIN_SAMPLER, 0, dw1); + OUT_BATCH(brw->hw_bt_pool.bo->size); + } else { + OUT_RELOC(brw->hw_bt_pool.bo, I915_GEM_DOMAIN_SAMPLER, 0, dw1); + OUT_RELOC(brw->hw_bt_pool.bo, I915_GEM_DOMAIN_SAMPLER, 0, + brw->hw_bt_pool.bo->size); + } + ADVANCE_BATCH(); +} + +void +gen7_reset_hw_bt_pool_offsets(struct brw_context *brw) +{ + brw->hw_bt_pool.next_offset = 0; +} + +const struct brw_tracked_state gen7_hw_binding_tables = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_BATCH, + }, + .emit = gen7_enable_hw_binding_tables +}; + /** @} */ /** diff --git a/src/mesa/drivers/dri/i965/brw_blorp.cpp b/src/mesa/drivers/dri/i965/brw_blorp.cpp index b404869f0c7..eac1f005496 100644 --- a/src/mesa/drivers/dri/i965/brw_blorp.cpp +++ b/src/mesa/drivers/dri/i965/brw_blorp.cpp @@ -220,13 +220,13 @@ brw_blorp_exec(struct brw_context *brw, const brw_blorp_params *params) * data with different formats, which blorp does for stencil and depth * data. */ - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); retry: intel_batchbuffer_require_space(brw, estimated_max_batch_usage, RENDER_RING); intel_batchbuffer_save_state(brw); drm_intel_bo *saved_bo = brw->batch.bo; - uint32_t saved_used = brw->batch.used; + uint32_t saved_used = USED_BATCH(brw->batch); uint32_t saved_state_batch_offset = brw->batch.state_batch_offset; switch (brw->gen) { @@ -245,7 +245,7 @@ retry: * reserved enough space that a wrap will never happen. */ assert(brw->batch.bo == saved_bo); - assert((brw->batch.used - saved_used) * 4 + + assert((USED_BATCH(brw->batch) - saved_used) * 4 + (saved_state_batch_offset - brw->batch.state_batch_offset) < estimated_max_batch_usage); /* Shut up compiler warnings on release build */ @@ -283,7 +283,7 @@ retry: /* Flush the sampler cache so any texturing from the destination is * coherent. 
*/ - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); } brw_hiz_op_params::brw_hiz_op_params(struct intel_mipmap_tree *mt, diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp index 1561b593969..205c905b447 100644 --- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp +++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp @@ -1285,8 +1285,8 @@ brw_blorp_blit_program::translate_dst_to_src() /* Round the float coordinates down to nearest integer */ emit_rndd(Xp_f, X_f); emit_rndd(Yp_f, Y_f); - emit_mul(X_f, Xp_f, brw_imm_f(1 / key->x_scale)); - emit_mul(Y_f, Yp_f, brw_imm_f(1 / key->y_scale)); + emit_mul(X_f, Xp_f, brw_imm_f(1.0f / key->x_scale)); + emit_mul(Y_f, Yp_f, brw_imm_f(1.0f / key->y_scale)); SWAP_XY_AND_XPYP(); } else if (!key->bilinear_filter) { /* Round the float coordinates down to nearest integer by moving to @@ -1442,7 +1442,7 @@ brw_blorp_blit_program::manual_blend_average(unsigned num_samples) for (int j = 0; j < 4; ++j) { emit_mul(offset(texture_data[0], 2*j), offset(vec8(texture_data[0]), 2*j), - brw_imm_f(1.0/num_samples)); + brw_imm_f(1.0f / num_samples)); } } @@ -1475,9 +1475,9 @@ brw_blorp_blit_program::manual_blend_bilinear(unsigned num_samples) /* Compute pixel coordinates */ emit_add(vec16(x_sample_coords), Xp_f, - brw_imm_f((float)(i & 0x1) * (1.0 / key->x_scale))); + brw_imm_f((float)(i & 0x1) * (1.0f / key->x_scale))); emit_add(vec16(y_sample_coords), Yp_f, - brw_imm_f((float)((i >> 1) & 0x1) * (1.0 / key->y_scale))); + brw_imm_f((float)((i >> 1) & 0x1) * (1.0f / key->y_scale))); emit_mov(vec16(X), x_sample_coords); emit_mov(vec16(Y), y_sample_coords); @@ -1789,7 +1789,7 @@ brw_blorp_coord_transform_params::setup(GLfloat src0, GLfloat src1, * so 0.5 provides the necessary correction. */ multiplier = scale; - offset = src0 + (-dst0 + 0.5) * scale; + offset = src0 + (-dst0 + 0.5f) * scale; } else { /* When mirroring X we need: * src_x - src_x0 = dst_x1 - dst_x - 0.5 @@ -1797,7 +1797,7 @@ brw_blorp_coord_transform_params::setup(GLfloat src0, GLfloat src1, * src_x = src_x0 + (dst_x1 -dst_x - 0.5) * scale */ multiplier = -scale; - offset = src0 + (dst1 - 0.5) * scale; + offset = src0 + (dst1 - 0.5f) * scale; } } @@ -1952,8 +1952,8 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct brw_context *brw, /* Scaling factors used for bilinear filtering in multisample scaled * blits. 
*/ - wm_prog_key.x_scale = 2.0; - wm_prog_key.y_scale = src_mt->num_samples / 2.0; + wm_prog_key.x_scale = 2.0f; + wm_prog_key.y_scale = src_mt->num_samples / 2.0f; if (filter == GL_LINEAR && src.num_samples <= 1 && dst.num_samples <= 1) wm_prog_key.bilinear_filter = true; @@ -2000,9 +2000,9 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct brw_context *brw, x1 = wm_push_consts.dst_x1 = roundf(dst_x1); y1 = wm_push_consts.dst_y1 = roundf(dst_y1); wm_push_consts.rect_grid_x1 = (minify(src_mt->logical_width0, src_level) * - wm_prog_key.x_scale - 1.0); + wm_prog_key.x_scale - 1.0f); wm_push_consts.rect_grid_y1 = (minify(src_mt->logical_height0, src_level) * - wm_prog_key.y_scale - 1.0); + wm_prog_key.y_scale - 1.0f); wm_push_consts.x_transform.setup(src_x0, src_x1, dst_x0, dst_x1, mirror_x); wm_push_consts.y_transform.setup(src_y0, src_y1, dst_y0, dst_y1, mirror_y); diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp index 789520c7353..d458ad846bf 100644 --- a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp +++ b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp @@ -73,7 +73,7 @@ brw_blorp_eu_emitter::emit_kill_if_outside_rect(const struct brw_reg &x, emit_cmp(BRW_CONDITIONAL_L, x, dst_x1)->predicate = BRW_PREDICATE_NORMAL; emit_cmp(BRW_CONDITIONAL_L, y, dst_y1)->predicate = BRW_PREDICATE_NORMAL; - fs_inst *inst = new (mem_ctx) fs_inst(BRW_OPCODE_AND, g1, f0, g1); + fs_inst *inst = new (mem_ctx) fs_inst(BRW_OPCODE_AND, 16, g1, f0, g1); inst->force_writemask_all = true; insts.push_tail(inst); } @@ -84,7 +84,7 @@ brw_blorp_eu_emitter::emit_texture_lookup(const struct brw_reg &dst, unsigned base_mrf, unsigned msg_length) { - fs_inst *inst = new (mem_ctx) fs_inst(op, dst, brw_message_reg(base_mrf), + fs_inst *inst = new (mem_ctx) fs_inst(op, 16, dst, brw_message_reg(base_mrf), fs_reg(0u)); inst->base_mrf = base_mrf; @@ -119,7 +119,8 @@ brw_blorp_eu_emitter::emit_combine(enum opcode combine_opcode, { assert(combine_opcode == BRW_OPCODE_ADD || combine_opcode == BRW_OPCODE_AVG); - insts.push_tail(new (mem_ctx) fs_inst(combine_opcode, dst, src_1, src_2)); + insts.push_tail(new (mem_ctx) fs_inst(combine_opcode, 16, dst, + src_1, src_2)); } fs_inst * @@ -127,7 +128,7 @@ brw_blorp_eu_emitter::emit_cmp(enum brw_conditional_mod op, const struct brw_reg &x, const struct brw_reg &y) { - fs_inst *cmp = new (mem_ctx) fs_inst(BRW_OPCODE_CMP, + fs_inst *cmp = new (mem_ctx) fs_inst(BRW_OPCODE_CMP, 16, vec16(brw_null_reg()), x, y); cmp->conditional_mod = op; insts.push_tail(cmp); diff --git a/src/mesa/drivers/dri/i965/brw_cfg.cpp b/src/mesa/drivers/dri/i965/brw_cfg.cpp index f1f230e3751..91d53eff5a7 100644 --- a/src/mesa/drivers/dri/i965/brw_cfg.cpp +++ b/src/mesa/drivers/dri/i965/brw_cfg.cpp @@ -208,6 +208,7 @@ cfg_t::cfg_t(exec_list *instructions) cur_else = cur; next = new_block(); + assert(cur_if != NULL); cur_if->add_successor(mem_ctx, next); set_next_block(&cur, next, ip); @@ -274,6 +275,7 @@ cfg_t::cfg_t(exec_list *instructions) inst->exec_node::remove(); cur->instructions.push_tail(inst); + assert(cur_do != NULL); cur->add_successor(mem_ctx, cur_do); next = new_block(); @@ -287,6 +289,7 @@ cfg_t::cfg_t(exec_list *instructions) inst->exec_node::remove(); cur->instructions.push_tail(inst); + assert(cur_while != NULL); cur->add_successor(mem_ctx, cur_while); next = new_block(); diff --git a/src/mesa/drivers/dri/i965/brw_clear.c b/src/mesa/drivers/dri/i965/brw_clear.c index 1d4ba3cac7e..f981388ef1a 100644 --- a/src/mesa/drivers/dri/i965/brw_clear.c 
+++ b/src/mesa/drivers/dri/i965/brw_clear.c @@ -184,7 +184,7 @@ brw_fast_clear_depth(struct gl_context *ctx) * must be issued before the rectangle primitive used for the depth * buffer clear operation. */ - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); if (fb->MaxNumLayers > 0) { for (unsigned layer = 0; layer < depth_irb->layer_count; layer++) { @@ -204,7 +204,7 @@ brw_fast_clear_depth(struct gl_context *ctx) * by a PIPE_CONTROL command with DEPTH_STALL bit set and Then * followed by Depth FLUSH' */ - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); } /* Now, the HiZ buffer contains data that needs to be resolved to the depth diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index ebf12fab69e..328662da82e 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -506,6 +506,18 @@ brw_initialize_context_constants(struct brw_context *brw) ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicBuffers = BRW_MAX_ABO; ctx->Const.Program[MESA_SHADER_COMPUTE].MaxAtomicBuffers = BRW_MAX_ABO; ctx->Const.MaxCombinedAtomicBuffers = 3 * BRW_MAX_ABO; + + ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxImageUniforms = + BRW_MAX_IMAGES; + ctx->Const.Program[MESA_SHADER_VERTEX].MaxImageUniforms = + (brw->intelScreen->compiler->scalar_vs ? BRW_MAX_IMAGES : 0); + ctx->Const.Program[MESA_SHADER_COMPUTE].MaxImageUniforms = + BRW_MAX_IMAGES; + ctx->Const.MaxImageUnits = MAX_IMAGE_UNITS; + ctx->Const.MaxCombinedImageUnitsAndFragmentOutputs = + MAX_IMAGE_UNITS + BRW_MAX_DRAW_BUFFERS; + ctx->Const.MaxImageSamples = 0; + ctx->Const.MaxCombinedImageUniforms = 3 * BRW_MAX_IMAGES; } /* Gen6 converts quads to polygon in beginning of 3D pipeline, @@ -716,6 +728,7 @@ brwCreateContext(gl_api api, brw->is_baytrail = devinfo->is_baytrail; brw->is_haswell = devinfo->is_haswell; brw->is_cherryview = devinfo->is_cherryview; + brw->is_broxton = devinfo->is_broxton; brw->has_llc = devinfo->has_llc; brw->has_hiz = devinfo->has_hiz_and_separate_stencil; brw->has_separate_stencil = devinfo->has_hiz_and_separate_stencil; @@ -820,6 +833,12 @@ brwCreateContext(gl_api api, } } + if (brw_init_pipe_control(brw, devinfo)) { + *dri_ctx_error = __DRI_CTX_ERROR_NO_MEMORY; + intelDestroyContext(driContextPriv); + return false; + } + brw_init_state(brw); #endif @@ -867,6 +886,10 @@ brwCreateContext(gl_api api, brw->predicate.state = BRW_PREDICATE_STATE_RENDER; + brw->use_resource_streamer = screen->has_resource_streamer && + (brw_env_var_as_boolean("INTEL_USE_HW_BT", false) || + brw_env_var_as_boolean("INTEL_USE_GATHER", false)); + ctx->VertexProgram._MaintainTnlProgram = true; ctx->FragmentProgram._MaintainTexEnvProgram = true; @@ -935,6 +958,10 @@ intelDestroyContext(__DRIcontext * driContextPriv) if (brw->wm.base.scratch_bo) drm_intel_bo_unreference(brw->wm.base.scratch_bo); + gen7_reset_hw_bt_pool_offsets(brw); + drm_intel_bo_unreference(brw->hw_bt_pool.bo); + brw->hw_bt_pool.bo = NULL; + drm_intel_gem_context_destroy(brw->hw_ctx); if (ctx->swrast_context) { @@ -946,6 +973,7 @@ intelDestroyContext(__DRIcontext * driContextPriv) if (ctx->swrast_context) _swrast_DestroyContext(&brw->ctx); + brw_fini_pipe_control(brw); intel_batchbuffer_free(brw); drm_intel_bo_unreference(brw->throttle_batch[1]); diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 9e1f722df9e..1267a6f5a97 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -201,6 
+201,7 @@ enum brw_state_id { BRW_STATE_STATS_WM, BRW_STATE_UNIFORM_BUFFER, BRW_STATE_ATOMIC_BUFFER, + BRW_STATE_IMAGE_UNITS, BRW_STATE_META_IN_PROGRESS, BRW_STATE_INTERPOLATION_MAP, BRW_STATE_PUSH_CONSTANT_ALLOCATION, @@ -282,6 +283,7 @@ enum brw_state_id { #define BRW_NEW_STATS_WM (1ull << BRW_STATE_STATS_WM) #define BRW_NEW_UNIFORM_BUFFER (1ull << BRW_STATE_UNIFORM_BUFFER) #define BRW_NEW_ATOMIC_BUFFER (1ull << BRW_STATE_ATOMIC_BUFFER) +#define BRW_NEW_IMAGE_UNITS (1ull << BRW_STATE_IMAGE_UNITS) #define BRW_NEW_META_IN_PROGRESS (1ull << BRW_STATE_META_IN_PROGRESS) #define BRW_NEW_INTERPOLATION_MAP (1ull << BRW_STATE_INTERPOLATION_MAP) #define BRW_NEW_PUSH_CONSTANT_ALLOCATION (1ull << BRW_STATE_PUSH_CONSTANT_ALLOCATION) @@ -367,6 +369,7 @@ struct brw_stage_prog_data { GLuint nr_params; /**< number of float params/constants */ GLuint nr_pull_params; + unsigned nr_image_params; unsigned curb_read_length; unsigned total_scratch; @@ -387,6 +390,59 @@ struct brw_stage_prog_data { */ const gl_constant_value **param; const gl_constant_value **pull_param; + + /** + * Image metadata passed to the shader as uniforms. This is deliberately + * ignored by brw_stage_prog_data_compare() because its contents don't have + * any influence on program compilation. + */ + struct brw_image_param *image_param; +}; + +/* + * Image metadata structure as laid out in the shader parameter + * buffer. Entries have to be 16B-aligned for the vec4 back-end to be + * able to use them. That's okay because the padding and any unused + * entries [most of them except when we're doing untyped surface + * access] will be removed by the uniform packing pass. + */ +#define BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET 0 +#define BRW_IMAGE_PARAM_OFFSET_OFFSET 4 +#define BRW_IMAGE_PARAM_SIZE_OFFSET 8 +#define BRW_IMAGE_PARAM_STRIDE_OFFSET 12 +#define BRW_IMAGE_PARAM_TILING_OFFSET 16 +#define BRW_IMAGE_PARAM_SWIZZLING_OFFSET 20 +#define BRW_IMAGE_PARAM_SIZE 24 + +struct brw_image_param { + /** Surface binding table index. */ + uint32_t surface_idx; + + /** Offset applied to the X and Y surface coordinates. */ + uint32_t offset[2]; + + /** Surface X, Y and Z dimensions. */ + uint32_t size[3]; + + /** X-stride in bytes, Y-stride in pixels, horizontal slice stride in + * pixels, vertical slice stride in pixels. + */ + uint32_t stride[4]; + + /** Log2 of the tiling modulus in the X, Y and Z dimension. */ + uint32_t tiling[3]; + + /** + * Right shift to apply for bit 6 address swizzling. Two different + * swizzles can be specified and will be applied one after the other. The + * resulting address will be: + * + * addr' = addr ^ ((1 << 6) & ((addr >> swizzling[0]) ^ + * (addr >> swizzling[1]))) + * + * Use \c 0xff if any of the swizzles is not required. + */ + uint32_t swizzling[2]; }; /* Data about a particular attempt to compile a program. Note that @@ -416,11 +472,13 @@ struct brw_wm_prog_data { uint8_t computed_depth_mode; + bool early_fragment_tests; bool no_8; bool dual_src_blend; bool uses_pos_offset; bool uses_omask; bool uses_kill; + bool pulls_bary; uint32_t prog_offset_16; /** @@ -874,11 +932,12 @@ struct intel_batchbuffer { drm_intel_bo *bo; /** Last BO submitted to the hardware. Used for glFinish(). */ drm_intel_bo *last_bo; - /** BO for post-sync nonzero writes for gen6 workaround. 
*/ - drm_intel_bo *workaround_bo; +#ifdef DEBUG uint16_t emit, total; - uint16_t used, reserved_space; +#endif + uint16_t reserved_space; + uint32_t *map_next; uint32_t *map; uint32_t *cpu_map; #define BATCH_SZ (8192*sizeof(uint32_t)) @@ -887,10 +946,8 @@ struct intel_batchbuffer { enum brw_gpu_ring ring; bool needs_sol_reset; - uint8_t pipe_controls_since_last_cs_stall; - struct { - uint16_t used; + uint32_t *map_next; int reloc_count; } saved; }; @@ -1040,6 +1097,10 @@ struct brw_context drm_intel_context *hw_ctx; + /** BO for post-sync nonzero writes for gen6 workaround. */ + drm_intel_bo *workaround_bo; + uint8_t pipe_controls_since_last_cs_stall; + /** * Set of drm_intel_bo * that have been rendered to within this batchbuffer * and would need flushing before being used from another cache domain that @@ -1123,6 +1184,7 @@ struct brw_context bool is_baytrail; bool is_haswell; bool is_cherryview; + bool is_broxton; bool has_hiz; bool has_separate_stencil; @@ -1135,6 +1197,7 @@ struct brw_context bool has_pln; bool no_simd8; bool use_rep_send; + bool use_resource_streamer; /** * Some versions of Gen hardware don't do centroid interpolation correctly @@ -1241,12 +1304,12 @@ struct brw_context * Platform specific constants containing the maximum number of threads * for each pipeline stage. */ - int max_vs_threads; - int max_hs_threads; - int max_ds_threads; - int max_gs_threads; - int max_wm_threads; - int max_cs_threads; + unsigned max_vs_threads; + unsigned max_hs_threads; + unsigned max_ds_threads; + unsigned max_gs_threads; + unsigned max_wm_threads; + unsigned max_cs_threads; /* BRW_NEW_URB_ALLOCATIONS: */ @@ -1398,6 +1461,12 @@ struct brw_context struct brw_cs_prog_data *prog_data; } cs; + /* RS hardware binding table */ + struct { + drm_intel_bo *bo; + uint32_t next_offset; + } hw_bt_pool; + struct { uint32_t state_offset; uint32_t blend_state_offset; @@ -1453,8 +1522,8 @@ struct brw_context } perfmon; int num_atoms[BRW_NUM_PIPELINES]; - const struct brw_tracked_state render_atoms[57]; - const struct brw_tracked_state compute_atoms[3]; + const struct brw_tracked_state render_atoms[60]; + const struct brw_tracked_state compute_atoms[4]; /* If (INTEL_DEBUG & DEBUG_BATCH) */ struct { @@ -1732,11 +1801,17 @@ void brw_upload_abo_surfaces(struct brw_context *brw, struct gl_shader_program *prog, struct brw_stage_state *stage_state, struct brw_stage_prog_data *prog_data); +void brw_upload_image_surfaces(struct brw_context *brw, + struct gl_shader *shader, + struct brw_stage_state *stage_state, + struct brw_stage_prog_data *prog_data); /* brw_surface_formats.c */ bool brw_render_target_supported(struct brw_context *brw, struct gl_renderbuffer *rb); uint32_t brw_depth_format(struct brw_context *brw, mesa_format format); +mesa_format brw_lower_mesa_image_format(const struct brw_device_info *devinfo, + mesa_format format); /* brw_performance_monitor.c */ void brw_init_performance_monitors(struct brw_context *brw); @@ -2013,6 +2088,21 @@ bool gen9_use_linear_1d_layout(const struct brw_context *brw, const struct intel_mipmap_tree *mt); +/* brw_pipe_control.c */ +int brw_init_pipe_control(struct brw_context *brw, + const struct brw_device_info *info); +void brw_fini_pipe_control(struct brw_context *brw); + +void brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags); +void brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags, + drm_intel_bo *bo, uint32_t offset, + uint32_t imm_lower, uint32_t imm_upper); +void brw_emit_mi_flush(struct brw_context *brw); +void 
brw_emit_post_sync_nonzero_flush(struct brw_context *brw); +void brw_emit_depth_stall_flushes(struct brw_context *brw); +void gen7_emit_vs_workaround_flush(struct brw_context *brw); +void gen7_emit_cs_stall_flush(struct brw_context *brw); + #ifdef __cplusplus } #endif diff --git a/src/mesa/drivers/dri/i965/brw_cs.cpp b/src/mesa/drivers/dri/i965/brw_cs.cpp index 42a082b57b6..6ce5779137e 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.cpp +++ b/src/mesa/drivers/dri/i965/brw_cs.cpp @@ -82,7 +82,7 @@ brw_cs_emit(struct brw_context *brw, prog_data->local_size[0] = cp->LocalSize[0]; prog_data->local_size[1] = cp->LocalSize[1]; prog_data->local_size[2] = cp->LocalSize[2]; - int local_workgroup_size = + unsigned local_workgroup_size = cp->LocalSize[0] * cp->LocalSize[1] * cp->LocalSize[2]; cfg_t *cfg = NULL; @@ -182,7 +182,8 @@ brw_codegen_cs_prog(struct brw_context *brw, * prog_data associated with the compiled program, and which will be freed * by the state cache. */ - int param_count = cs->num_uniform_components; + int param_count = cs->num_uniform_components + + cs->NumImages * BRW_IMAGE_PARAM_SIZE; /* The backend also sometimes adds params for texture size. */ param_count += 2 * ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits; @@ -190,7 +191,10 @@ brw_codegen_cs_prog(struct brw_context *brw, rzalloc_array(NULL, const gl_constant_value *, param_count); prog_data.base.pull_param = rzalloc_array(NULL, const gl_constant_value *, param_count); + prog_data.base.image_param = + rzalloc_array(NULL, struct brw_image_param, cs->NumImages); prog_data.base.nr_params = param_count; + prog_data.base.nr_image_params = cs->NumImages; program = brw_cs_emit(brw, mem_ctx, key, &prog_data, &cp->program, prog, &program_size); @@ -291,6 +295,17 @@ brw_cs_precompile(struct gl_context *ctx, } +static unsigned +get_cs_thread_count(const struct brw_cs_prog_data *cs_prog_data) +{ + const unsigned simd_size = cs_prog_data->simd_size; + unsigned group_size = cs_prog_data->local_size[0] * + cs_prog_data->local_size[1] * cs_prog_data->local_size[2]; + + return (group_size + simd_size - 1) / simd_size; +} + + static void brw_upload_cs_state(struct brw_context *brw) { @@ -316,6 +331,8 @@ brw_upload_cs_state(struct brw_context *brw) prog_data->binding_table.size_bytes, 32, &stage_state->bind_bo_offset); + unsigned threads = get_cs_thread_count(cs_prog_data); + uint32_t dwords = brw->gen < 8 ? 8 : 9; BEGIN_BATCH(dwords); OUT_BATCH(MEDIA_VFE_STATE << 16 | (dwords - 2)); @@ -365,6 +382,13 @@ brw_upload_cs_state(struct brw_context *brw) desc[dw++] = 0; desc[dw++] = 0; desc[dw++] = stage_state->bind_bo_offset; + desc[dw++] = 0; + const uint32_t media_threads = + brw->gen >= 8 ? 
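The get_cs_thread_count() helper added above is just a ceiling division of the work-group size by the compiled SIMD width, and the result is what gets packed into the MEDIA thread-count field a few dwords below (and asserted against brw->max_cs_threads). A standalone worked example of that arithmetic, with a local DIV_ROUND_UP so it compiles on its own:

#include <assert.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned
cs_thread_count(unsigned local_x, unsigned local_y, unsigned local_z,
                unsigned simd_size)
{
   return DIV_ROUND_UP(local_x * local_y * local_z, simd_size);
}

int main(void)
{
   /* A 16x16x1 work group compiled SIMD16 needs 256/16 = 16 HW threads. */
   assert(cs_thread_count(16, 16, 1, 16) == 16);
   /* A 7x7x1 work group compiled SIMD8 needs ceil(49/8) = 7 threads. */
   assert(cs_thread_count(7, 7, 1, 8) == 7);
   printf("ok\n");
   return 0;
}
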
+ SET_FIELD(threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) : + SET_FIELD(threads, MEDIA_GPGPU_THREAD_COUNT); + assert(threads <= brw->max_cs_threads); + desc[dw++] = media_threads; BEGIN_BATCH(4); OUT_BATCH(MEDIA_INTERFACE_DESCRIPTOR_LOAD << 16 | (4 - 2)); diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c index befd7a9538c..a149ce3ba12 100644 --- a/src/mesa/drivers/dri/i965/brw_curbe.c +++ b/src/mesa/drivers/dri/i965/brw_curbe.c @@ -176,7 +176,7 @@ void brw_upload_cs_urb_state(struct brw_context *brw) ADVANCE_BATCH(); } -static GLfloat fixed_plane[6][4] = { +static const GLfloat fixed_plane[6][4] = { { 0, 0, -1, 1 }, { 0, 0, 1, 1 }, { 0, -1, 0, 1 }, diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index c113d52a3d3..3bbaf977bc5 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -877,6 +877,21 @@ enum opcode { * instructions. */ FS_OPCODE_FB_WRITE = 128, + + /** + * Same as FS_OPCODE_FB_WRITE but expects its arguments separately as + * individual sources instead of as a single payload blob: + * + * Source 0: [required] Color 0. + * Source 1: [optional] Color 1 (for dual source blend messages). + * Source 2: [optional] Src0 Alpha. + * Source 3: [optional] Source Depth (passthrough from the thread payload). + * Source 4: [optional] Destination Depth (gl_FragDepth). + * Source 5: [optional] Sample Mask (gl_SampleMask). + * Source 6: [required] Number of color components (as a UD immediate). + */ + FS_OPCODE_FB_WRITE_LOGICAL, + FS_OPCODE_BLORP_FB_WRITE, FS_OPCODE_REP_FB_WRITE, SHADER_OPCODE_RCP, @@ -890,18 +905,49 @@ enum opcode { SHADER_OPCODE_SIN, SHADER_OPCODE_COS, + /** + * Texture sampling opcodes. + * + * LOGICAL opcodes are eventually translated to the matching non-LOGICAL + * opcode but instead of taking a single payload blob they expect their + * arguments separately as individual sources: + * + * Source 0: [optional] Texture coordinates. + * Source 1: [optional] Shadow comparitor. + * Source 2: [optional] dPdx if the operation takes explicit derivatives, + * otherwise LOD value. + * Source 3: [optional] dPdy if the operation takes explicit derivatives. + * Source 4: [optional] Sample index. + * Source 5: [optional] MCS data. + * Source 6: [required] Texture sampler. + * Source 7: [optional] Texel offset. + * Source 8: [required] Number of coordinate components (as UD immediate). + * Source 9: [required] Number derivative components (as UD immediate). + */ SHADER_OPCODE_TEX, + SHADER_OPCODE_TEX_LOGICAL, SHADER_OPCODE_TXD, + SHADER_OPCODE_TXD_LOGICAL, SHADER_OPCODE_TXF, + SHADER_OPCODE_TXF_LOGICAL, SHADER_OPCODE_TXL, + SHADER_OPCODE_TXL_LOGICAL, SHADER_OPCODE_TXS, + SHADER_OPCODE_TXS_LOGICAL, FS_OPCODE_TXB, + FS_OPCODE_TXB_LOGICAL, SHADER_OPCODE_TXF_CMS, + SHADER_OPCODE_TXF_CMS_LOGICAL, SHADER_OPCODE_TXF_UMS, + SHADER_OPCODE_TXF_UMS_LOGICAL, SHADER_OPCODE_TXF_MCS, + SHADER_OPCODE_TXF_MCS_LOGICAL, SHADER_OPCODE_LOD, + SHADER_OPCODE_LOD_LOGICAL, SHADER_OPCODE_TG4, + SHADER_OPCODE_TG4_LOGICAL, SHADER_OPCODE_TG4_OFFSET, + SHADER_OPCODE_TG4_OFFSET_LOGICAL, /** * Combines multiple sources of size 1 into a larger virtual GRF. @@ -919,13 +965,33 @@ enum opcode { SHADER_OPCODE_SHADER_TIME_ADD, + /** + * Typed and untyped surface access opcodes. 
+ * + * LOGICAL opcodes are eventually translated to the matching non-LOGICAL + * opcode but instead of taking a single payload blob they expect their + * arguments separately as individual sources: + * + * Source 0: [required] Surface coordinates. + * Source 1: [optional] Operation source. + * Source 2: [required] Surface index. + * Source 3: [required] Number of coordinate components (as UD immediate). + * Source 4: [required] Opcode-specific control immediate, same as source 2 + * of the matching non-LOGICAL opcode. + */ SHADER_OPCODE_UNTYPED_ATOMIC, + SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, SHADER_OPCODE_UNTYPED_SURFACE_READ, + SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, SHADER_OPCODE_UNTYPED_SURFACE_WRITE, + SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, SHADER_OPCODE_TYPED_ATOMIC, + SHADER_OPCODE_TYPED_ATOMIC_LOGICAL, SHADER_OPCODE_TYPED_SURFACE_READ, + SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL, SHADER_OPCODE_TYPED_SURFACE_WRITE, + SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, SHADER_OPCODE_MEMORY_FENCE, @@ -971,7 +1037,6 @@ enum opcode { FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7, FS_OPCODE_MOV_DISPATCH_TO_FLAGS, FS_OPCODE_DISCARD_JUMP, - FS_OPCODE_SET_OMASK, FS_OPCODE_SET_SAMPLE_ID, FS_OPCODE_SET_SIMD4X2_OFFSET, FS_OPCODE_PACK_HALF_2x16_SPLIT, @@ -1151,6 +1216,11 @@ enum opcode { * GLSL barrier() */ SHADER_OPCODE_BARRIER, + + /** + * Calculate the high 32-bits of a 32x32 multiply. + */ + SHADER_OPCODE_MULH, }; enum brw_urb_write_flags { @@ -1642,6 +1712,36 @@ enum brw_message_target { #define _3DSTATE_BINDING_TABLE_POINTERS_GS 0x7829 /* GEN7+ */ #define _3DSTATE_BINDING_TABLE_POINTERS_PS 0x782A /* GEN7+ */ +#define _3DSTATE_BINDING_TABLE_POOL_ALLOC 0x7919 /* GEN7.5+ */ +#define BRW_HW_BINDING_TABLE_ENABLE (1 << 11) +#define GEN7_HW_BT_POOL_MOCS_SHIFT 7 +#define GEN7_HW_BT_POOL_MOCS_MASK INTEL_MASK(10, 7) +#define GEN8_HW_BT_POOL_MOCS_SHIFT 0 +#define GEN8_HW_BT_POOL_MOCS_MASK INTEL_MASK(6, 0) +/* Only required in HSW */ +#define HSW_BT_POOL_ALLOC_MUST_BE_ONE (3 << 5) + +#define _3DSTATE_BINDING_TABLE_EDIT_VS 0x7843 /* GEN7.5 */ +#define _3DSTATE_BINDING_TABLE_EDIT_GS 0x7844 /* GEN7.5 */ +#define _3DSTATE_BINDING_TABLE_EDIT_HS 0x7845 /* GEN7.5 */ +#define _3DSTATE_BINDING_TABLE_EDIT_DS 0x7846 /* GEN7.5 */ +#define _3DSTATE_BINDING_TABLE_EDIT_PS 0x7847 /* GEN7.5 */ +#define BRW_BINDING_TABLE_INDEX_SHIFT 16 +#define BRW_BINDING_TABLE_INDEX_MASK INTEL_MASK(23, 16) + +#define BRW_BINDING_TABLE_EDIT_TARGET_ALL 3 +#define BRW_BINDING_TABLE_EDIT_TARGET_CORE1 2 +#define BRW_BINDING_TABLE_EDIT_TARGET_CORE0 1 +/* In HSW, when editing binding table entries to surface state offsets, + * the surface state offset is a 16-bit value aligned to 32 bytes. But + * Surface State Pointer in dword 2 is [15:0]. Right shift surf_offset + * by 5 bits so it won't disturb bit 16 (which is used as the binding + * table index entry), otherwise it would hang the GPU. 
+ */ +#define HSW_SURFACE_STATE_EDIT(value) (value >> 5) +/* Same as Haswell, but surface state offsets now aligned to 64 bytes.*/ +#define GEN8_SURFACE_STATE_EDIT(value) (value >> 6) + #define _3DSTATE_SAMPLER_STATE_POINTERS 0x7802 /* GEN6+ */ # define PS_SAMPLER_STATE_CHANGE (1 << 12) # define GS_SAMPLER_STATE_CHANGE (1 << 9) @@ -1757,6 +1857,7 @@ enum brw_message_target { # define GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT 18 # define GEN6_VS_FLOATING_POINT_MODE_IEEE_754 (0 << 16) # define GEN6_VS_FLOATING_POINT_MODE_ALT (1 << 16) +# define HSW_VS_UAV_ACCESS_ENABLE (1 << 12) /* DW4 */ # define GEN6_VS_DISPATCH_START_GRF_SHIFT 20 # define GEN6_VS_URB_READ_LENGTH_SHIFT 11 @@ -1782,6 +1883,7 @@ enum brw_message_target { # define GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT 18 # define GEN6_GS_FLOATING_POINT_MODE_IEEE_754 (0 << 16) # define GEN6_GS_FLOATING_POINT_MODE_ALT (1 << 16) +# define HSW_GS_UAV_ACCESS_ENABLE (1 << 12) /* DW4 */ # define GEN7_GS_OUTPUT_VERTEX_SIZE_SHIFT 23 # define GEN7_GS_OUTPUT_TOPOLOGY_SHIFT 17 @@ -2147,6 +2249,7 @@ enum brw_pixel_shader_computed_depth_mode { # define GEN8_PSX_SHADER_DISABLES_ALPHA_TO_COVERAGE (1 << 7) # define GEN8_PSX_SHADER_IS_PER_SAMPLE (1 << 6) # define GEN8_PSX_SHADER_COMPUTES_STENCIL (1 << 5) +# define GEN9_PSX_SHADER_PULLS_BARY (1 << 3) # define GEN8_PSX_SHADER_HAS_UAV (1 << 2) # define GEN8_PSX_SHADER_USES_INPUT_COVERAGE_MASK (1 << 1) @@ -2283,6 +2386,9 @@ enum brw_wm_barycentric_interp_mode { # define GEN7_WM_KILL_ENABLE (1 << 25) # define GEN7_WM_COMPUTED_DEPTH_MODE_SHIFT 23 # define GEN7_WM_USES_SOURCE_DEPTH (1 << 20) +# define GEN7_WM_EARLY_DS_CONTROL_NORMAL (0 << 21) +# define GEN7_WM_EARLY_DS_CONTROL_PSEXEC (1 << 21) +# define GEN7_WM_EARLY_DS_CONTROL_PREPS (2 << 21) # define GEN7_WM_USES_SOURCE_W (1 << 19) # define GEN7_WM_POSITION_ZW_PIXEL (0 << 17) # define GEN7_WM_POSITION_ZW_CENTROID (2 << 17) @@ -2307,6 +2413,7 @@ enum brw_wm_barycentric_interp_mode { /* DW2 */ # define GEN7_WM_MSDISPMODE_PERSAMPLE (0 << 31) # define GEN7_WM_MSDISPMODE_PERPIXEL (1 << 31) +# define HSW_WM_UAV_ONLY (1 << 30) #define _3DSTATE_PS 0x7820 /* GEN7+ */ /* DW1: kernel pointer */ @@ -2330,6 +2437,7 @@ enum brw_wm_barycentric_interp_mode { # define GEN7_PS_RENDER_TARGET_FAST_CLEAR_ENABLE (1 << 8) # define GEN7_PS_DUAL_SOURCE_BLEND_ENABLE (1 << 7) # define GEN7_PS_RENDER_TARGET_RESOLVE_ENABLE (1 << 6) +# define HSW_PS_UAV_ACCESS_ENABLE (1 << 5) # define GEN7_PS_POSOFFSET_NONE (0 << 3) # define GEN7_PS_POSOFFSET_CENTROID (2 << 3) # define GEN7_PS_POSOFFSET_SAMPLE (3 << 3) @@ -2493,12 +2601,13 @@ enum brw_wm_barycentric_interp_mode { #define BDW_MOCS_WT 0x58 #define BDW_MOCS_PTE 0x18 -/* Skylake: MOCS is now an index into an array of 64 different configurable - * cache settings. We still use only either write-back or write-through; and - * rely on the documented default values. +/* Skylake: MOCS is now an index into an array of 62 different caching + * configurations programmed by the kernel. 
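Based on the comment above, here is a hypothetical helper showing how the HSW/GEN8 surface-state edit macros and the binding-table index field could combine into a single _3DSTATE_BINDING_TABLE_EDIT_* entry. The exact packing used by the driver is not shown in this hunk, so treat this strictly as an illustration of the shifts the comment describes.

#include <stdint.h>
#include <stdio.h>

#define BRW_BINDING_TABLE_INDEX_SHIFT   16
#define HSW_SURFACE_STATE_EDIT(value)   ((value) >> 5)
#define GEN8_SURFACE_STATE_EDIT(value)  ((value) >> 6)

/* Illustrative only: combine a binding-table slot index with a
 * 32-byte-aligned surface-state offset, shifting the offset down so it
 * cannot spill into bit 16 as the comment warns. */
static uint32_t
hsw_bt_edit_entry(unsigned index, uint32_t surf_offset)
{
   return ((uint32_t)index << BRW_BINDING_TABLE_INDEX_SHIFT) |
          HSW_SURFACE_STATE_EDIT(surf_offset);
}

int main(void)
{
   /* Slot 3, surface state at offset 0x8000 (32-byte aligned). */
   printf("0x%08x\n", (unsigned)hsw_bt_edit_entry(3, 0x8000)); /* 0x00030400 */
   return 0;
}
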
*/ -#define SKL_MOCS_WB (0b001001 << 1) -#define SKL_MOCS_WT (0b000101 << 1) +/* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */ +#define SKL_MOCS_WB (2 << 1) +/* TC=LLC/eLLC, LeCC=PTE, LRUM=3, L3CC=WB */ +#define SKL_MOCS_PTE (1 << 1) #define MEDIA_VFE_STATE 0x7000 /* GEN7 DW2, GEN8+ DW3 */ @@ -2519,6 +2628,11 @@ enum brw_wm_barycentric_interp_mode { # define MEDIA_VFE_STATE_CURBE_ALLOC_MASK INTEL_MASK(15, 0) #define MEDIA_INTERFACE_DESCRIPTOR_LOAD 0x7002 +/* GEN7 DW5, GEN8+ DW6 */ +# define MEDIA_GPGPU_THREAD_COUNT_SHIFT 0 +# define MEDIA_GPGPU_THREAD_COUNT_MASK INTEL_MASK(7, 0) +# define GEN8_MEDIA_GPGPU_THREAD_COUNT_SHIFT 0 +# define GEN8_MEDIA_GPGPU_THREAD_COUNT_MASK INTEL_MASK(9, 0) #define MEDIA_STATE_FLUSH 0x7004 #define GPGPU_WALKER 0x7105 /* GEN8+ DW2 */ diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c index a07b86e60e2..16c125d07ee 100644 --- a/src/mesa/drivers/dri/i965/brw_device_info.c +++ b/src/mesa/drivers/dri/i965/brw_device_info.c @@ -170,7 +170,8 @@ static const struct brw_device_info brw_device_info_byt = { #define HSW_FEATURES \ GEN7_FEATURES, \ .is_haswell = true, \ - .supports_simd16_3src = true + .supports_simd16_3src = true, \ + .has_resource_streamer = true static const struct brw_device_info brw_device_info_hsw_gt1 = { HSW_FEATURES, .gt = 1, @@ -229,6 +230,7 @@ static const struct brw_device_info brw_device_info_hsw_gt3 = { #define GEN8_FEATURES \ .gen = 8, \ .has_hiz_and_separate_stencil = true, \ + .has_resource_streamer = true, \ .must_use_separate_stencil = true, \ .has_llc = true, \ .has_pln = true, \ @@ -297,41 +299,62 @@ static const struct brw_device_info brw_device_info_chv = { } }; -/* Thread counts and URB limits are placeholders, and may not be accurate. */ #define GEN9_FEATURES \ .gen = 9, \ .has_hiz_and_separate_stencil = true, \ + .has_resource_streamer = true, \ .must_use_separate_stencil = true, \ .has_llc = true, \ .has_pln = true, \ - .max_vs_threads = 280, \ - .max_gs_threads = 256, \ - .max_wm_threads = 408, \ + .supports_simd16_3src = true, \ + .max_vs_threads = 336, \ + .max_gs_threads = 336, \ + .max_hs_threads = 336, \ + .max_ds_threads = 336, \ + .max_wm_threads = 64 * 6, \ + .max_cs_threads = 56, \ .urb = { \ - .size = 128, \ + .size = 192, \ .min_vs_entries = 64, \ - .max_vs_entries = 1664, \ + .max_vs_entries = 1856, \ + .max_hs_entries = 672, \ + .max_ds_entries = 1120, \ .max_gs_entries = 640, \ } -static const struct brw_device_info brw_device_info_skl_early = { - GEN9_FEATURES, .gt = 1, - .supports_simd16_3src = false, -}; - static const struct brw_device_info brw_device_info_skl_gt1 = { GEN9_FEATURES, .gt = 1, - .supports_simd16_3src = true, }; static const struct brw_device_info brw_device_info_skl_gt2 = { GEN9_FEATURES, .gt = 2, - .supports_simd16_3src = true, }; static const struct brw_device_info brw_device_info_skl_gt3 = { GEN9_FEATURES, .gt = 3, - .supports_simd16_3src = true, +}; + +static const struct brw_device_info brw_device_info_bxt = { + GEN9_FEATURES, + .is_broxton = 1, + .gt = 1, + .has_llc = false, + + /* XXX: These are preliminary thread counts and URB sizes. 
*/ + .max_vs_threads = 56, + .max_hs_threads = 56, + .max_ds_threads = 56, + .max_gs_threads = 56, + .max_wm_threads = 32, + .max_cs_threads = 28, + .urb = { + .size = 64, + .min_vs_entries = 34, + .max_vs_entries = 640, + .max_hs_entries = 80, + .max_ds_entries = 80, + .max_gs_entries = 256, + } }; const struct brw_device_info * @@ -348,9 +371,6 @@ brw_get_device_info(int devid, int revision) return NULL; } - if (devinfo->gen == 9 && (revision == 2 || revision == 3 || revision == -1)) - return &brw_device_info_skl_early; - return devinfo; } diff --git a/src/mesa/drivers/dri/i965/brw_device_info.h b/src/mesa/drivers/dri/i965/brw_device_info.h index 9192235fb0e..7bab5716b43 100644 --- a/src/mesa/drivers/dri/i965/brw_device_info.h +++ b/src/mesa/drivers/dri/i965/brw_device_info.h @@ -35,6 +35,7 @@ struct brw_device_info bool is_baytrail; bool is_haswell; bool is_cherryview; + bool is_broxton; bool has_hiz_and_separate_stencil; bool must_use_separate_stencil; @@ -45,6 +46,7 @@ struct brw_device_info bool has_compr4; bool has_surface_tile_offset; bool supports_simd16_3src; + bool has_resource_streamer; /** * Quirks: diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c index b91597a9f5d..e092ef4a7c6 100644 --- a/src/mesa/drivers/dri/i965/brw_draw.c +++ b/src/mesa/drivers/dri/i965/brw_draw.c @@ -104,13 +104,13 @@ get_hw_prim_for_gl_prim(int mode) * programs be immune to the active primitive (ie. cope with all * possibilities). That may not be realistic however. */ -static void brw_set_prim(struct brw_context *brw, - const struct _mesa_prim *prim) +static void +brw_set_prim(struct brw_context *brw, const struct _mesa_prim *prim) { struct gl_context *ctx = &brw->ctx; uint32_t hw_prim = get_hw_prim_for_gl_prim(prim->mode); - DBG("PRIM: %s\n", _mesa_lookup_enum_by_nr(prim->mode)); + DBG("PRIM: %s\n", _mesa_enum_to_string(prim->mode)); /* Slight optimization to avoid the GS program when not needed: */ @@ -138,15 +138,12 @@ static void brw_set_prim(struct brw_context *brw, } } -static void gen6_set_prim(struct brw_context *brw, - const struct _mesa_prim *prim) +static void +gen6_set_prim(struct brw_context *brw, const struct _mesa_prim *prim) { - uint32_t hw_prim; - - DBG("PRIM: %s\n", _mesa_lookup_enum_by_nr(prim->mode)); - - hw_prim = get_hw_prim_for_gl_prim(prim->mode); + DBG("PRIM: %s\n", _mesa_enum_to_string(prim->mode)); + const uint32_t hw_prim = get_hw_prim_for_gl_prim(prim->mode); if (hw_prim != brw->primitive) { brw->primitive = hw_prim; brw->ctx.NewDriverState |= BRW_NEW_PRIMITIVE; @@ -162,7 +159,8 @@ static void gen6_set_prim(struct brw_context *brw, * quads so that those dangling vertices won't get drawn when we convert to * trifans/tristrips. */ -static GLuint trim(GLenum prim, GLuint length) +static GLuint +trim(GLenum prim, GLuint length) { if (prim == GL_QUAD_STRIP) return length > 3 ? 
(length - length % 2) : 0; @@ -173,16 +171,16 @@ static GLuint trim(GLenum prim, GLuint length) } -static void brw_emit_prim(struct brw_context *brw, - const struct _mesa_prim *prim, - uint32_t hw_prim) +static void +brw_emit_prim(struct brw_context *brw, + const struct _mesa_prim *prim, + uint32_t hw_prim) { int verts_per_instance; int vertex_access_type; int indirect_flag; - int predicate_enable; - DBG("PRIM: %s %d %d\n", _mesa_lookup_enum_by_nr(prim->mode), + DBG("PRIM: %s %d %d\n", _mesa_enum_to_string(prim->mode), prim->start, prim->count); int start_vertex_location = prim->start; @@ -216,9 +214,8 @@ static void brw_emit_prim(struct brw_context *brw, * and missed flushes of the render cache as it heads to other parts of * the besides the draw code. */ - if (brw->always_flush_cache) { - intel_batchbuffer_emit_mi_flush(brw); - } + if (brw->always_flush_cache) + brw_emit_mi_flush(brw); /* If indirect, emit a bunch of loads from the indirect BO. */ if (prim->is_indirect) { @@ -256,22 +253,20 @@ static void brw_emit_prim(struct brw_context *brw, OUT_BATCH(0); ADVANCE_BATCH(); } - } - else { + } else { indirect_flag = 0; } + BEGIN_BATCH(brw->gen >= 7 ? 7 : 6); + if (brw->gen >= 7) { - if (brw->predicate.state == BRW_PREDICATE_STATE_USE_BIT) - predicate_enable = GEN7_3DPRIM_PREDICATE_ENABLE; - else - predicate_enable = 0; + const int predicate_enable = + (brw->predicate.state == BRW_PREDICATE_STATE_USE_BIT) + ? GEN7_3DPRIM_PREDICATE_ENABLE : 0; - BEGIN_BATCH(7); OUT_BATCH(CMD_3D_PRIM << 16 | (7 - 2) | indirect_flag | predicate_enable); OUT_BATCH(hw_prim | vertex_access_type); } else { - BEGIN_BATCH(6); OUT_BATCH(CMD_3D_PRIM << 16 | (6 - 2) | hw_prim << GEN4_3DPRIM_TOPOLOGY_TYPE_SHIFT | vertex_access_type); @@ -283,14 +278,14 @@ static void brw_emit_prim(struct brw_context *brw, OUT_BATCH(base_vertex_location); ADVANCE_BATCH(); - if (brw->always_flush_cache) { - intel_batchbuffer_emit_mi_flush(brw); - } + if (brw->always_flush_cache) + brw_emit_mi_flush(brw); } -static void brw_merge_inputs( struct brw_context *brw, - const struct gl_client_array *arrays[]) +static void +brw_merge_inputs(struct brw_context *brw, + const struct gl_client_array *arrays[]) { const struct gl_context *ctx = &brw->ctx; GLuint i; @@ -359,7 +354,8 @@ static void brw_merge_inputs( struct brw_context *brw, * Also mark any render targets which will be textured as needing a render * cache flush. */ -static void brw_postdraw_set_buffers_need_resolve(struct brw_context *brw) +static void +brw_postdraw_set_buffers_need_resolve(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; struct gl_framebuffer *fb = ctx->DrawBuffer; @@ -399,21 +395,22 @@ static void brw_postdraw_set_buffers_need_resolve(struct brw_context *brw) /* May fail if out of video memory for texture or vbo upload, or on * fallback conditions. 
*/ -static void brw_try_draw_prims( struct gl_context *ctx, - const struct gl_client_array *arrays[], - const struct _mesa_prim *prims, - GLuint nr_prims, - const struct _mesa_index_buffer *ib, - GLuint min_index, - GLuint max_index, - struct gl_buffer_object *indirect) +static void +brw_try_draw_prims(struct gl_context *ctx, + const struct gl_client_array *arrays[], + const struct _mesa_prim *prims, + GLuint nr_prims, + const struct _mesa_index_buffer *ib, + GLuint min_index, + GLuint max_index, + struct gl_buffer_object *indirect) { struct brw_context *brw = brw_context(ctx); GLuint i; bool fail_next = false; if (ctx->NewState) - _mesa_update_state( ctx ); + _mesa_update_state(ctx); /* Find the highest sampler unit used by each shader program. A bit-count * won't work since ARB programs use the texture unit number as the sampler @@ -433,7 +430,7 @@ static void brw_try_draw_prims( struct gl_context *ctx, * software fallback will segfault if it attempts to access any * texture level other than level 0. */ - brw_validate_textures( brw ); + brw_validate_textures(brw); intel_prepare_render(brw); @@ -445,7 +442,7 @@ static void brw_try_draw_prims( struct gl_context *ctx, /* Bind all inputs, derive varying and size information: */ - brw_merge_inputs( brw, arrays ); + brw_merge_inputs(brw, arrays); brw->ib.ib = ib; brw->ctx.NewDriverState |= BRW_NEW_INDICES; @@ -553,15 +550,17 @@ retry: return; } -void brw_draw_prims( struct gl_context *ctx, - const struct _mesa_prim *prims, - GLuint nr_prims, - const struct _mesa_index_buffer *ib, - GLboolean index_bounds_valid, - GLuint min_index, - GLuint max_index, - struct gl_transform_feedback_object *unused_tfb_object, - struct gl_buffer_object *indirect ) +void +brw_draw_prims(struct gl_context *ctx, + const struct _mesa_prim *prims, + GLuint nr_prims, + const struct _mesa_index_buffer *ib, + GLboolean index_bounds_valid, + GLuint min_index, + GLuint max_index, + struct gl_transform_feedback_object *unused_tfb_object, + unsigned stream, + struct gl_buffer_object *indirect) { struct brw_context *brw = brw_context(ctx); const struct gl_client_array **arrays = ctx->Array._DrawArrays; @@ -582,11 +581,11 @@ void brw_draw_prims( struct gl_context *ctx, */ if (ctx->RenderMode != GL_RENDER) { perf_debug("%s render mode not supported in hardware\n", - _mesa_lookup_enum_by_nr(ctx->RenderMode)); + _mesa_enum_to_string(ctx->RenderMode)); _swsetup_Wakeup(ctx); _tnl_wakeup(ctx); _tnl_draw_prims(ctx, prims, nr_prims, ib, - index_bounds_valid, min_index, max_index, NULL, NULL); + index_bounds_valid, min_index, max_index, NULL, 0, NULL); return; } @@ -604,26 +603,28 @@ void brw_draw_prims( struct gl_context *ctx, * manage it. swrast doesn't support our featureset, so we can't fall back * to it. 
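The new `stream` argument threaded through brw_draw_prims() above corresponds to the vertex-stream index a transform-feedback draw can name on the GL side; ordinary draws pass 0, as the updated _tnl_draw_prims() call shows. A minimal GL-side illustration of where a non-zero stream would come from (assumes a current GL 4.0+ context, glext prototypes enabled, and a transform feedback object `xfb` that previously captured stream 1):

#define GL_GLEXT_PROTOTYPES 1
#include <GL/gl.h>
#include <GL/glext.h>

/* Replays the primitives recorded on vertex stream 1 of a transform
 * feedback object; this is the sort of call that ends up handing a
 * non-zero stream index to the driver's draw_prims hook. */
static void
replay_stream1(GLuint xfb)
{
   glDrawTransformFeedbackStream(GL_POINTS, xfb, 1);
}
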
*/ - brw_try_draw_prims(ctx, arrays, prims, nr_prims, ib, min_index, max_index, indirect); + brw_try_draw_prims(ctx, arrays, prims, nr_prims, ib, min_index, max_index, + indirect); } -void brw_draw_init( struct brw_context *brw ) +void +brw_draw_init(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; struct vbo_context *vbo = vbo_context(ctx); - int i; /* Register our drawing function: */ vbo->draw_prims = brw_draw_prims; - for (i = 0; i < VERT_ATTRIB_MAX; i++) + for (int i = 0; i < VERT_ATTRIB_MAX; i++) brw->vb.inputs[i].buffer = -1; brw->vb.nr_buffers = 0; brw->vb.nr_enabled = 0; } -void brw_draw_destroy( struct brw_context *brw ) +void +brw_draw_destroy(struct brw_context *brw) { int i; diff --git a/src/mesa/drivers/dri/i965/brw_draw.h b/src/mesa/drivers/dri/i965/brw_draw.h index fc83dcdd0bb..f994726f5b6 100644 --- a/src/mesa/drivers/dri/i965/brw_draw.h +++ b/src/mesa/drivers/dri/i965/brw_draw.h @@ -34,7 +34,7 @@ struct brw_context; -void brw_draw_prims( struct gl_context *ctx, +void brw_draw_prims(struct gl_context *ctx, const struct _mesa_prim *prims, GLuint nr_prims, const struct _mesa_index_buffer *ib, @@ -42,6 +42,7 @@ void brw_draw_prims( struct gl_context *ctx, GLuint min_index, GLuint max_index, struct gl_transform_feedback_object *unused_tfb_object, + unsigned stream, struct gl_buffer_object *indirect ); void brw_draw_init( struct brw_context *brw ); diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c index 320e40e1007..cbfd5855410 100644 --- a/src/mesa/drivers/dri/i965/brw_draw_upload.c +++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c @@ -40,7 +40,7 @@ #include "intel_batchbuffer.h" #include "intel_buffer_objects.h" -static GLuint double_types[5] = { +static const GLuint double_types[5] = { 0, BRW_SURFACEFORMAT_R64_FLOAT, BRW_SURFACEFORMAT_R64G64_FLOAT, @@ -48,7 +48,7 @@ static GLuint double_types[5] = { BRW_SURFACEFORMAT_R64G64B64A64_FLOAT }; -static GLuint float_types[5] = { +static const GLuint float_types[5] = { 0, BRW_SURFACEFORMAT_R32_FLOAT, BRW_SURFACEFORMAT_R32G32_FLOAT, @@ -56,7 +56,7 @@ static GLuint float_types[5] = { BRW_SURFACEFORMAT_R32G32B32A32_FLOAT }; -static GLuint half_float_types[5] = { +static const GLuint half_float_types[5] = { 0, BRW_SURFACEFORMAT_R16_FLOAT, BRW_SURFACEFORMAT_R16G16_FLOAT, @@ -64,7 +64,7 @@ static GLuint half_float_types[5] = { BRW_SURFACEFORMAT_R16G16B16A16_FLOAT }; -static GLuint fixed_point_types[5] = { +static const GLuint fixed_point_types[5] = { 0, BRW_SURFACEFORMAT_R32_SFIXED, BRW_SURFACEFORMAT_R32G32_SFIXED, @@ -72,7 +72,7 @@ static GLuint fixed_point_types[5] = { BRW_SURFACEFORMAT_R32G32B32A32_SFIXED, }; -static GLuint uint_types_direct[5] = { +static const GLuint uint_types_direct[5] = { 0, BRW_SURFACEFORMAT_R32_UINT, BRW_SURFACEFORMAT_R32G32_UINT, @@ -80,7 +80,7 @@ static GLuint uint_types_direct[5] = { BRW_SURFACEFORMAT_R32G32B32A32_UINT }; -static GLuint uint_types_norm[5] = { +static const GLuint uint_types_norm[5] = { 0, BRW_SURFACEFORMAT_R32_UNORM, BRW_SURFACEFORMAT_R32G32_UNORM, @@ -88,7 +88,7 @@ static GLuint uint_types_norm[5] = { BRW_SURFACEFORMAT_R32G32B32A32_UNORM }; -static GLuint uint_types_scale[5] = { +static const GLuint uint_types_scale[5] = { 0, BRW_SURFACEFORMAT_R32_USCALED, BRW_SURFACEFORMAT_R32G32_USCALED, @@ -96,7 +96,7 @@ static GLuint uint_types_scale[5] = { BRW_SURFACEFORMAT_R32G32B32A32_USCALED }; -static GLuint int_types_direct[5] = { +static const GLuint int_types_direct[5] = { 0, BRW_SURFACEFORMAT_R32_SINT, 
BRW_SURFACEFORMAT_R32G32_SINT, @@ -104,7 +104,7 @@ static GLuint int_types_direct[5] = { BRW_SURFACEFORMAT_R32G32B32A32_SINT }; -static GLuint int_types_norm[5] = { +static const GLuint int_types_norm[5] = { 0, BRW_SURFACEFORMAT_R32_SNORM, BRW_SURFACEFORMAT_R32G32_SNORM, @@ -112,7 +112,7 @@ static GLuint int_types_norm[5] = { BRW_SURFACEFORMAT_R32G32B32A32_SNORM }; -static GLuint int_types_scale[5] = { +static const GLuint int_types_scale[5] = { 0, BRW_SURFACEFORMAT_R32_SSCALED, BRW_SURFACEFORMAT_R32G32_SSCALED, @@ -120,7 +120,7 @@ static GLuint int_types_scale[5] = { BRW_SURFACEFORMAT_R32G32B32A32_SSCALED }; -static GLuint ushort_types_direct[5] = { +static const GLuint ushort_types_direct[5] = { 0, BRW_SURFACEFORMAT_R16_UINT, BRW_SURFACEFORMAT_R16G16_UINT, @@ -128,7 +128,7 @@ static GLuint ushort_types_direct[5] = { BRW_SURFACEFORMAT_R16G16B16A16_UINT }; -static GLuint ushort_types_norm[5] = { +static const GLuint ushort_types_norm[5] = { 0, BRW_SURFACEFORMAT_R16_UNORM, BRW_SURFACEFORMAT_R16G16_UNORM, @@ -136,7 +136,7 @@ static GLuint ushort_types_norm[5] = { BRW_SURFACEFORMAT_R16G16B16A16_UNORM }; -static GLuint ushort_types_scale[5] = { +static const GLuint ushort_types_scale[5] = { 0, BRW_SURFACEFORMAT_R16_USCALED, BRW_SURFACEFORMAT_R16G16_USCALED, @@ -144,7 +144,7 @@ static GLuint ushort_types_scale[5] = { BRW_SURFACEFORMAT_R16G16B16A16_USCALED }; -static GLuint short_types_direct[5] = { +static const GLuint short_types_direct[5] = { 0, BRW_SURFACEFORMAT_R16_SINT, BRW_SURFACEFORMAT_R16G16_SINT, @@ -152,7 +152,7 @@ static GLuint short_types_direct[5] = { BRW_SURFACEFORMAT_R16G16B16A16_SINT }; -static GLuint short_types_norm[5] = { +static const GLuint short_types_norm[5] = { 0, BRW_SURFACEFORMAT_R16_SNORM, BRW_SURFACEFORMAT_R16G16_SNORM, @@ -160,7 +160,7 @@ static GLuint short_types_norm[5] = { BRW_SURFACEFORMAT_R16G16B16A16_SNORM }; -static GLuint short_types_scale[5] = { +static const GLuint short_types_scale[5] = { 0, BRW_SURFACEFORMAT_R16_SSCALED, BRW_SURFACEFORMAT_R16G16_SSCALED, @@ -168,7 +168,7 @@ static GLuint short_types_scale[5] = { BRW_SURFACEFORMAT_R16G16B16A16_SSCALED }; -static GLuint ubyte_types_direct[5] = { +static const GLuint ubyte_types_direct[5] = { 0, BRW_SURFACEFORMAT_R8_UINT, BRW_SURFACEFORMAT_R8G8_UINT, @@ -176,7 +176,7 @@ static GLuint ubyte_types_direct[5] = { BRW_SURFACEFORMAT_R8G8B8A8_UINT }; -static GLuint ubyte_types_norm[5] = { +static const GLuint ubyte_types_norm[5] = { 0, BRW_SURFACEFORMAT_R8_UNORM, BRW_SURFACEFORMAT_R8G8_UNORM, @@ -184,7 +184,7 @@ static GLuint ubyte_types_norm[5] = { BRW_SURFACEFORMAT_R8G8B8A8_UNORM }; -static GLuint ubyte_types_scale[5] = { +static const GLuint ubyte_types_scale[5] = { 0, BRW_SURFACEFORMAT_R8_USCALED, BRW_SURFACEFORMAT_R8G8_USCALED, @@ -192,7 +192,7 @@ static GLuint ubyte_types_scale[5] = { BRW_SURFACEFORMAT_R8G8B8A8_USCALED }; -static GLuint byte_types_direct[5] = { +static const GLuint byte_types_direct[5] = { 0, BRW_SURFACEFORMAT_R8_SINT, BRW_SURFACEFORMAT_R8G8_SINT, @@ -200,7 +200,7 @@ static GLuint byte_types_direct[5] = { BRW_SURFACEFORMAT_R8G8B8A8_SINT }; -static GLuint byte_types_norm[5] = { +static const GLuint byte_types_norm[5] = { 0, BRW_SURFACEFORMAT_R8_SNORM, BRW_SURFACEFORMAT_R8G8_SNORM, @@ -208,7 +208,7 @@ static GLuint byte_types_norm[5] = { BRW_SURFACEFORMAT_R8G8B8A8_SNORM }; -static GLuint byte_types_scale[5] = { +static const GLuint byte_types_scale[5] = { 0, BRW_SURFACEFORMAT_R8_SSCALED, BRW_SURFACEFORMAT_R8G8_SSCALED, @@ -230,7 +230,7 @@ brw_get_vertex_surface_type(struct brw_context *brw, if 
(unlikely(INTEL_DEBUG & DEBUG_VERTS)) fprintf(stderr, "type %s size %d normalized %d\n", - _mesa_lookup_enum_by_nr(glarray->Type), + _mesa_enum_to_string(glarray->Type), glarray->Size, glarray->Normalized); if (glarray->Integer) { @@ -604,14 +604,15 @@ brw_prepare_shader_draw_parameters(struct brw_context *brw) /** * Emit a VERTEX_BUFFER_STATE entry (part of 3DSTATE_VERTEX_BUFFERS). */ -static void +static uint32_t * emit_vertex_buffer_state(struct brw_context *brw, unsigned buffer_nr, drm_intel_bo *bo, unsigned bo_ending_address, unsigned bo_offset, unsigned stride, - unsigned step_rate) + unsigned step_rate, + uint32_t *__map) { struct gl_context *ctx = &brw->ctx; uint32_t dw0; @@ -643,9 +644,13 @@ emit_vertex_buffer_state(struct brw_context *brw, OUT_BATCH(0); } OUT_BATCH(step_rate); + + return __map; } +#define EMIT_VERTEX_BUFFER_STATE(...) __map = emit_vertex_buffer_state(__VA_ARGS__, __map) -static void brw_emit_vertices(struct brw_context *brw) +static void +brw_emit_vertices(struct brw_context *brw) { GLuint i; @@ -704,14 +709,14 @@ static void brw_emit_vertices(struct brw_context *brw) OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (4 * nr_buffers - 1)); for (i = 0; i < brw->vb.nr_buffers; i++) { struct brw_vertex_buffer *buffer = &brw->vb.buffers[i]; - emit_vertex_buffer_state(brw, i, buffer->bo, buffer->bo->size - 1, + EMIT_VERTEX_BUFFER_STATE(brw, i, buffer->bo, buffer->bo->size - 1, buffer->offset, buffer->stride, buffer->step_rate); } if (brw->vs.prog_data->uses_vertexid) { - emit_vertex_buffer_state(brw, brw->vb.nr_buffers, + EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers, brw->draw.draw_params_bo, brw->draw.draw_params_bo->size - 1, brw->draw.draw_params_offset, @@ -855,7 +860,8 @@ const struct brw_tracked_state brw_vertices = { .emit = brw_emit_vertices, }; -static void brw_upload_indices(struct brw_context *brw) +static void +brw_upload_indices(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; const struct _mesa_index_buffer *index_buffer = brw->ib.ib; @@ -935,7 +941,8 @@ const struct brw_tracked_state brw_indices = { .emit = brw_upload_indices, }; -static void brw_emit_index_buffer(struct brw_context *brw) +static void +brw_emit_index_buffer(struct brw_context *brw) { const struct _mesa_index_buffer *index_buffer = brw->ib.ib; GLuint cut_index_setting; diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index 0f536046f6f..4d397622fc1 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -1584,8 +1584,8 @@ brw_ENDIF(struct brw_codegen *p) } if (devinfo->gen < 6) { - brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); - brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); brw_set_src1(p, insn, brw_imm_d(0x0)); } else if (devinfo->gen == 6) { brw_set_dest(p, insn, brw_imm_w(0)); diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 8984b4cb3ca..0e091ddc227 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -68,28 +68,6 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, assert(dst.file != IMM && dst.file != UNIFORM); - /* If exec_size == 0, try to guess it from the registers. Since all - * manner of things may use hardware registers, we first try to guess - * based on GRF registers. 
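The EMIT_VERTEX_BUFFER_STATE wrapper in the brw_draw_upload.c hunk above illustrates a pattern used throughout this series: a helper that emits into an open batch takes the current write pointer as an extra argument and returns the advanced pointer, and a macro stores it back into the local __map that the OUT_BATCH machinery uses. A generic, self-contained sketch of the same idea (names are illustrative, not the driver's):

#include <stdint.h>
#include <stdio.h>

/* Emit a fixed-size packet into the buffer and hand the advanced write
 * pointer back to the caller, mirroring how emit_vertex_buffer_state()
 * now returns __map instead of relying on hidden state. */
static uint32_t *
emit_packet(uint32_t *map, uint32_t header, uint32_t payload)
{
   *map++ = header;
   *map++ = payload;
   return map;
}

/* The wrapper keeps call sites short while still threading the pointer. */
#define EMIT_PACKET(...) map = emit_packet(map, __VA_ARGS__)

int main(void)
{
   uint32_t batch[8];
   uint32_t *map = batch;

   EMIT_PACKET(0x7001u, 42);
   EMIT_PACKET(0x7002u, 43);

   printf("emitted %ld dwords\n", (long)(map - batch));
   return 0;
}
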
If this fails, we will go ahead and take the - * width from the destination register. - */ - if (this->exec_size == 0) { - if (dst.file == GRF) { - this->exec_size = dst.width; - } else { - for (unsigned i = 0; i < sources; ++i) { - if (src[i].file != GRF && src[i].file != ATTR) - continue; - - if (this->exec_size <= 1) - this->exec_size = src[i].width; - assert(src[i].width == 1 || src[i].width == this->exec_size); - } - } - - if (this->exec_size == 0 && dst.file != BAD_FILE) - this->exec_size = dst.width; - } assert(this->exec_size != 0); this->conditional_mod = BRW_CONDITIONAL_NONE; @@ -100,8 +78,8 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, case HW_REG: case MRF: case ATTR: - this->regs_written = - DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32); + this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size), + REG_SIZE); break; case BAD_FILE: this->regs_written = 0; @@ -126,9 +104,9 @@ fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size) init(opcode, exec_size, reg_undef, NULL, 0); } -fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst) +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst) { - init(opcode, 0, dst, NULL, 0); + init(opcode, exec_size, dst, NULL, 0); } fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, @@ -138,12 +116,6 @@ fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, init(opcode, exec_size, dst, src, 1); } -fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0) -{ - const fs_reg src[1] = { src0 }; - init(opcode, 0, dst, src, 1); -} - fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, const fs_reg &src0, const fs_reg &src1) { @@ -151,13 +123,6 @@ fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, init(opcode, exec_size, dst, src, 2); } -fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0, - const fs_reg &src1) -{ - const fs_reg src[2] = { src0, src1 }; - init(opcode, 0, dst, src, 2); -} - fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, const fs_reg &src0, const fs_reg &src1, const fs_reg &src2) { @@ -165,19 +130,6 @@ fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, init(opcode, exec_size, dst, src, 3); } -fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0, - const fs_reg &src1, const fs_reg &src2) -{ - const fs_reg src[3] = { src0, src1, src2 }; - init(opcode, 0, dst, src, 3); -} - -fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, - const fs_reg src[], unsigned sources) -{ - init(opcode, 0, dst, src, sources); -} - fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst, const fs_reg src[], unsigned sources) { @@ -236,7 +188,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3)); int scale = 1; - if (devinfo->gen == 4 && dst.width == 8) { + if (devinfo->gen == 4 && bld.dispatch_width() == 8) { /* Pre-gen5, we can either use a SIMD8 message that requires (header, * u, v, r) as parameters, or we can just use the SIMD16 message * consisting of (header, u). 
We choose the second, at the cost of a @@ -251,10 +203,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, else op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD; - assert(dst.width % 8 == 0); - int regs_written = 4 * (dst.width / 8) * scale; - fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), - dst.type, dst.width); + int regs_written = 4 * (bld.dispatch_width() / 8) * scale; + fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), dst.type); fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset); inst->regs_written = regs_written; @@ -264,10 +214,10 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, if (devinfo->gen == 4) inst->mlen = 3; else - inst->mlen = 1 + dispatch_width / 8; + inst->mlen = 1 + bld.dispatch_width() / 8; } - bld.MOV(dst, offset(vec4_result, (const_offset & 3) * scale)); + bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale)); } /** @@ -358,10 +308,14 @@ fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const for (int i = 0; i < this->sources; i++) { reg.type = this->src[i].type; - reg.width = this->src[i].width; if (!this->src[i].equals(reg)) return false; - reg = ::offset(reg, 1); + + if (i < this->header_size) { + reg.reg_offset += 1; + } else { + reg.reg_offset += this->exec_size / 8; + } } return true; @@ -408,8 +362,8 @@ fs_reg::fs_reg(float f) init(); this->file = IMM; this->type = BRW_REGISTER_TYPE_F; + this->stride = 0; this->fixed_hw_reg.dw1.f = f; - this->width = 1; } /** Immediate value constructor. */ @@ -418,8 +372,8 @@ fs_reg::fs_reg(int32_t i) init(); this->file = IMM; this->type = BRW_REGISTER_TYPE_D; + this->stride = 0; this->fixed_hw_reg.dw1.d = i; - this->width = 1; } /** Immediate value constructor. */ @@ -428,8 +382,8 @@ fs_reg::fs_reg(uint32_t u) init(); this->file = IMM; this->type = BRW_REGISTER_TYPE_UD; + this->stride = 0; this->fixed_hw_reg.dw1.ud = u; - this->width = 1; } /** Vector float immediate value constructor. */ @@ -460,7 +414,6 @@ fs_reg::fs_reg(struct brw_reg fixed_hw_reg) this->file = HW_REG; this->fixed_hw_reg = fixed_hw_reg; this->type = fixed_hw_reg.type; - this->width = 1 << fixed_hw_reg.width; } bool @@ -475,7 +428,6 @@ fs_reg::equals(const fs_reg &r) const abs == r.abs && !reladdr && !r.reladdr && memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 && - width == r.width && stride == r.stride); } @@ -494,6 +446,15 @@ fs_reg::is_contiguous() const return stride == 1; } +unsigned +fs_reg::component_size(unsigned width) const +{ + const unsigned stride = (file != HW_REG ? this->stride : + fixed_hw_reg.hstride == 0 ? 0 : + 1 << (fixed_hw_reg.hstride - 1)); + return MAX2(width * stride, 1) * type_sz(type); +} + int fs_visitor::type_size(const struct glsl_type *type) { @@ -520,7 +481,10 @@ fs_visitor::type_size(const struct glsl_type *type) return 0; case GLSL_TYPE_ATOMIC_UINT: return 0; + case GLSL_TYPE_SUBROUTINE: + return 1; case GLSL_TYPE_IMAGE: + return BRW_IMAGE_PARAM_SIZE; case GLSL_TYPE_VOID: case GLSL_TYPE_ERROR: case GLSL_TYPE_INTERFACE: @@ -548,12 +512,12 @@ fs_visitor::get_timestamp(const fs_builder &bld) 0), BRW_REGISTER_TYPE_UD)); - fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4); + fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); /* We want to read the 3 fields we care about even if it's not enabled in * the dispatch. */ - bld.exec_all().MOV(dst, ts); + bld.group(4, 0).exec_all().MOV(dst, ts); /* The caller wants the low 32 bits of the timestamp. 
Since it's running * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds, @@ -598,19 +562,21 @@ fs_visitor::emit_shader_time_end() fs_reg start = shader_start_time; start.negate = true; - fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1); + fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); diff.set_smear(0); - ibld.ADD(diff, start, shader_end_time); + + const fs_builder cbld = ibld.group(1, 0); + cbld.group(1, 0).ADD(diff, start, shader_end_time); /* If there were no instructions between the two timestamp gets, the diff * is 2 cycles. Remove that overhead, so I can forget about that when * trying to determine the time taken for single instructions. */ - ibld.ADD(diff, diff, fs_reg(-2u)); - SHADER_TIME_ADD(ibld, 0, diff); - SHADER_TIME_ADD(ibld, 1, fs_reg(1u)); + cbld.ADD(diff, diff, fs_reg(-2u)); + SHADER_TIME_ADD(cbld, 0, diff); + SHADER_TIME_ADD(cbld, 1, fs_reg(1u)); ibld.emit(BRW_OPCODE_ELSE); - SHADER_TIME_ADD(ibld, 2, fs_reg(1u)); + SHADER_TIME_ADD(cbld, 2, fs_reg(1u)); ibld.emit(BRW_OPCODE_ENDIF); } @@ -695,50 +661,160 @@ bool fs_inst::is_partial_write() const { return ((this->predicate && this->opcode != BRW_OPCODE_SEL) || - (this->dst.width * type_sz(this->dst.type)) < 32 || + (this->exec_size * type_sz(this->dst.type)) < 32 || !this->dst.is_contiguous()); } +unsigned +fs_inst::components_read(unsigned i) const +{ + switch (opcode) { + case FS_OPCODE_LINTERP: + if (i == 0) + return 2; + else + return 1; + + case FS_OPCODE_PIXEL_X: + case FS_OPCODE_PIXEL_Y: + assert(i == 0); + return 2; + + case FS_OPCODE_FB_WRITE_LOGICAL: + assert(src[6].file == IMM); + /* First/second FB write color. */ + if (i < 2) + return src[6].fixed_hw_reg.dw1.ud; + else + return 1; + + case SHADER_OPCODE_TEX_LOGICAL: + case SHADER_OPCODE_TXD_LOGICAL: + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXL_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXF_UMS_LOGICAL: + case SHADER_OPCODE_TXF_MCS_LOGICAL: + case SHADER_OPCODE_LOD_LOGICAL: + case SHADER_OPCODE_TG4_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + assert(src[8].file == IMM && src[9].file == IMM); + /* Texture coordinates. */ + if (i == 0) + return src[8].fixed_hw_reg.dw1.ud; + /* Texture derivatives. */ + else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL) + return src[9].fixed_hw_reg.dw1.ud; + /* Texture offset. */ + else if (i == 7) + return 2; + else + return 1; + + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + assert(src[3].file == IMM); + /* Surface coordinates. */ + if (i == 0) + return src[3].fixed_hw_reg.dw1.ud; + /* Surface operation source (ignored for reads). */ + else if (i == 1) + return 0; + else + return 1; + + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + assert(src[3].file == IMM && + src[4].file == IMM); + /* Surface coordinates. */ + if (i == 0) + return src[3].fixed_hw_reg.dw1.ud; + /* Surface operation source. */ + else if (i == 1) + return src[4].fixed_hw_reg.dw1.ud; + else + return 1; + + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: { + assert(src[3].file == IMM && + src[4].file == IMM); + const unsigned op = src[4].fixed_hw_reg.dw1.ud; + /* Surface coordinates. */ + if (i == 0) + return src[3].fixed_hw_reg.dw1.ud; + /* Surface operation source. 
*/ + else if (i == 1 && op == BRW_AOP_CMPWR) + return 2; + else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC || + op == BRW_AOP_PREDEC)) + return 0; + else + return 1; + } + + default: + return 1; + } +} + int fs_inst::regs_read(int arg) const { - if (is_tex() && arg == 0 && src[0].file == GRF) { - return mlen; - } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) { - return mlen; - } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) { - return mlen; - } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) { - return mlen; - } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) { - return mlen; - } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) { - return mlen; - } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) { - return mlen; - } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) { - return mlen; - } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) { - return mlen; - } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) { - return mlen; - } else if (opcode == FS_OPCODE_LINTERP && arg == 0) { - return exec_size / 4; + switch (opcode) { + case FS_OPCODE_FB_WRITE: + case SHADER_OPCODE_URB_WRITE_SIMD8: + case SHADER_OPCODE_UNTYPED_ATOMIC: + case SHADER_OPCODE_UNTYPED_SURFACE_READ: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: + case SHADER_OPCODE_TYPED_ATOMIC: + case SHADER_OPCODE_TYPED_SURFACE_READ: + case SHADER_OPCODE_TYPED_SURFACE_WRITE: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + if (arg == 0) + return mlen; + break; + + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: + /* The payload is actually stored in src1 */ + if (arg == 1) + return mlen; + break; + + case FS_OPCODE_LINTERP: + if (arg == 1) + return 1; + break; + + case SHADER_OPCODE_LOAD_PAYLOAD: + if (arg < this->header_size) + return 1; + break; + + case CS_OPCODE_CS_TERMINATE: + return 1; + + default: + if (is_tex() && arg == 0 && src[0].file == GRF) + return mlen; + break; } switch (src[arg].file) { case BAD_FILE: + return 0; case UNIFORM: case IMM: return 1; case GRF: + case ATTR: case HW_REG: - if (src[arg].stride == 0) { - return 1; - } else { - int size = src[arg].width * src[arg].stride * type_sz(src[arg].type); - return (size + 31) / 32; - } + return DIV_ROUND_UP(components_read(arg) * + src[arg].component_size(exec_size), + REG_SIZE); case MRF: unreachable("MRF registers are not allowed as sources"); default: @@ -832,7 +908,7 @@ fs_visitor::vgrf(const glsl_type *const type) { int reg_width = dispatch_width / 8; return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width), - brw_type_for_base_type(type), dispatch_width); + brw_type_for_base_type(type)); } /** Fixed HW reg constructor. */ @@ -842,14 +918,7 @@ fs_reg::fs_reg(enum register_file file, int reg) this->file = file; this->reg = reg; this->type = BRW_REGISTER_TYPE_F; - - switch (file) { - case UNIFORM: - this->width = 1; - break; - default: - this->width = 8; - } + this->stride = (file == UNIFORM ? 0 : 1); } /** Fixed HW reg constructor. */ @@ -859,25 +928,7 @@ fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type) this->file = file; this->reg = reg; this->type = type; - - switch (file) { - case UNIFORM: - this->width = 1; - break; - default: - this->width = 8; - } -} - -/** Fixed HW reg constructor. 
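The rewritten regs_read() above boils down to DIV_ROUND_UP(components_read(arg) * component_size(exec_size), REG_SIZE), where a GRF is 32 bytes on these parts. A standalone check of that arithmetic for a few representative cases; this is only the math, not driver code:

#include <assert.h>
#include <stdio.h>

#define REG_SIZE 32   /* bytes per GRF register */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned
regs_read(unsigned components, unsigned exec_size,
          unsigned stride, unsigned type_sz)
{
   /* component_size(exec_size) from the hunk above:
    * MAX2(exec_size * stride, 1) * type_sz. */
   const unsigned width = exec_size * stride;
   const unsigned component_size = (width > 1 ? width : 1) * type_sz;
   return DIV_ROUND_UP(components * component_size, REG_SIZE);
}

int main(void)
{
   /* One SIMD16 float component with stride 1: 16 * 4 = 64 bytes = 2 GRFs. */
   assert(regs_read(1, 16, 1, 4) == 2);
   /* Untyped atomic CMPWR reads two data components (compare + swap):
    * SIMD8, stride 1, 32-bit gives 2 * 8 * 4 = 64 bytes = 2 GRFs. */
   assert(regs_read(2, 8, 1, 4) == 2);
   /* A stride-0 (uniform-like) source collapses to 4 bytes = 1 GRF. */
   assert(regs_read(1, 8, 0, 4) == 1);
   printf("ok\n");
   return 0;
}
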
*/ -fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type, - uint8_t width) -{ - init(); - this->file = file; - this->reg = reg; - this->type = type; - this->width = width; + this->stride = (file == UNIFORM ? 0 : 1); } /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch. @@ -892,6 +943,18 @@ fs_visitor::import_uniforms(fs_visitor *v) this->param_size = v->param_size; } +void +fs_visitor::setup_vector_uniform_values(const gl_constant_value *values, unsigned n) +{ + static const gl_constant_value zero = { 0 }; + + for (unsigned i = 0; i < n; ++i) + stage_prog_data->param[uniforms++] = &values[i]; + + for (unsigned i = n; i < 4; ++i) + stage_prog_data->param[uniforms++] = &zero; +} + fs_reg * fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer, bool origin_upper_left) @@ -908,23 +971,23 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer, } else { bld.ADD(wpos, this->pixel_x, fs_reg(0.5f)); } - wpos = offset(wpos, 1); + wpos = offset(wpos, bld, 1); /* gl_FragCoord.y */ if (!flip && pixel_center_integer) { bld.MOV(wpos, this->pixel_y); } else { fs_reg pixel_y = this->pixel_y; - float offset = (pixel_center_integer ? 0.0 : 0.5); + float offset = (pixel_center_integer ? 0.0f : 0.5f); if (flip) { pixel_y.negate = true; - offset += key->drawable_height - 1.0; + offset += key->drawable_height - 1.0f; } bld.ADD(wpos, pixel_y, fs_reg(offset)); } - wpos = offset(wpos, 1); + wpos = offset(wpos, bld, 1); /* gl_FragCoord.z */ if (devinfo->gen >= 6) { @@ -934,7 +997,7 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer, this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], interp_reg(VARYING_SLOT_POS, 2)); } - wpos = offset(wpos, 1); + wpos = offset(wpos, bld, 1); /* gl_FragCoord.w: Already set up in emit_interpolation */ bld.MOV(wpos, this->wpos_w); @@ -1017,7 +1080,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name, /* If there's no incoming setup data for this slot, don't * emit interpolation for it. */ - attr = offset(attr, type->vector_elements); + attr = offset(attr, bld, type->vector_elements); location++; continue; } @@ -1032,7 +1095,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name, interp = suboffset(interp, 3); interp.type = attr.type; bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp)); - attr = offset(attr, 1); + attr = offset(attr, bld, 1); } } else { /* Smooth/noperspective interpolation case. 
*/ @@ -1070,7 +1133,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name, if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) { bld.MUL(attr, attr, this->pixel_w); } - attr = offset(attr, 1); + attr = offset(attr, bld, 1); } } @@ -1178,7 +1241,7 @@ fs_visitor::emit_samplepos_setup() } /* Compute gl_SamplePosition.x */ compute_sample_position(pos, int_sample_x); - pos = offset(pos, 1); + pos = offset(pos, abld, 1); if (dispatch_width == 8) { abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))); } else { @@ -1250,15 +1313,16 @@ fs_visitor::emit_sampleid_setup() return reg; } -void -fs_visitor::resolve_source_modifiers(fs_reg *src) +fs_reg +fs_visitor::resolve_source_modifiers(const fs_reg &src) { - if (!src->abs && !src->negate) - return; + if (!src.abs && !src.negate) + return src; - fs_reg temp = bld.vgrf(src->type); - bld.MOV(temp, *src); - *src = temp; + fs_reg temp = bld.vgrf(src.type); + bld.MOV(temp, src); + + return temp; } void @@ -1318,6 +1382,7 @@ fs_visitor::assign_curb_setup() constant_nr / 8, constant_nr % 8); + assert(inst->src[i].stride == 0); inst->src[i].file = HW_REG; inst->src[i].fixed_hw_reg = byte_offset( retype(brw_reg, inst->src[i].type), @@ -1867,11 +1932,12 @@ fs_visitor::demote_pull_constants() continue; /* Set up the annotation tracking for new generated instructions. */ - const fs_builder ibld = bld.annotate(inst->annotation, inst->ir) - .at(block, inst); + const fs_builder ibld(this, block, inst); fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start); fs_reg dst = vgrf(glsl_type::float_type); + assert(inst->src[i].stride == 0); + /* Generate a pull load into dst. */ if (inst->src[i].reladdr) { VARYING_PULL_CONSTANT_LOAD(ibld, dst, @@ -1879,9 +1945,11 @@ fs_visitor::demote_pull_constants() *inst->src[i].reladdr, pull_index); inst->src[i].reladdr = NULL; + inst->src[i].stride = 1; } else { + const fs_builder ubld = ibld.exec_all().group(8, 0); fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15); - ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst, surf_index, offset); inst->src[i].set_smear(pull_index & 3); } @@ -1890,7 +1958,6 @@ fs_visitor::demote_pull_constants() inst->src[i].file = GRF; inst->src[i].reg = dst.reg; inst->src[i].reg_offset = 0; - inst->src[i].width = dispatch_width; } } invalidate_live_intervals(); @@ -2158,11 +2225,11 @@ fs_visitor::opt_zero_samples() * "Parameter 0 is required except for the sampleinfo message, which * has no parameter 0" */ - while (inst->mlen > inst->header_size + dispatch_width / 8 && + while (inst->mlen > inst->header_size + inst->exec_size / 8 && load_payload->src[(inst->mlen - inst->header_size) / - (dispatch_width / 8) + + (inst->exec_size / 8) + inst->header_size - 1].is_zero()) { - inst->mlen -= dispatch_width / 8; + inst->mlen -= inst->exec_size / 8; progress = true; } } @@ -2199,7 +2266,8 @@ fs_visitor::opt_sampler_eot() return false; /* Look for a texturing instruction immediately before the final FB_WRITE. 
*/ - fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end(); + bblock_t *block = cfg->blocks[cfg->num_blocks - 1]; + fs_inst *fb_write = (fs_inst *)block->end(); assert(fb_write->eot); assert(fb_write->opcode == FS_OPCODE_FB_WRITE); @@ -2230,9 +2298,11 @@ fs_visitor::opt_sampler_eot() assert(!tex_inst->eot); /* We can't get here twice */ assert((tex_inst->offset & (0xff << 24)) == 0); + const fs_builder ibld(this, block, tex_inst); + tex_inst->offset |= fb_write->target << 24; tex_inst->eot = true; - tex_inst->dst = bld.null_reg_ud(); + tex_inst->dst = ibld.null_reg_ud(); fb_write->remove(cfg->blocks[cfg->num_blocks - 1]); /* If a header is present, marking the eot is sufficient. Otherwise, we need @@ -2244,8 +2314,8 @@ fs_visitor::opt_sampler_eot() if (tex_inst->header_size != 0) return true; - fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F, - load_payload->sources + 1); + fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F, + load_payload->sources + 1); fs_reg *new_sources = ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1); @@ -2307,12 +2377,12 @@ fs_visitor::opt_register_renaming() if (depth == 0 && inst->dst.file == GRF && - alloc.sizes[inst->dst.reg] == inst->dst.width / 8 && + alloc.sizes[inst->dst.reg] == inst->exec_size / 8 && !inst->is_partial_write()) { if (remap[dst] == -1) { remap[dst] = dst; } else { - remap[dst] = alloc.allocate(inst->dst.width / 8); + remap[dst] = alloc.allocate(inst->exec_size / 8); inst->dst.reg = remap[dst]; progress = true; } @@ -2443,7 +2513,7 @@ fs_visitor::compute_to_mrf() /* Things returning more than one register would need us to * understand coalescing out more than one MOV at a time. */ - if (scan_inst->regs_written > scan_inst->dst.width / 8) + if (scan_inst->regs_written > scan_inst->exec_size / 8) break; /* SEND instructions can't have MRF as a destination. 
*/ @@ -2780,7 +2850,8 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block, if (block->start() == scan_inst) { for (int i = 0; i < write_len; i++) { if (needs_dep[i]) - DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i); + DEP_RESOLVE_MOV(fs_builder(this, block, inst), + first_write_grf + i); } return; } @@ -2796,7 +2867,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block, if (reg >= first_write_grf && reg < first_write_grf + write_len && needs_dep[reg - first_write_grf]) { - DEP_RESOLVE_MOV(bld.at(block, inst), reg); + DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg); needs_dep[reg - first_write_grf] = false; if (scan_inst->exec_size == 16) needs_dep[reg - first_write_grf + 1] = false; @@ -2843,7 +2914,8 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins if (block->end() == scan_inst) { for (int i = 0; i < write_len; i++) { if (needs_dep[i]) - DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i); + DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst), + first_write_grf + i); } return; } @@ -2858,7 +2930,8 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins scan_inst->dst.reg >= first_write_grf && scan_inst->dst.reg < first_write_grf + write_len && needs_dep[scan_inst->dst.reg - first_write_grf]) { - DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg); + DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst), + scan_inst->dst.reg); needs_dep[scan_inst->dst.reg - first_write_grf] = false; } @@ -2928,14 +3001,18 @@ fs_visitor::lower_uniform_pull_constant_loads() assert(const_offset_reg.file == IMM && const_offset_reg.type == BRW_REGISTER_TYPE_UD); const_offset_reg.fixed_hw_reg.dw1.ud /= 4; - fs_reg payload = fs_reg(GRF, alloc.allocate(1)); - /* We have to use a message header on Skylake to get SIMD4x2 mode. - * Reserve space for the register. - */ + fs_reg payload, offset; if (devinfo->gen >= 9) { - payload.reg_offset++; - alloc.sizes[payload.reg] = 2; + /* We have to use a message header on Skylake to get SIMD4x2 + * mode. Reserve space for the register. + */ + offset = payload = fs_reg(GRF, alloc.allocate(2)); + offset.reg_offset++; + inst->mlen = 2; + } else { + offset = payload = fs_reg(GRF, alloc.allocate(1)); + inst->mlen = 1; } /* This is actually going to be a MOV, but since only the first dword @@ -2944,7 +3021,7 @@ fs_visitor::lower_uniform_pull_constant_loads() * by live variable analysis, or register allocation will explode. */ fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET, - 8, payload, const_offset_reg); + 8, offset, const_offset_reg); setup->force_writemask_all = true; setup->ir = inst->ir; @@ -2957,6 +3034,7 @@ fs_visitor::lower_uniform_pull_constant_loads() */ inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7; inst->src[1] = payload; + inst->base_mrf = -1; invalidate_live_intervals(); } else { @@ -2982,28 +3060,24 @@ fs_visitor::lower_load_payload() assert(inst->dst.file == MRF || inst->dst.file == GRF); assert(inst->saturate == false); - - const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf) - .exec_all(inst->force_writemask_all) - .at(block, inst); fs_reg dst = inst->dst; /* Get rid of COMPR4. 
We'll add it back in if we need it */ if (dst.file == MRF) dst.reg = dst.reg & ~BRW_MRF_COMPR4; - dst.width = 8; + const fs_builder ibld(this, block, inst); + const fs_builder hbld = ibld.exec_all().group(8, 0); + for (uint8_t i = 0; i < inst->header_size; i++) { if (inst->src[i].file != BAD_FILE) { fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD); fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD); - mov_src.width = 8; - ibld.exec_all().MOV(mov_dst, mov_src); + hbld.MOV(mov_dst, mov_src); } - dst = offset(dst, 1); + dst = offset(dst, hbld, 1); } - dst.width = inst->exec_size; if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) && inst->exec_size > 8) { /* In this case, the payload portion of the LOAD_PAYLOAD isn't @@ -3033,9 +3107,9 @@ fs_visitor::lower_load_payload() } else { /* Platform doesn't have COMPR4. We have to fake it */ fs_reg mov_dst = retype(dst, inst->src[i].type); - mov_dst.width = 8; ibld.half(0).MOV(mov_dst, half(inst->src[i], 0)); - ibld.half(1).MOV(offset(mov_dst, 4), half(inst->src[i], 1)); + mov_dst.reg += 4; + ibld.half(1).MOV(mov_dst, half(inst->src[i], 1)); } } @@ -3060,7 +3134,7 @@ fs_visitor::lower_load_payload() for (uint8_t i = inst->header_size; i < inst->sources; i++) { if (inst->src[i].file != BAD_FILE) ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]); - dst = offset(dst, 1); + dst = offset(dst, ibld, 1); } inst->remove(block); @@ -3078,158 +3152,989 @@ fs_visitor::lower_integer_multiplication() { bool progress = false; - /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation - * directly, but Cherryview cannot. - */ - if (devinfo->gen >= 8 && !devinfo->is_cherryview) - return false; - foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { - if (inst->opcode != BRW_OPCODE_MUL || - inst->dst.is_accumulator() || - (inst->dst.type != BRW_REGISTER_TYPE_D && - inst->dst.type != BRW_REGISTER_TYPE_UD)) - continue; + const fs_builder ibld(this, block, inst); - const fs_builder ibld = bld.at(block, inst); + if (inst->opcode == BRW_OPCODE_MUL) { + if (inst->dst.is_accumulator() || + (inst->dst.type != BRW_REGISTER_TYPE_D && + inst->dst.type != BRW_REGISTER_TYPE_UD)) + continue; - /* The MUL instruction isn't commutative. On Gen <= 6, only the low - * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of - * src1 are used. - * - * If multiplying by an immediate value that fits in 16-bits, do a - * single MUL instruction with that value in the proper location. - */ - if (inst->src[1].file == IMM && - inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) { - if (devinfo->gen < 7) { - fs_reg imm(GRF, alloc.allocate(dispatch_width / 8), - inst->dst.type, dispatch_width); - ibld.MOV(imm, inst->src[1]); - ibld.MUL(inst->dst, imm, inst->src[0]); - } else { - ibld.MUL(inst->dst, inst->src[0], inst->src[1]); - } - } else { - /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot - * do 32-bit integer multiplication in one instruction, but instead - * must do a sequence (which actually calculates a 64-bit result): - * - * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D - * mach(8) null g3<8,8,1>D g4<8,8,1>D - * mov(8) g2<1>D acc0<8,8,1>D - * - * But on Gen > 6, the ability to use second accumulator register - * (acc1) for non-float data types was removed, preventing a simple - * implementation in SIMD16. A 16-channel result can be calculated by - * executing the three instructions twice in SIMD8, once with quarter - * control of 1Q for the first eight channels and again with 2Q for - * the second eight channels. 
- * - * Which accumulator register is implicitly accessed (by AccWrEnable - * for instance) is determined by the quarter control. Unfortunately - * Ivybridge (and presumably Baytrail) has a hardware bug in which an - * implicit accumulator access by an instruction with 2Q will access - * acc1 regardless of whether the data type is usable in acc1. - * - * Specifically, the 2Q mach(8) writes acc1 which does not exist for - * integer data types. - * - * Since we only want the low 32-bits of the result, we can do two - * 32-bit x 16-bit multiplies (like the mul and mach are doing), and - * adjust the high result and add them (like the mach is doing): - * - * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW - * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW - * shl(8) g9<1>D g8<8,8,1>D 16D - * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D - * - * We avoid the shl instruction by realizing that we only want to add - * the low 16-bits of the "high" result to the high 16-bits of the - * "low" result and using proper regioning on the add: - * - * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW - * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW - * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW - * - * Since it does not use the (single) accumulator register, we can - * schedule multi-component multiplications much better. + /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit + * operation directly, but CHV/BXT cannot. */ + if (devinfo->gen >= 8 && + !devinfo->is_cherryview && !devinfo->is_broxton) + continue; - if (inst->conditional_mod && inst->dst.is_null()) { - inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8), - inst->dst.type, dispatch_width); - } - fs_reg low = inst->dst; - fs_reg high(GRF, alloc.allocate(dispatch_width / 8), - inst->dst.type, dispatch_width); + if (inst->src[1].file == IMM && + inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) { + /* The MUL instruction isn't commutative. On Gen <= 6, only the low + * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of + * src1 are used. + * + * If multiplying by an immediate value that fits in 16-bits, do a + * single MUL instruction with that value in the proper location. + */ + if (devinfo->gen < 7) { + fs_reg imm(GRF, alloc.allocate(dispatch_width / 8), + inst->dst.type); + ibld.MOV(imm, inst->src[1]); + ibld.MUL(inst->dst, imm, inst->src[0]); + } else { + ibld.MUL(inst->dst, inst->src[0], inst->src[1]); + } + } else { + /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot + * do 32-bit integer multiplication in one instruction, but instead + * must do a sequence (which actually calculates a 64-bit result): + * + * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D + * mach(8) null g3<8,8,1>D g4<8,8,1>D + * mov(8) g2<1>D acc0<8,8,1>D + * + * But on Gen > 6, the ability to use second accumulator register + * (acc1) for non-float data types was removed, preventing a simple + * implementation in SIMD16. A 16-channel result can be calculated by + * executing the three instructions twice in SIMD8, once with quarter + * control of 1Q for the first eight channels and again with 2Q for + * the second eight channels. + * + * Which accumulator register is implicitly accessed (by AccWrEnable + * for instance) is determined by the quarter control. Unfortunately + * Ivybridge (and presumably Baytrail) has a hardware bug in which an + * implicit accumulator access by an instruction with 2Q will access + * acc1 regardless of whether the data type is usable in acc1. + * + * Specifically, the 2Q mach(8) writes acc1 which does not exist for + * integer data types. 
+ * + * Since we only want the low 32-bits of the result, we can do two + * 32-bit x 16-bit multiplies (like the mul and mach are doing), and + * adjust the high result and add them (like the mach is doing): + * + * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW + * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW + * shl(8) g9<1>D g8<8,8,1>D 16D + * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D + * + * We avoid the shl instruction by realizing that we only want to add + * the low 16-bits of the "high" result to the high 16-bits of the + * "low" result and using proper regioning on the add: + * + * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW + * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW + * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW + * + * Since it does not use the (single) accumulator register, we can + * schedule multi-component multiplications much better. + */ - if (devinfo->gen >= 7) { - fs_reg src1_0_w = inst->src[1]; - fs_reg src1_1_w = inst->src[1]; + if (inst->conditional_mod && inst->dst.is_null()) { + inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8), + inst->dst.type); + } + fs_reg low = inst->dst; + fs_reg high(GRF, alloc.allocate(dispatch_width / 8), + inst->dst.type); + + if (devinfo->gen >= 7) { + fs_reg src1_0_w = inst->src[1]; + fs_reg src1_1_w = inst->src[1]; - if (inst->src[1].file == IMM) { - src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff; - src1_1_w.fixed_hw_reg.dw1.ud >>= 16; + if (inst->src[1].file == IMM) { + src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff; + src1_1_w.fixed_hw_reg.dw1.ud >>= 16; + } else { + src1_0_w.type = BRW_REGISTER_TYPE_UW; + if (src1_0_w.stride != 0) { + assert(src1_0_w.stride == 1); + src1_0_w.stride = 2; + } + + src1_1_w.type = BRW_REGISTER_TYPE_UW; + if (src1_1_w.stride != 0) { + assert(src1_1_w.stride == 1); + src1_1_w.stride = 2; + } + src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW); + } + ibld.MUL(low, inst->src[0], src1_0_w); + ibld.MUL(high, inst->src[0], src1_1_w); } else { - src1_0_w.type = BRW_REGISTER_TYPE_UW; - if (src1_0_w.stride != 0) { - assert(src1_0_w.stride == 1); - src1_0_w.stride = 2; + fs_reg src0_0_w = inst->src[0]; + fs_reg src0_1_w = inst->src[0]; + + src0_0_w.type = BRW_REGISTER_TYPE_UW; + if (src0_0_w.stride != 0) { + assert(src0_0_w.stride == 1); + src0_0_w.stride = 2; } - src1_1_w.type = BRW_REGISTER_TYPE_UW; - if (src1_1_w.stride != 0) { - assert(src1_1_w.stride == 1); - src1_1_w.stride = 2; + src0_1_w.type = BRW_REGISTER_TYPE_UW; + if (src0_1_w.stride != 0) { + assert(src0_1_w.stride == 1); + src0_1_w.stride = 2; } - src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW); - } - ibld.MUL(low, inst->src[0], src1_0_w); - ibld.MUL(high, inst->src[0], src1_1_w); - } else { - fs_reg src0_0_w = inst->src[0]; - fs_reg src0_1_w = inst->src[0]; + src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW); - src0_0_w.type = BRW_REGISTER_TYPE_UW; - if (src0_0_w.stride != 0) { - assert(src0_0_w.stride == 1); - src0_0_w.stride = 2; + ibld.MUL(low, src0_0_w, inst->src[1]); + ibld.MUL(high, src0_1_w, inst->src[1]); } - src0_1_w.type = BRW_REGISTER_TYPE_UW; - if (src0_1_w.stride != 0) { - assert(src0_1_w.stride == 1); - src0_1_w.stride = 2; + fs_reg dst = inst->dst; + dst.type = BRW_REGISTER_TYPE_UW; + dst.subreg_offset = 2; + dst.stride = 2; + + high.type = BRW_REGISTER_TYPE_UW; + high.stride = 2; + + low.type = BRW_REGISTER_TYPE_UW; + low.subreg_offset = 2; + low.stride = 2; + + ibld.ADD(dst, low, high); + + if (inst->conditional_mod) { + fs_reg null(retype(ibld.null_reg_f(), inst->dst.type)); + set_condmod(inst->conditional_mod, + ibld.MOV(null, inst->dst)); 
} - src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW); + } - ibld.MUL(low, src0_0_w, inst->src[1]); - ibld.MUL(high, src0_1_w, inst->src[1]); + } else if (inst->opcode == SHADER_OPCODE_MULH) { + /* Should have been lowered to 8-wide. */ + assert(inst->exec_size <= 8); + const fs_reg acc = retype(brw_acc_reg(inst->exec_size), + inst->dst.type); + fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]); + fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]); + + if (devinfo->gen >= 8) { + /* Until Gen8, integer multiplies read 32-bits from one source, + * and 16-bits from the other, and relying on the MACH instruction + * to generate the high bits of the result. + * + * On Gen8, the multiply instruction does a full 32x32-bit + * multiply, but in order to do a 64-bit multiply we can simulate + * the previous behavior and then use a MACH instruction. + * + * FINISHME: Don't use source modifiers on src1. + */ + assert(mul->src[1].type == BRW_REGISTER_TYPE_D || + mul->src[1].type == BRW_REGISTER_TYPE_UD); + mul->src[1].type = (type_is_signed(mul->src[1].type) ? + BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW); + mul->src[1].stride *= 2; + + } else if (devinfo->gen == 7 && !devinfo->is_haswell && + inst->force_sechalf) { + /* Among other things the quarter control bits influence which + * accumulator register is used by the hardware for instructions + * that access the accumulator implicitly (e.g. MACH). A + * second-half instruction would normally map to acc1, which + * doesn't exist on Gen7 and up (the hardware does emulate it for + * floating-point instructions *only* by taking advantage of the + * extra precision of acc0 not normally used for floating point + * arithmetic). + * + * HSW and up are careful enough not to try to access an + * accumulator register that doesn't exist, but on earlier Gen7 + * hardware we need to make sure that the quarter control bits are + * zero to avoid non-deterministic behaviour and emit an extra MOV + * to get the result masked correctly according to the current + * channel enables. 
+ */ + mach->force_sechalf = false; + mach->force_writemask_all = true; + mach->dst = ibld.vgrf(inst->dst.type); + ibld.MOV(inst->dst, mach->dst); } + } else { + continue; + } + + inst->remove(block); + progress = true; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +static void +setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key, + fs_reg *dst, fs_reg color, unsigned components) +{ + if (key->clamp_fragment_color) { + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4); + assert(color.type == BRW_REGISTER_TYPE_F); + + for (unsigned i = 0; i < components; i++) + set_saturate(true, + bld.MOV(offset(tmp, bld, i), offset(color, bld, i))); + + color = tmp; + } + + for (unsigned i = 0; i < components; i++) + dst[i] = offset(color, bld, i); +} + +static void +lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, + const brw_wm_prog_data *prog_data, + const brw_wm_prog_key *key, + const fs_visitor::thread_payload &payload) +{ + assert(inst->src[6].file == IMM); + const brw_device_info *devinfo = bld.shader->devinfo; + const fs_reg &color0 = inst->src[0]; + const fs_reg &color1 = inst->src[1]; + const fs_reg &src0_alpha = inst->src[2]; + const fs_reg &src_depth = inst->src[3]; + const fs_reg &dst_depth = inst->src[4]; + fs_reg sample_mask = inst->src[5]; + const unsigned components = inst->src[6].fixed_hw_reg.dw1.ud; + + /* We can potentially have a message length of up to 15, so we have to set + * base_mrf to either 0 or 1 in order to fit in m0..m15. + */ + fs_reg sources[15]; + int header_size = 2, payload_header_size; + unsigned length = 0; + + /* From the Sandy Bridge PRM, volume 4, page 198: + * + * "Dispatched Pixel Enables. One bit per pixel indicating + * which pixels were originally enabled when the thread was + * dispatched. This field is only required for the end-of- + * thread message and on all dual-source messages." + */ + if (devinfo->gen >= 6 && + (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) && + color1.file == BAD_FILE && + key->nr_color_regions == 1) { + header_size = 0; + } + + if (header_size != 0) { + assert(header_size == 2); + /* Allocate 2 registers for a header */ + length += 2; + } + + if (payload.aa_dest_stencil_reg) { + sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1)); + bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha") + .MOV(sources[length], + fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))); + length++; + } + + if (prog_data->uses_omask) { + sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1), + BRW_REGISTER_TYPE_UD); + + /* Hand over gl_SampleMask. Only the lower 16 bits of each channel are + * relevant. Since it's unsigned single words one vgrf is always + * 16-wide, but only the lower or higher 8 channels will be used by the + * hardware when doing a SIMD8 write depending on whether we have + * selected the subspans for the first or second half respectively. + */ + assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4); + sample_mask.type = BRW_REGISTER_TYPE_UW; + sample_mask.stride *= 2; + + bld.exec_all().annotate("FB write oMask") + .MOV(half(retype(sources[length], BRW_REGISTER_TYPE_UW), + inst->force_sechalf), + sample_mask); + length++; + } + + payload_header_size = length; + + if (src0_alpha.file != BAD_FILE) { + /* FIXME: This is being passed at the wrong location in the payload and + * doesn't work when gl_SampleMask and MRTs are used simultaneously. 
+ * It's supposed to be immediately before oMask but there seems to be no + * reasonable way to pass them in the correct order because LOAD_PAYLOAD + * requires header sources to form a contiguous segment at the beginning + * of the message and src0_alpha has per-channel semantics. + */ + setup_color_payload(bld, key, &sources[length], src0_alpha, 1); + length++; + } + + setup_color_payload(bld, key, &sources[length], color0, components); + length += 4; + + if (color1.file != BAD_FILE) { + setup_color_payload(bld, key, &sources[length], color1, components); + length += 4; + } + + if (src_depth.file != BAD_FILE) { + sources[length] = src_depth; + length++; + } + + if (dst_depth.file != BAD_FILE) { + sources[length] = dst_depth; + length++; + } + + fs_inst *load; + if (devinfo->gen >= 7) { + /* Send from the GRF */ + fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F); + load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size); + payload.reg = bld.shader->alloc.allocate(load->regs_written); + load->dst = payload; + + inst->src[0] = payload; + inst->resize_sources(1); + inst->base_mrf = -1; + } else { + /* Send from the MRF */ + load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F), + sources, length, payload_header_size); + + /* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD + * will do this for us if we just give it a COMPR4 destination. + */ + if (devinfo->gen < 6 && bld.dispatch_width() == 16) + load->dst.reg |= BRW_MRF_COMPR4; + + inst->resize_sources(0); + inst->base_mrf = 1; + } + + inst->opcode = FS_OPCODE_FB_WRITE; + inst->mlen = load->regs_written; + inst->header_size = header_size; +} + +static void +lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op, + const fs_reg &coordinate, + const fs_reg &shadow_c, + const fs_reg &lod, const fs_reg &lod2, + const fs_reg &sampler, + unsigned coord_components, + unsigned grad_components) +{ + const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB || + op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS); + fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F); + fs_reg msg_end = msg_begin; + + /* g0 header. */ + msg_end = offset(msg_end, bld.group(8, 0), 1); + + for (unsigned i = 0; i < coord_components; i++) + bld.MOV(retype(offset(msg_end, bld, i), coordinate.type), + offset(coordinate, bld, i)); + + msg_end = offset(msg_end, bld, coord_components); + + /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8 + * require all three components to be present and zero if they are unused. + */ + if (coord_components > 0 && + (has_lod || shadow_c.file != BAD_FILE || + (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) { + for (unsigned i = coord_components; i < 3; i++) + bld.MOV(offset(msg_end, bld, i), fs_reg(0.0f)); + + msg_end = offset(msg_end, bld, 3 - coord_components); + } + + if (op == SHADER_OPCODE_TXD) { + /* TXD unsupported in SIMD16 mode. */ + assert(bld.dispatch_width() == 8); + + /* the slots for u and v are always present, but r is optional */ + if (coord_components < 2) + msg_end = offset(msg_end, bld, 2 - coord_components); + + /* P = u, v, r + * dPdx = dudx, dvdx, drdx + * dPdy = dudy, dvdy, drdy + * + * 1-arg: Does not exist. 
+ * + * 2-arg: dudx dvdx dudy dvdy + * dPdx.x dPdx.y dPdy.x dPdy.y + * m4 m5 m6 m7 + * + * 3-arg: dudx dvdx drdx dudy dvdy drdy + * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z + * m5 m6 m7 m8 m9 m10 + */ + for (unsigned i = 0; i < grad_components; i++) + bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i)); + + msg_end = offset(msg_end, bld, MAX2(grad_components, 2)); + + for (unsigned i = 0; i < grad_components; i++) + bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i)); + + msg_end = offset(msg_end, bld, MAX2(grad_components, 2)); + } + + if (has_lod) { + /* Bias/LOD with shadow comparitor is unsupported in SIMD16 -- *Without* + * shadow comparitor (including RESINFO) it's unsupported in SIMD8 mode. + */ + assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 : + bld.dispatch_width() == 16); + + const brw_reg_type type = + (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ? + BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F); + bld.MOV(retype(msg_end, type), lod); + msg_end = offset(msg_end, bld, 1); + } + + if (shadow_c.file != BAD_FILE) { + if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) { + /* There's no plain shadow compare message, so we use shadow + * compare with a bias of 0.0. + */ + bld.MOV(msg_end, fs_reg(0.0f)); + msg_end = offset(msg_end, bld, 1); + } + + bld.MOV(msg_end, shadow_c); + msg_end = offset(msg_end, bld, 1); + } + + inst->opcode = op; + inst->src[0] = reg_undef; + inst->src[1] = sampler; + inst->resize_sources(2); + inst->base_mrf = msg_begin.reg; + inst->mlen = msg_end.reg - msg_begin.reg; + inst->header_size = 1; +} + +static void +lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op, + fs_reg coordinate, + const fs_reg &shadow_c, + fs_reg lod, fs_reg lod2, + const fs_reg &sample_index, + const fs_reg &sampler, + const fs_reg &offset_value, + unsigned coord_components, + unsigned grad_components) +{ + fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F); + fs_reg msg_coords = message; + unsigned header_size = 0; + + if (offset_value.file != BAD_FILE) { + /* The offsets set up by the visitor are in the m1 header, so we can't + * go headerless. 
+ */ + header_size = 1; + message.reg--; + } + + for (unsigned i = 0; i < coord_components; i++) { + bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type), coordinate); + coordinate = offset(coordinate, bld, 1); + } + fs_reg msg_end = offset(msg_coords, bld, coord_components); + fs_reg msg_lod = offset(msg_coords, bld, 4); + + if (shadow_c.file != BAD_FILE) { + fs_reg msg_shadow = msg_lod; + bld.MOV(msg_shadow, shadow_c); + msg_lod = offset(msg_shadow, bld, 1); + msg_end = msg_lod; + } + + switch (op) { + case SHADER_OPCODE_TXL: + case FS_OPCODE_TXB: + bld.MOV(msg_lod, lod); + msg_end = offset(msg_lod, bld, 1); + break; + case SHADER_OPCODE_TXD: + /** + * P = u, v, r + * dPdx = dudx, dvdx, drdx + * dPdy = dudy, dvdy, drdy + * + * Load up these values: + * - dudx dudy dvdx dvdy drdx drdy + * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z + */ + msg_end = msg_lod; + for (unsigned i = 0; i < grad_components; i++) { + bld.MOV(msg_end, lod); + lod = offset(lod, bld, 1); + msg_end = offset(msg_end, bld, 1); + + bld.MOV(msg_end, lod2); + lod2 = offset(lod2, bld, 1); + msg_end = offset(msg_end, bld, 1); + } + break; + case SHADER_OPCODE_TXS: + msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD); + bld.MOV(msg_lod, lod); + msg_end = offset(msg_lod, bld, 1); + break; + case SHADER_OPCODE_TXF: + msg_lod = offset(msg_coords, bld, 3); + bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod); + msg_end = offset(msg_lod, bld, 1); + break; + case SHADER_OPCODE_TXF_CMS: + msg_lod = offset(msg_coords, bld, 3); + /* lod */ + bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)); + /* sample index */ + bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index); + msg_end = offset(msg_lod, bld, 2); + break; + default: + break; + } + + inst->opcode = op; + inst->src[0] = reg_undef; + inst->src[1] = sampler; + inst->resize_sources(2); + inst->base_mrf = message.reg; + inst->mlen = msg_end.reg - message.reg; + inst->header_size = header_size; + + /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */ + assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE); +} - fs_reg dst = inst->dst; - dst.type = BRW_REGISTER_TYPE_UW; - dst.subreg_offset = 2; - dst.stride = 2; +static bool +is_high_sampler(const struct brw_device_info *devinfo, const fs_reg &sampler) +{ + if (devinfo->gen < 8 && !devinfo->is_haswell) + return false; + + return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16; +} - high.type = BRW_REGISTER_TYPE_UW; - high.stride = 2; +static void +lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op, + fs_reg coordinate, + const fs_reg &shadow_c, + fs_reg lod, fs_reg lod2, + const fs_reg &sample_index, + const fs_reg &mcs, const fs_reg &sampler, + fs_reg offset_value, + unsigned coord_components, + unsigned grad_components) +{ + const brw_device_info *devinfo = bld.shader->devinfo; + int reg_width = bld.dispatch_width() / 8; + unsigned header_size = 0, length = 0; + fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE]; + for (unsigned i = 0; i < ARRAY_SIZE(sources); i++) + sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F); + + if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET || + offset_value.file != BAD_FILE || + is_high_sampler(devinfo, sampler)) { + /* For general texture offsets (no txf workaround), we need a header to + * put them in. Note that we're only reserving space for it in the + * message payload as it will be initialized implicitly by the + * generator. 
+ * + * TG4 needs to place its channel select in the header, for interaction + * with ARB_texture_swizzle. The sampler index is only 4-bits, so for + * larger sampler numbers we need to offset the Sampler State Pointer in + * the header. + */ + header_size = 1; + sources[0] = fs_reg(); + length++; + } - low.type = BRW_REGISTER_TYPE_UW; - low.subreg_offset = 2; - low.stride = 2; + if (shadow_c.file != BAD_FILE) { + bld.MOV(sources[length], shadow_c); + length++; + } - ibld.ADD(dst, low, high); + bool coordinate_done = false; - if (inst->conditional_mod) { - fs_reg null(retype(ibld.null_reg_f(), inst->dst.type)); - set_condmod(inst->conditional_mod, - ibld.MOV(null, inst->dst)); + /* The sampler can only meaningfully compute LOD for fragment shader + * messages. For all other stages, we change the opcode to TXL and + * hardcode the LOD to 0. + */ + if (bld.shader->stage != MESA_SHADER_FRAGMENT && + op == SHADER_OPCODE_TEX) { + op = SHADER_OPCODE_TXL; + lod = fs_reg(0.0f); + } + + /* Set up the LOD info */ + switch (op) { + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXL: + bld.MOV(sources[length], lod); + length++; + break; + case SHADER_OPCODE_TXD: + /* TXD should have been lowered in SIMD16 mode. */ + assert(bld.dispatch_width() == 8); + + /* Load dPdx and the coordinate together: + * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z + */ + for (unsigned i = 0; i < coord_components; i++) { + bld.MOV(sources[length], coordinate); + coordinate = offset(coordinate, bld, 1); + length++; + + /* For cube map array, the coordinate is (u,v,r,ai) but there are + * only derivatives for (u, v, r). + */ + if (i < grad_components) { + bld.MOV(sources[length], lod); + lod = offset(lod, bld, 1); + length++; + + bld.MOV(sources[length], lod2); + lod2 = offset(lod2, bld, 1); + length++; } } - inst->remove(block); + coordinate_done = true; + break; + case SHADER_OPCODE_TXS: + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod); + length++; + break; + case SHADER_OPCODE_TXF: + /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. + * On Gen9 they are u, v, lod, r + */ + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate); + coordinate = offset(coordinate, bld, 1); + length++; + + if (devinfo->gen >= 9) { + if (coord_components >= 2) { + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate); + coordinate = offset(coordinate, bld, 1); + } + length++; + } + + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod); + length++; + + for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) { + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate); + coordinate = offset(coordinate, bld, 1); + length++; + } + + coordinate_done = true; + break; + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_UMS: + case SHADER_OPCODE_TXF_MCS: + if (op == SHADER_OPCODE_TXF_UMS || op == SHADER_OPCODE_TXF_CMS) { + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index); + length++; + } + + if (op == SHADER_OPCODE_TXF_CMS) { + /* Data from the multisample control surface. */ + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs); + length++; + } + + /* There is no offsetting for this message; just copy in the integer + * texture coordinates. 
+ */ + for (unsigned i = 0; i < coord_components; i++) { + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate); + coordinate = offset(coordinate, bld, 1); + length++; + } + + coordinate_done = true; + break; + case SHADER_OPCODE_TG4_OFFSET: + /* gather4_po_c should have been lowered in SIMD16 mode. */ + assert(bld.dispatch_width() == 8 || shadow_c.file == BAD_FILE); + + /* More crazy intermixing */ + for (unsigned i = 0; i < 2; i++) { /* u, v */ + bld.MOV(sources[length], coordinate); + coordinate = offset(coordinate, bld, 1); + length++; + } + + for (unsigned i = 0; i < 2; i++) { /* offu, offv */ + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value); + offset_value = offset(offset_value, bld, 1); + length++; + } + + if (coord_components == 3) { /* r if present */ + bld.MOV(sources[length], coordinate); + coordinate = offset(coordinate, bld, 1); + length++; + } + + coordinate_done = true; + break; + default: + break; + } + + /* Set up the coordinate (except for cases where it was done above) */ + if (!coordinate_done) { + for (unsigned i = 0; i < coord_components; i++) { + bld.MOV(sources[length], coordinate); + coordinate = offset(coordinate, bld, 1); + length++; + } + } + + int mlen; + if (reg_width == 2) + mlen = length * reg_width - header_size; + else + mlen = length * reg_width; + + const fs_reg src_payload = fs_reg(GRF, bld.shader->alloc.allocate(mlen), + BRW_REGISTER_TYPE_F); + bld.LOAD_PAYLOAD(src_payload, sources, length, header_size); + + /* Generate the SEND. */ + inst->opcode = op; + inst->src[0] = src_payload; + inst->src[1] = sampler; + inst->resize_sources(2); + inst->base_mrf = -1; + inst->mlen = mlen; + inst->header_size = header_size; + + /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */ + assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE); +} + +static void +lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op) +{ + const brw_device_info *devinfo = bld.shader->devinfo; + const fs_reg &coordinate = inst->src[0]; + const fs_reg &shadow_c = inst->src[1]; + const fs_reg &lod = inst->src[2]; + const fs_reg &lod2 = inst->src[3]; + const fs_reg &sample_index = inst->src[4]; + const fs_reg &mcs = inst->src[5]; + const fs_reg &sampler = inst->src[6]; + const fs_reg &offset_value = inst->src[7]; + assert(inst->src[8].file == IMM && inst->src[9].file == IMM); + const unsigned coord_components = inst->src[8].fixed_hw_reg.dw1.ud; + const unsigned grad_components = inst->src[9].fixed_hw_reg.dw1.ud; + + if (devinfo->gen >= 7) { + lower_sampler_logical_send_gen7(bld, inst, op, coordinate, + shadow_c, lod, lod2, sample_index, + mcs, sampler, offset_value, + coord_components, grad_components); + } else if (devinfo->gen >= 5) { + lower_sampler_logical_send_gen5(bld, inst, op, coordinate, + shadow_c, lod, lod2, sample_index, + sampler, offset_value, + coord_components, grad_components); + } else { + lower_sampler_logical_send_gen4(bld, inst, op, coordinate, + shadow_c, lod, lod2, sampler, + coord_components, grad_components); + } +} + +/** + * Initialize the header present in some typed and untyped surface + * messages. 
+ */ +static fs_reg +emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask) +{ + fs_builder ubld = bld.exec_all().group(8, 0); + const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); + ubld.MOV(dst, fs_reg(0)); + ubld.MOV(component(dst, 7), sample_mask); + return dst; +} + +static void +lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op, + const fs_reg &sample_mask) +{ + /* Get the logical send arguments. */ + const fs_reg &addr = inst->src[0]; + const fs_reg &src = inst->src[1]; + const fs_reg &surface = inst->src[2]; + const UNUSED fs_reg &dims = inst->src[3]; + const fs_reg &arg = inst->src[4]; + + /* Calculate the total number of components of the payload. */ + const unsigned addr_sz = inst->components_read(0); + const unsigned src_sz = inst->components_read(1); + const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1); + const unsigned sz = header_sz + addr_sz + src_sz; + + /* Allocate space for the payload. */ + fs_reg *const components = new fs_reg[sz]; + const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz); + unsigned n = 0; + + /* Construct the payload. */ + if (header_sz) + components[n++] = emit_surface_header(bld, sample_mask); + + for (unsigned i = 0; i < addr_sz; i++) + components[n++] = offset(addr, bld, i); + + for (unsigned i = 0; i < src_sz; i++) + components[n++] = offset(src, bld, i); + + bld.LOAD_PAYLOAD(payload, components, sz, header_sz); + + /* Update the original instruction. */ + inst->opcode = op; + inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8; + inst->header_size = header_sz; + + inst->src[0] = payload; + inst->src[1] = surface; + inst->src[2] = arg; + inst->resize_sources(3); + + delete[] components; +} + +bool +fs_visitor::lower_logical_sends() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + const fs_builder ibld(this, block, inst); + + switch (inst->opcode) { + case FS_OPCODE_FB_WRITE_LOGICAL: + assert(stage == MESA_SHADER_FRAGMENT); + lower_fb_write_logical_send(ibld, inst, + (const brw_wm_prog_data *)prog_data, + (const brw_wm_prog_key *)key, + payload); + break; + + case SHADER_OPCODE_TEX_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX); + break; + + case SHADER_OPCODE_TXD_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD); + break; + + case SHADER_OPCODE_TXF_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF); + break; + + case SHADER_OPCODE_TXL_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL); + break; + + case SHADER_OPCODE_TXS_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS); + break; + + case FS_OPCODE_TXB_LOGICAL: + lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB); + break; + + case SHADER_OPCODE_TXF_CMS_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS); + break; + + case SHADER_OPCODE_TXF_UMS_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS); + break; + + case SHADER_OPCODE_TXF_MCS_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS); + break; + + case SHADER_OPCODE_LOD_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD); + break; + + case SHADER_OPCODE_TG4_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4); + break; + + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET); + break; + + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: + lower_surface_logical_send(ibld, inst, + 
SHADER_OPCODE_UNTYPED_SURFACE_READ, + fs_reg(0xffff)); + break; + + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + lower_surface_logical_send(ibld, inst, + SHADER_OPCODE_UNTYPED_SURFACE_WRITE, + ibld.sample_mask_reg()); + break; + + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + lower_surface_logical_send(ibld, inst, + SHADER_OPCODE_UNTYPED_ATOMIC, + ibld.sample_mask_reg()); + break; + + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + lower_surface_logical_send(ibld, inst, + SHADER_OPCODE_TYPED_SURFACE_READ, + fs_reg(0xffff)); + break; + + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + lower_surface_logical_send(ibld, inst, + SHADER_OPCODE_TYPED_SURFACE_WRITE, + ibld.sample_mask_reg()); + break; + + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: + lower_surface_logical_send(ibld, inst, + SHADER_OPCODE_TYPED_ATOMIC, + ibld.sample_mask_reg()); + break; + + default: + continue; + } + progress = true; } @@ -3239,6 +4144,265 @@ fs_visitor::lower_integer_multiplication() return progress; } +/** + * Get the closest native SIMD width supported by the hardware for instruction + * \p inst. The instruction will be left untouched by + * fs_visitor::lower_simd_width() if the returned value is equal to the + * original execution size. + */ +static unsigned +get_lowered_simd_width(const struct brw_device_info *devinfo, + const fs_inst *inst) +{ + switch (inst->opcode) { + case BRW_OPCODE_MOV: + case BRW_OPCODE_SEL: + case BRW_OPCODE_NOT: + case BRW_OPCODE_AND: + case BRW_OPCODE_OR: + case BRW_OPCODE_XOR: + case BRW_OPCODE_SHR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_ASR: + case BRW_OPCODE_CMP: + case BRW_OPCODE_CMPN: + case BRW_OPCODE_CSEL: + case BRW_OPCODE_F32TO16: + case BRW_OPCODE_F16TO32: + case BRW_OPCODE_BFREV: + case BRW_OPCODE_BFE: + case BRW_OPCODE_BFI1: + case BRW_OPCODE_BFI2: + case BRW_OPCODE_ADD: + case BRW_OPCODE_MUL: + case BRW_OPCODE_AVG: + case BRW_OPCODE_FRC: + case BRW_OPCODE_RNDU: + case BRW_OPCODE_RNDD: + case BRW_OPCODE_RNDE: + case BRW_OPCODE_RNDZ: + case BRW_OPCODE_LZD: + case BRW_OPCODE_FBH: + case BRW_OPCODE_FBL: + case BRW_OPCODE_CBIT: + case BRW_OPCODE_SAD2: + case BRW_OPCODE_MAD: + case BRW_OPCODE_LRP: + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: { + /* According to the PRMs: + * "A. In Direct Addressing mode, a source cannot span more than 2 + * adjacent GRF registers. + * B. A destination cannot span more than 2 adjacent GRF registers." + * + * Look for the source or destination with the largest register region + * which is the one that is going to limit the overal execution size of + * the instruction due to this rule. + */ + unsigned reg_count = inst->regs_written; + + for (unsigned i = 0; i < inst->sources; i++) + reg_count = MAX2(reg_count, (unsigned)inst->regs_read(i)); + + /* Calculate the maximum execution size of the instruction based on the + * factor by which it goes over the hardware limit of 2 GRFs. + */ + return inst->exec_size / DIV_ROUND_UP(reg_count, 2); + } + case SHADER_OPCODE_MULH: + /* MULH is lowered to the MUL/MACH sequence using the accumulator, which + * is 8-wide on Gen7+. + */ + return (devinfo->gen >= 7 ? 8 : inst->exec_size); + + case FS_OPCODE_FB_WRITE_LOGICAL: + /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them + * here. 
+ */ + assert(devinfo->gen != 6 || inst->src[3].file == BAD_FILE || + inst->exec_size == 8); + /* Dual-source FB writes are unsupported in SIMD16 mode. */ + return (inst->src[1].file != BAD_FILE ? 8 : inst->exec_size); + + case SHADER_OPCODE_TXD_LOGICAL: + /* TXD is unsupported in SIMD16 mode. */ + return 8; + + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: { + /* gather4_po_c is unsupported in SIMD16 mode. */ + const fs_reg &shadow_c = inst->src[1]; + return (shadow_c.file != BAD_FILE ? 8 : inst->exec_size); + } + case SHADER_OPCODE_TXL_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: { + /* Gen4 doesn't have SIMD8 non-shadow-compare bias/LOD instructions, and + * Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16 + * mode because the message exceeds the maximum length of 11. + */ + const fs_reg &shadow_c = inst->src[1]; + if (devinfo->gen == 4 && shadow_c.file == BAD_FILE) + return 16; + else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE) + return 8; + else + return inst->exec_size; + } + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD + * messages. Use SIMD16 instead. + */ + if (devinfo->gen == 4) + return 16; + else + return inst->exec_size; + + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + return 8; + + default: + return inst->exec_size; + } +} + +/** + * The \p rows array of registers represents a \p num_rows by \p num_columns + * matrix in row-major order, write it in column-major order into the register + * passed as destination. \p stride gives the separation between matrix + * elements in the input in fs_builder::dispatch_width() units. + */ +static void +emit_transpose(const fs_builder &bld, + const fs_reg &dst, const fs_reg *rows, + unsigned num_rows, unsigned num_columns, unsigned stride) +{ + fs_reg *const components = new fs_reg[num_rows * num_columns]; + + for (unsigned i = 0; i < num_columns; ++i) { + for (unsigned j = 0; j < num_rows; ++j) + components[num_rows * i + j] = offset(rows[j], bld, stride * i); + } + + bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0); + + delete[] components; +} + +bool +fs_visitor::lower_simd_width() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + const unsigned lower_width = get_lowered_simd_width(devinfo, inst); + + if (lower_width != inst->exec_size) { + /* Builder matching the original instruction. We may also need to + * emit an instruction of width larger than the original, set the + * execution size of the builder to the highest of both for now so + * we're sure that both cases can be handled. + */ + const fs_builder ibld = bld.at(block, inst) + .exec_all(inst->force_writemask_all) + .group(MAX2(inst->exec_size, lower_width), + inst->force_sechalf); + + /* Split the copies in chunks of the execution width of either the + * original or the lowered instruction, whichever is lower. + */ + const unsigned copy_width = MIN2(lower_width, inst->exec_size); + const unsigned n = inst->exec_size / copy_width; + const unsigned dst_size = inst->regs_written * REG_SIZE / + inst->dst.component_size(inst->exec_size); + fs_reg dsts[4]; + + assert(n > 0 && n <= ARRAY_SIZE(dsts) && + !inst->writes_accumulator && !inst->mlen); + + for (unsigned i = 0; i < n; i++) { + /* Emit a copy of the original instruction with the lowered width. 
+ * If the EOT flag was set throw it away except for the last + * instruction to avoid killing the thread prematurely. + */ + fs_inst split_inst = *inst; + split_inst.exec_size = lower_width; + split_inst.eot = inst->eot && i == n - 1; + + /* Select the correct channel enables for the i-th group, then + * transform the sources and destination and emit the lowered + * instruction. + */ + const fs_builder lbld = ibld.group(lower_width, i); + + for (unsigned j = 0; j < inst->sources; j++) { + if (inst->src[j].file != BAD_FILE && + !is_uniform(inst->src[j])) { + /* Get the i-th copy_width-wide chunk of the source. */ + const fs_reg src = horiz_offset(inst->src[j], copy_width * i); + const unsigned src_size = inst->components_read(j); + + /* Use a trivial transposition to copy one every n + * copy_width-wide components of the register into a + * temporary passed as source to the lowered instruction. + */ + split_inst.src[j] = lbld.vgrf(inst->src[j].type, src_size); + emit_transpose(lbld.group(copy_width, 0), + split_inst.src[j], &src, 1, src_size, n); + } + } + + if (inst->regs_written) { + /* Allocate enough space to hold the result of the lowered + * instruction and fix up the number of registers written. + */ + split_inst.dst = dsts[i] = + lbld.vgrf(inst->dst.type, dst_size); + split_inst.regs_written = + DIV_ROUND_UP(inst->regs_written * lower_width, + inst->exec_size); + } + + lbld.emit(split_inst); + } + + if (inst->regs_written) { + /* Distance between useful channels in the temporaries, skipping + * garbage if the lowered instruction is wider than the original. + */ + const unsigned m = lower_width / copy_width; + + /* Interleave the components of the result from the lowered + * instructions. We need to set exec_all() when copying more than + * one half per component, because LOAD_PAYLOAD (in terms of which + * emit_transpose is implemented) can only use the same channel + * enable signals for all of its non-header sources. + */ + emit_transpose(ibld.exec_all(inst->exec_size > copy_width) + .group(copy_width, 0), + inst->dst, dsts, n, dst_size, m); + } + + inst->remove(block); + progress = true; + } + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + void fs_visitor::dump_instructions() { @@ -3316,9 +4480,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) switch (inst->dst.file) { case GRF: fprintf(file, "vgrf%d", inst->dst.reg); - if (inst->dst.width != dispatch_width) - fprintf(file, "@%d", inst->dst.width); - if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 || + if (alloc.sizes[inst->dst.reg] != inst->regs_written || inst->dst.subreg_offset) fprintf(file, "+%d.%d", inst->dst.reg_offset, inst->dst.subreg_offset); @@ -3376,9 +4538,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) switch (inst->src[i].file) { case GRF: fprintf(file, "vgrf%d", inst->src[i].reg); - if (inst->src[i].width != dispatch_width) - fprintf(file, "@%d", inst->src[i].width); - if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 || + if (alloc.sizes[inst->src[i].reg] != (unsigned)inst->regs_read(i) || inst->src[i].subreg_offset) fprintf(file, "+%d.%d", inst->src[i].reg_offset, inst->src[i].subreg_offset); @@ -3655,9 +4815,11 @@ fs_visitor::optimize() * Ideally optimization passes wouldn't be part of the visitor so they * wouldn't have access to bld at all, but they do, so just in case some * pass forgets to ask for a location explicitly set it to NULL here to - * make it trip. + * make it trip. 
The dispatch width is initialized to a bogus value to + * make sure that optimizations set the execution controls explicitly to + * match the code they are manipulating instead of relying on the defaults. */ - bld = bld.at(NULL, NULL); + bld = fs_builder(this, 64); split_virtual_grfs(); @@ -3690,9 +4852,13 @@ fs_visitor::optimize() backend_shader::dump_instructions(filename); } - bool progress; + bool progress = false; int iteration = 0; int pass_num = 0; + + OPT(lower_simd_width); + OPT(lower_logical_sends); + do { progress = false; pass_num = 0; @@ -3837,7 +5003,9 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes) if (failed) return false; - emit_urb_writes(clip_planes); + compute_clip_distance(clip_planes); + + emit_urb_writes(); if (shader_time_index >= 0) emit_shader_time_end(); diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 243baf688de..975183e990d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -62,6 +62,27 @@ namespace brw { class fs_live_variables; } +static inline fs_reg +offset(fs_reg reg, const brw::fs_builder& bld, unsigned delta) +{ + switch (reg.file) { + case BAD_FILE: + break; + case GRF: + case MRF: + case HW_REG: + case ATTR: + return byte_offset(reg, + delta * reg.component_size(bld.dispatch_width())); + case UNIFORM: + reg.reg_offset += delta; + break; + case IMM: + assert(delta == 0); + } + return reg; +} + /** * The fragment shader front-end. * @@ -161,7 +182,9 @@ public: void no16(const char *msg); void lower_uniform_pull_constant_loads(); bool lower_load_payload(); + bool lower_logical_sends(); bool lower_integer_multiplication(); + bool lower_simd_width(); bool opt_combine_constants(); void emit_dummy_fs(); @@ -185,27 +208,6 @@ public: void compute_sample_position(fs_reg dst, fs_reg int_sample_pos); fs_reg rescale_texcoord(fs_reg coordinate, int coord_components, bool is_rect, uint32_t sampler, int texunit); - fs_inst *emit_texture_gen4(ir_texture_opcode op, fs_reg dst, - fs_reg coordinate, int coord_components, - fs_reg shadow_comp, - fs_reg lod, fs_reg lod2, int grad_components, - uint32_t sampler); - fs_inst *emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst, - fs_reg coordinate, int vector_elements, - fs_reg shadow_c, fs_reg lod, - uint32_t sampler); - fs_inst *emit_texture_gen5(ir_texture_opcode op, fs_reg dst, - fs_reg coordinate, int coord_components, - fs_reg shadow_comp, - fs_reg lod, fs_reg lod2, int grad_components, - fs_reg sample_index, uint32_t sampler, - bool has_offset); - fs_inst *emit_texture_gen7(ir_texture_opcode op, fs_reg dst, - fs_reg coordinate, int coord_components, - fs_reg shadow_comp, - fs_reg lod, fs_reg lod2, int grad_components, - fs_reg sample_index, fs_reg mcs, fs_reg sampler, - fs_reg offset_value); void emit_texture(ir_texture_opcode op, const glsl_type *dest_type, fs_reg coordinate, int components, @@ -220,9 +222,10 @@ public: uint32_t sampler, fs_reg sampler_reg, int texunit); - fs_reg emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler); + fs_reg emit_mcs_fetch(const fs_reg &coordinate, unsigned components, + const fs_reg &sampler); void emit_gen6_gather_wa(uint8_t wa, fs_reg dst); - void resolve_source_modifiers(fs_reg *src); + fs_reg resolve_source_modifiers(const fs_reg &src); void emit_discard_jump(); bool try_replace_with_sel(); bool opt_peephole_sel(); @@ -249,6 +252,10 @@ public: void nir_emit_block(nir_block *block); void nir_emit_instr(nir_instr *instr); void nir_emit_alu(const brw::fs_builder &bld, 
nir_alu_instr *instr); + void nir_emit_load_const(const brw::fs_builder &bld, + nir_load_const_instr *instr); + void nir_emit_undef(const brw::fs_builder &bld, + nir_ssa_undef_instr *instr); void nir_emit_intrinsic(const brw::fs_builder &bld, nir_intrinsic_instr *instr); void nir_emit_texture(const brw::fs_builder &bld, @@ -257,21 +264,19 @@ public: nir_jump_instr *instr); fs_reg get_nir_src(nir_src src); fs_reg get_nir_dest(nir_dest dest); + fs_reg get_nir_image_deref(const nir_deref_var *deref); void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst, unsigned wr_mask); bool optimize_frontfacing_ternary(nir_alu_instr *instr, const fs_reg &result); - void setup_color_payload(fs_reg *dst, fs_reg color, unsigned components, - unsigned exec_size, bool use_2nd_half); void emit_alpha_test(); fs_inst *emit_single_fb_write(const brw::fs_builder &bld, fs_reg color1, fs_reg color2, - fs_reg src0_alpha, unsigned components, - unsigned exec_size, bool use_2nd_half = false); + fs_reg src0_alpha, unsigned components); void emit_fb_writes(); - void emit_urb_writes(gl_clip_plane *clip_planes); + void emit_urb_writes(); void emit_cs_terminate(); void emit_barrier(); @@ -282,16 +287,13 @@ public: int shader_time_subindex, fs_reg value); - void emit_untyped_atomic(unsigned atomic_op, unsigned surf_index, - fs_reg dst, fs_reg offset, fs_reg src0, - fs_reg src1); - - void emit_untyped_surface_read(unsigned surf_index, fs_reg dst, - fs_reg offset); - fs_reg get_timestamp(const brw::fs_builder &bld); struct brw_reg interp_reg(int location, int channel); + + virtual void setup_vector_uniform_values(const gl_constant_value *values, + unsigned n); + int implied_mrf_writes(fs_inst *inst); virtual void dump_instructions(); @@ -345,7 +347,7 @@ public: unsigned max_grf; fs_reg *nir_locals; - fs_reg *nir_globals; + fs_reg *nir_ssa_values; fs_reg nir_inputs; fs_reg nir_outputs; fs_reg *nir_system_values; @@ -359,7 +361,7 @@ public: fs_reg result; /** Register numbers for thread payload fields. */ - struct { + struct thread_payload { uint8_t source_depth_reg; uint8_t source_w_reg; uint8_t aa_dest_stencil_reg; @@ -468,10 +470,6 @@ private: struct brw_reg msg_data, unsigned msg_type); - void generate_set_omask(fs_inst *inst, - struct brw_reg dst, - struct brw_reg sample_mask); - void generate_set_sample_id(fs_inst *inst, struct brw_reg dst, struct brw_reg src0, diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h index 58ac5980da5..34545eaa0fb 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_builder.h +++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h @@ -64,6 +64,22 @@ namespace brw { } /** + * Construct an fs_builder that inserts instructions into \p shader + * before instruction \p inst in basic block \p block. The default + * execution controls and debug annotation are initialized from the + * instruction passed as argument. + */ + fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) : + shader(shader), block(block), cursor(inst), + _dispatch_width(inst->exec_size), + _group(inst->force_sechalf ? 8 : 0), + force_writemask_all(inst->force_writemask_all) + { + annotation.str = inst->annotation; + annotation.ir = inst->ir; + } + + /** * Construct an fs_builder that inserts instructions before \p cursor in * basic block \p block, inheriting other code generation parameters * from this. 
@@ -99,8 +115,8 @@ namespace brw { fs_builder group(unsigned n, unsigned i) const { - assert(n <= dispatch_width() && - i < dispatch_width() / n); + assert(force_writemask_all || + (n <= dispatch_width() && i < dispatch_width() / n)); fs_builder bld = *this; bld._dispatch_width = n; bld._group += i * n; @@ -160,10 +176,15 @@ namespace brw { dst_reg vgrf(enum brw_reg_type type, unsigned n = 1) const { - return dst_reg(GRF, shader->alloc.allocate( - DIV_ROUND_UP(n * type_sz(type) * dispatch_width(), - REG_SIZE)), - type, dispatch_width()); + assert(dispatch_width() <= 32); + + if (n > 0) + return dst_reg(GRF, shader->alloc.allocate( + DIV_ROUND_UP(n * type_sz(type) * dispatch_width(), + REG_SIZE)), + type); + else + return retype(null_reg_ud(), type); } /** @@ -235,7 +256,7 @@ namespace brw { instruction * emit(enum opcode opcode, const dst_reg &dst) const { - return emit(instruction(opcode, dst)); + return emit(instruction(opcode, dispatch_width(), dst)); } /** @@ -253,11 +274,11 @@ namespace brw { case SHADER_OPCODE_SIN: case SHADER_OPCODE_COS: return fix_math_instruction( - emit(instruction(opcode, dst.width, dst, + emit(instruction(opcode, dispatch_width(), dst, fix_math_operand(src0)))); default: - return emit(instruction(opcode, dst.width, dst, src0)); + return emit(instruction(opcode, dispatch_width(), dst, src0)); } } @@ -273,12 +294,12 @@ namespace brw { case SHADER_OPCODE_INT_QUOTIENT: case SHADER_OPCODE_INT_REMAINDER: return fix_math_instruction( - emit(instruction(opcode, dst.width, dst, + emit(instruction(opcode, dispatch_width(), dst, fix_math_operand(src0), fix_math_operand(src1)))); default: - return emit(instruction(opcode, dst.width, dst, src0, src1)); + return emit(instruction(opcode, dispatch_width(), dst, src0, src1)); } } @@ -295,22 +316,35 @@ namespace brw { case BRW_OPCODE_BFI2: case BRW_OPCODE_MAD: case BRW_OPCODE_LRP: - return emit(instruction(opcode, dst.width, dst, + return emit(instruction(opcode, dispatch_width(), dst, fix_3src_operand(src0), fix_3src_operand(src1), fix_3src_operand(src2))); default: - return emit(instruction(opcode, dst.width, dst, src0, src1, src2)); + return emit(instruction(opcode, dispatch_width(), dst, + src0, src1, src2)); } } /** + * Create and insert an instruction with a variable number of sources + * into the program. + */ + instruction * + emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[], + unsigned n) const + { + return emit(instruction(opcode, dispatch_width(), dst, srcs, n)); + } + + /** * Insert a preallocated instruction into the program. */ instruction * emit(instruction *inst) const { + assert(inst->exec_size <= 32); assert(inst->exec_size == dispatch_width() || force_writemask_all); assert(_group == 0 || _group == 8); @@ -349,17 +383,19 @@ namespace brw { } /** - * Copy any live channel from \p src to the first channel of \p dst. + * Copy any live channel from \p src to the first channel of the result. 
*/ - void - emit_uniformize(const dst_reg &dst, const src_reg &src) const + src_reg + emit_uniformize(const src_reg &src) const { const fs_builder ubld = exec_all(); - const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD); + const dst_reg chan_index = component(vgrf(BRW_REGISTER_TYPE_UD), 0); + const dst_reg dst = component(vgrf(src.type), 0); + + ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index); + ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index); - ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, component(chan_index, 0)); - ubld.emit(SHADER_OPCODE_BROADCAST, component(dst, 0), - src, component(chan_index, 0)); + return src_reg(dst); } /** @@ -515,20 +551,10 @@ namespace brw { LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src, unsigned sources, unsigned header_size) const { - assert(dst.width % 8 == 0); - instruction *inst = emit(instruction(SHADER_OPCODE_LOAD_PAYLOAD, - dst.width, dst, src, sources)); + instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources); inst->header_size = header_size; - - for (unsigned i = 0; i < header_size; i++) - assert(src[i].file != GRF || - src[i].width * type_sz(src[i].type) == 32); - inst->regs_written = header_size; - - for (unsigned i = header_size; i < sources; ++i) - assert(src[i].file != GRF || - src[i].width == dst.width); - inst->regs_written += (sources - header_size) * (dst.width / 8); + inst->regs_written = header_size + + (sources - header_size) * (dispatch_width() / 8); return inst; } @@ -626,8 +652,8 @@ namespace brw { inst->resize_sources(1); inst->src[0] = src0; - at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type, - dispatch_width()), src1); + at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), + src1); } } diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp index d0f61222e5a..a8883a35ef2 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp @@ -243,6 +243,7 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir) case ir_unop_find_msb: case ir_unop_find_lsb: case ir_unop_saturate: + case ir_unop_subroutine_to_int: for (i = 0; i < vector_elements; i++) { ir_rvalue *op0 = get_element(op_var[0], i); diff --git a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp index 0af5a915c9f..c182232285e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp @@ -277,7 +277,7 @@ fs_visitor::opt_combine_constants() */ exec_node *n = (imm->inst ? 
imm->inst : imm->block->last_non_control_flow_inst()->next); - const fs_builder ibld = bld.at(imm->block, n).exec_all(); + const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0); ibld.MOV(reg, fs_reg(imm->val)); imm->reg = reg.reg; diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp index c92aae4b1d6..5445ad55670 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp @@ -339,6 +339,14 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) if (entry->src.stride * inst->src[arg].stride > 4) return false; + /* Bail if the instruction type is larger than the execution type of the + * copy, what implies that each channel is reading multiple channels of the + * destination of the copy, and simply replacing the sources would give a + * program with different semantics. + */ + if (type_sz(entry->dst.type) < type_sz(inst->src[arg].type)) + return false; + /* Bail if the result of composing both strides cannot be expressed * as another stride. This avoids, for example, trying to transform * this: @@ -388,17 +396,14 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) switch (entry->src.file) { case UNIFORM: - assert(entry->src.width == 1); case BAD_FILE: case HW_REG: - inst->src[arg].width = entry->src.width; inst->src[arg].reg_offset = entry->src.reg_offset; inst->src[arg].subreg_offset = entry->src.subreg_offset; break; case ATTR: case GRF: { - assert(entry->src.width % inst->src[arg].width == 0); /* In this case, we'll just leave the width alone. The source * register could have different widths depending on how it is * being used. For instance, if only half of the register was @@ -529,6 +534,7 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) case BRW_OPCODE_MACH: case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: case BRW_OPCODE_ADD: case BRW_OPCODE_OR: case BRW_OPCODE_AND: @@ -715,7 +721,6 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block, acp_entry *entry = ralloc(copy_prop_ctx, acp_entry); entry->dst = inst->dst; entry->dst.reg_offset = offset; - entry->dst.width = effective_width; entry->src = inst->src[i]; entry->regs_written = regs_written; entry->opcode = inst->opcode; diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp index 70f0217b93d..c7628dcc2f4 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp @@ -61,6 +61,7 @@ is_expression(const fs_visitor *v, const fs_inst *const inst) case BRW_OPCODE_CMPN: case BRW_OPCODE_ADD: case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: case BRW_OPCODE_FRC: case BRW_OPCODE_RNDU: case BRW_OPCODE_RNDD: @@ -179,9 +180,7 @@ static void create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate) { int written = inst->regs_written; - int dst_width = inst->dst.width / 8; - const fs_builder ubld = bld.group(inst->exec_size, inst->force_sechalf) - .exec_all(inst->force_writemask_all); + int dst_width = inst->exec_size / 8; fs_inst *copy; if (written > dst_width) { @@ -200,16 +199,15 @@ create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate) payload = ralloc_array(bld.shader->mem_ctx, fs_reg, sources); for (int i = 0; i < header_size; i++) { payload[i] = src; - payload[i].width = 8; src.reg_offset++; } for (int i = header_size; i < sources; i++) { payload[i] = src; - src = offset(src, 
1); + src = offset(src, bld, 1); } - copy = ubld.LOAD_PAYLOAD(inst->dst, payload, sources, header_size); + copy = bld.LOAD_PAYLOAD(inst->dst, payload, sources, header_size); } else { - copy = ubld.MOV(inst->dst, src); + copy = bld.MOV(inst->dst, src); copy->src[0].negate = negate; } assert(copy->regs_written == written); @@ -259,15 +257,14 @@ fs_visitor::opt_cse_local(bblock_t *block) */ bool no_existing_temp = entry->tmp.file == BAD_FILE; if (no_existing_temp && !entry->generator->dst.is_null()) { + const fs_builder ibld = fs_builder(this, block, entry->generator) + .at(block, entry->generator->next); int written = entry->generator->regs_written; - assert((written * 8) % entry->generator->dst.width == 0); entry->tmp = fs_reg(GRF, alloc.allocate(written), - entry->generator->dst.type, - entry->generator->dst.width); + entry->generator->dst.type); - create_copy_instr(bld.at(block, entry->generator->next), - entry->generator, entry->tmp, false); + create_copy_instr(ibld, entry->generator, entry->tmp, false); entry->generator->dst = entry->tmp; } @@ -275,10 +272,10 @@ fs_visitor::opt_cse_local(bblock_t *block) /* dest <- temp */ if (!inst->dst.is_null()) { assert(inst->regs_written == entry->generator->regs_written); - assert(inst->dst.width == entry->generator->dst.width); assert(inst->dst.type == entry->tmp.type); + const fs_builder ibld(this, block, inst); - create_copy_instr(bld.at(block, inst), inst, entry->tmp, negate); + create_copy_instr(ibld, inst, entry->tmp, negate); } /* Set our iterator so that next time through the loop inst->next diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 2ed0bac6fd9..c86ca043b63 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -48,7 +48,7 @@ static uint32_t brw_file_from_reg(fs_reg *reg) } static struct brw_reg -brw_reg_from_fs_reg(fs_reg *reg) +brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg) { struct brw_reg brw_reg; @@ -57,10 +57,10 @@ brw_reg_from_fs_reg(fs_reg *reg) case MRF: if (reg->stride == 0) { brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, 0); - } else if (reg->width < 8) { + } else if (inst->exec_size < 8) { brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0); - brw_reg = stride(brw_reg, reg->width * reg->stride, - reg->width, reg->stride); + brw_reg = stride(brw_reg, inst->exec_size * reg->stride, + inst->exec_size, reg->stride); } else { /* From the Haswell PRM: * @@ -79,6 +79,10 @@ brw_reg_from_fs_reg(fs_reg *reg) brw_reg = byte_offset(brw_reg, reg->subreg_offset); break; case IMM: + assert(reg->stride == ((reg->type == BRW_REGISTER_TYPE_V || + reg->type == BRW_REGISTER_TYPE_UV || + reg->type == BRW_REGISTER_TYPE_VF) ? 
1 : 0)); + switch (reg->type) { case BRW_REGISTER_TYPE_F: brw_reg = brw_imm_f(reg->fixed_hw_reg.dw1.f); @@ -217,11 +221,11 @@ fs_generator::fire_fb_write(fs_inst *inst, if (inst->opcode == FS_OPCODE_REP_FB_WRITE) msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED; else if (prog_data->dual_src_blend) { - if (dispatch_width == 8 || !inst->eot) + if (!inst->force_sechalf) msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01; else msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23; - } else if (dispatch_width == 16) + } else if (inst->exec_size == 16) msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; else msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01; @@ -414,7 +418,7 @@ fs_generator::generate_blorp_fb_write(fs_inst *inst) brw_fb_WRITE(p, 16 /* dispatch_width */, brw_message_reg(inst->base_mrf), - brw_reg_from_fs_reg(&inst->src[0]), + brw_reg_from_fs_reg(inst, &inst->src[0]), BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE, inst->target, inst->mlen, @@ -651,7 +655,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src /* Note that G45 and older determines shadow compare and dispatch width * from message length for most messages. */ - if (dispatch_width == 8) { + if (inst->exec_size == 8) { msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; if (inst->shadow_compare) { assert(inst->mlen == 6); @@ -670,7 +674,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src break; case FS_OPCODE_TXB: if (inst->shadow_compare) { - assert(dispatch_width == 8); + assert(inst->exec_size == 8); assert(inst->mlen == 6); msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE; } else { @@ -681,7 +685,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src break; case SHADER_OPCODE_TXL: if (inst->shadow_compare) { - assert(dispatch_width == 8); + assert(inst->exec_size == 8); assert(inst->mlen == 6); msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE; } else { @@ -692,7 +696,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src break; case SHADER_OPCODE_TXD: /* There is no sample_d_c message; comparisons are done manually */ - assert(dispatch_width == 8); + assert(inst->exec_size == 8); assert(inst->mlen == 7 || inst->mlen == 10); msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS; break; @@ -1054,7 +1058,6 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst, struct brw_reg index, struct brw_reg offset) { - assert(inst->mlen == 0); assert(index.type == BRW_REGISTER_TYPE_UD); assert(offset.file == BRW_GENERAL_REGISTER_FILE); @@ -1069,12 +1072,10 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst, struct brw_reg src = offset; bool header_present = false; - int mlen = 1; if (devinfo->gen >= 9) { /* Skylake requires a message header in order to use SIMD4x2 mode. 
*/ - src = retype(brw_vec4_grf(offset.nr - 1, 0), BRW_REGISTER_TYPE_UD); - mlen = 2; + src = retype(brw_vec4_grf(offset.nr, 0), BRW_REGISTER_TYPE_UD); header_present = true; brw_push_insn_state(p); @@ -1105,7 +1106,7 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst, 0, /* LD message ignores sampler unit */ GEN5_SAMPLER_MESSAGE_SAMPLE_LD, 1, /* rlen */ - mlen, + inst->mlen, header_present, BRW_SAMPLER_SIMD_MODE_SIMD4X2, 0); @@ -1135,7 +1136,7 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst, 0, /* LD message ignores sampler unit */ GEN5_SAMPLER_MESSAGE_SAMPLE_LD, 1, /* rlen */ - mlen, + inst->mlen, header_present, BRW_SAMPLER_SIMD_MODE_SIMD4X2, 0); @@ -1363,37 +1364,6 @@ fs_generator::generate_set_simd4x2_offset(fs_inst *inst, brw_pop_insn_state(p); } -/* Sets vstride=16, width=8, hstride=2 or vstride=0, width=1, hstride=0 - * (when mask is passed as a uniform) of register mask before moving it - * to register dst. - */ -void -fs_generator::generate_set_omask(fs_inst *inst, - struct brw_reg dst, - struct brw_reg mask) -{ - bool stride_8_8_1 = - (mask.vstride == BRW_VERTICAL_STRIDE_8 && - mask.width == BRW_WIDTH_8 && - mask.hstride == BRW_HORIZONTAL_STRIDE_1); - - bool stride_0_1_0 = has_scalar_region(mask); - - assert(stride_8_8_1 || stride_0_1_0); - assert(dst.type == BRW_REGISTER_TYPE_UW); - - brw_push_insn_state(p); - brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - - if (stride_8_8_1) { - brw_MOV(p, dst, retype(stride(mask, 16, 8, 2), dst.type)); - } else if (stride_0_1_0) { - brw_MOV(p, dst, retype(mask, dst.type)); - } - brw_pop_insn_state(p); -} - /* Sets vstride=1, width=4, hstride=0 of register src1 during * the ADD instruction. */ @@ -1563,7 +1533,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset); for (unsigned int i = 0; i < inst->sources; i++) { - src[i] = brw_reg_from_fs_reg(&inst->src[i]); + src[i] = brw_reg_from_fs_reg(inst, &inst->src[i]); /* The accumulator result appears to get used for the * conditional modifier generation. When negating a UD @@ -1575,7 +1545,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) inst->src[i].type != BRW_REGISTER_TYPE_UD || !inst->src[i].negate); } - dst = brw_reg_from_fs_reg(&inst->dst); + dst = brw_reg_from_fs_reg(inst, &inst->dst); brw_set_default_predicate_control(p, inst->predicate); brw_set_default_predicate_inverse(p, inst->predicate_inverse); @@ -1604,7 +1574,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) /* If the instruction writes to more than one register, it needs to * be a "compressed" instruction on Gen <= 5. */ - if (inst->exec_size * inst->dst.stride * type_sz(inst->dst.type) > 32) + if (inst->dst.component_size(inst->exec_size) > REG_SIZE) brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); else brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); @@ -1872,7 +1842,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) break; case BRW_OPCODE_DO: - brw_DO(p, BRW_EXECUTE_8); + brw_DO(p, dispatch_width == 16 ? 
BRW_EXECUTE_16 : BRW_EXECUTE_8); break; case BRW_OPCODE_BREAK: @@ -2019,19 +1989,15 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) break; case SHADER_OPCODE_UNTYPED_ATOMIC: - assert(src[1].file == BRW_IMMEDIATE_VALUE && - src[2].file == BRW_IMMEDIATE_VALUE); + assert(src[2].file == BRW_IMMEDIATE_VALUE); brw_untyped_atomic(p, dst, src[0], src[1], src[2].dw1.ud, inst->mlen, !inst->dst.is_null()); - brw_mark_surface_used(prog_data, src[1].dw1.ud); break; case SHADER_OPCODE_UNTYPED_SURFACE_READ: - assert(src[1].file == BRW_IMMEDIATE_VALUE && - src[2].file == BRW_IMMEDIATE_VALUE); + assert(src[2].file == BRW_IMMEDIATE_VALUE); brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen, src[2].dw1.ud); - brw_mark_surface_used(prog_data, src[1].dw1.ud); break; case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: @@ -2073,10 +2039,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) brw_broadcast(p, dst, src[0], src[1]); break; - case FS_OPCODE_SET_OMASK: - generate_set_omask(inst, dst, src[0]); - break; - case FS_OPCODE_SET_SAMPLE_ID: generate_set_sample_id(inst, dst, src[0], src[1]); break; diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp index 502161d5128..19aec92fad1 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp @@ -204,27 +204,9 @@ fs_live_variables::compute_live_variables() while (cont) { cont = false; - foreach_block (block, cfg) { + foreach_block_reverse (block, cfg) { struct block_data *bd = &block_data[block->num]; - /* Update livein */ - for (int i = 0; i < bitset_words; i++) { - BITSET_WORD new_livein = (bd->use[i] | - (bd->liveout[i] & - ~bd->def[i])); - if (new_livein & ~bd->livein[i]) { - bd->livein[i] |= new_livein; - cont = true; - } - } - BITSET_WORD new_livein = (bd->flag_use[0] | - (bd->flag_liveout[0] & - ~bd->flag_def[0])); - if (new_livein & ~bd->flag_livein[0]) { - bd->flag_livein[0] |= new_livein; - cont = true; - } - /* Update liveout */ foreach_list_typed(bblock_link, child_link, link, &block->children) { struct block_data *child_bd = &block_data[child_link->block->num]; @@ -244,6 +226,24 @@ fs_live_variables::compute_live_variables() cont = true; } } + + /* Update livein */ + for (int i = 0; i < bitset_words; i++) { + BITSET_WORD new_livein = (bd->use[i] | + (bd->liveout[i] & + ~bd->def[i])); + if (new_livein & ~bd->livein[i]) { + bd->livein[i] |= new_livein; + cont = true; + } + } + BITSET_WORD new_livein = (bd->flag_use[0] | + (bd->flag_liveout[0] & + ~bd->flag_def[0])); + if (new_livein & ~bd->flag_livein[0]) { + bd->flag_livein[0] |= new_livein; + cont = true; + } } } } diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 4d98b048433..93a36cc03bf 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -24,8 +24,10 @@ #include "glsl/ir.h" #include "glsl/ir_optimization.h" #include "glsl/nir/glsl_to_nir.h" +#include "main/shaderimage.h" #include "program/prog_to_nir.h" #include "brw_fs.h" +#include "brw_fs_surface_builder.h" #include "brw_nir.h" using namespace brw; @@ -38,31 +40,11 @@ fs_visitor::emit_nir_code() /* emit the arrays used for inputs and outputs - load/store intrinsics will * be converted to reads/writes of these arrays */ - - if (nir->num_inputs > 0) { - nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs); - nir_setup_inputs(nir); - } - - if (nir->num_outputs > 0) { - 
nir_outputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_outputs); - nir_setup_outputs(nir); - } - - if (nir->num_uniforms > 0) { - nir_setup_uniforms(nir); - } - + nir_setup_inputs(nir); + nir_setup_outputs(nir); + nir_setup_uniforms(nir); nir_emit_system_values(nir); - nir_globals = ralloc_array(mem_ctx, fs_reg, nir->reg_alloc); - foreach_list_typed(nir_register, reg, node, &nir->registers) { - unsigned array_elems = - reg->num_array_elems == 0 ? 1 : reg->num_array_elems; - unsigned size = array_elems * reg->num_components; - nir_globals[reg->index] = bld.vgrf(BRW_REGISTER_TYPE_F, size); - } - /* get the main function and emit it */ nir_foreach_overload(nir, overload) { assert(strcmp(overload->function->name, "main") == 0); @@ -74,9 +56,11 @@ fs_visitor::emit_nir_code() void fs_visitor::nir_setup_inputs(nir_shader *shader) { + nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, shader->num_inputs); + foreach_list_typed(nir_variable, var, node, &shader->inputs) { enum brw_reg_type type = brw_type_for_base_type(var->type); - fs_reg input = offset(nir_inputs, var->data.driver_location); + fs_reg input = offset(nir_inputs, bld, var->data.driver_location); fs_reg reg; switch (stage) { @@ -91,25 +75,35 @@ fs_visitor::nir_setup_inputs(nir_shader *shader) * So, we need to copy from fs_reg(ATTR, var->location) to * offset(nir_inputs, var->data.driver_location). */ - unsigned components = var->type->without_array()->components(); + const glsl_type *const t = var->type->without_array(); + const unsigned components = t->components(); + const unsigned cols = t->matrix_columns; + const unsigned elts = t->vector_elements; unsigned array_length = var->type->is_array() ? var->type->length : 1; for (unsigned i = 0; i < array_length; i++) { - for (unsigned j = 0; j < components; j++) { - bld.MOV(retype(offset(input, components * i + j), type), - offset(fs_reg(ATTR, var->data.location + i, type), j)); + for (unsigned j = 0; j < cols; j++) { + for (unsigned k = 0; k < elts; k++) { + bld.MOV(offset(retype(input, type), bld, + components * i + elts * j + k), + offset(fs_reg(ATTR, var->data.location + i, type), + bld, 4 * j + k)); + } } } break; } case MESA_SHADER_GEOMETRY: case MESA_SHADER_COMPUTE: + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_TESS_EVAL: unreachable("fs_visitor not used for these stages yet."); break; case MESA_SHADER_FRAGMENT: if (var->data.location == VARYING_SLOT_POS) { reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer, var->data.origin_upper_left); - emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, input, reg), 0xF); + emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(), + input, reg), 0xF); } else { emit_general_interpolation(input, var->name, var->type, (glsl_interp_qualifier) var->data.interpolation, @@ -126,45 +120,54 @@ fs_visitor::nir_setup_outputs(nir_shader *shader) { brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + nir_outputs = bld.vgrf(BRW_REGISTER_TYPE_F, shader->num_outputs); + foreach_list_typed(nir_variable, var, node, &shader->outputs) { - fs_reg reg = offset(nir_outputs, var->data.driver_location); + fs_reg reg = offset(nir_outputs, bld, var->data.driver_location); int vector_elements = var->type->is_array() ? 
var->type->fields.array->vector_elements : var->type->vector_elements; - if (stage == MESA_SHADER_VERTEX) { + switch (stage) { + case MESA_SHADER_VERTEX: for (int i = 0; i < ALIGN(type_size(var->type), 4) / 4; i++) { int output = var->data.location + i; - this->outputs[output] = offset(reg, 4 * i); + this->outputs[output] = offset(reg, bld, 4 * i); this->output_components[output] = vector_elements; } - } else if (var->data.index > 0) { - assert(var->data.location == FRAG_RESULT_DATA0); - assert(var->data.index == 1); - this->dual_src_output = reg; - this->do_dual_src = true; - } else if (var->data.location == FRAG_RESULT_COLOR) { - /* Writing gl_FragColor outputs to all color regions. */ - for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) { - this->outputs[i] = reg; - this->output_components[i] = 4; - } - } else if (var->data.location == FRAG_RESULT_DEPTH) { - this->frag_depth = reg; - } else if (var->data.location == FRAG_RESULT_SAMPLE_MASK) { - this->sample_mask = reg; - } else { - /* gl_FragData or a user-defined FS output */ - assert(var->data.location >= FRAG_RESULT_DATA0 && - var->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS); - - /* General color output. */ - for (unsigned int i = 0; i < MAX2(1, var->type->length); i++) { - int output = var->data.location - FRAG_RESULT_DATA0 + i; - this->outputs[output] = offset(reg, vector_elements * i); - this->output_components[output] = vector_elements; + break; + case MESA_SHADER_FRAGMENT: + if (var->data.index > 0) { + assert(var->data.location == FRAG_RESULT_DATA0); + assert(var->data.index == 1); + this->dual_src_output = reg; + this->do_dual_src = true; + } else if (var->data.location == FRAG_RESULT_COLOR) { + /* Writing gl_FragColor outputs to all color regions. */ + for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) { + this->outputs[i] = reg; + this->output_components[i] = 4; + } + } else if (var->data.location == FRAG_RESULT_DEPTH) { + this->frag_depth = reg; + } else if (var->data.location == FRAG_RESULT_SAMPLE_MASK) { + this->sample_mask = reg; + } else { + /* gl_FragData or a user-defined FS output */ + assert(var->data.location >= FRAG_RESULT_DATA0 && + var->data.location < FRAG_RESULT_DATA0+BRW_MAX_DRAW_BUFFERS); + + /* General color output. */ + for (unsigned int i = 0; i < MAX2(1, var->type->length); i++) { + int output = var->data.location - FRAG_RESULT_DATA0 + i; + this->outputs[output] = offset(reg, bld, vector_elements * i); + this->output_components[output] = vector_elements; + } } + break; + default: + unreachable("unhandled shader stage"); } } } @@ -172,18 +175,20 @@ fs_visitor::nir_setup_outputs(nir_shader *shader) void fs_visitor::nir_setup_uniforms(nir_shader *shader) { - uniforms = shader->num_uniforms; num_direct_uniforms = shader->num_direct_uniforms; + if (dispatch_width != 8) + return; + /* We split the uniform register file in half. The first half is * entirely direct uniforms. The second half is indirect. 
*/ - param_size[0] = num_direct_uniforms; + if (num_direct_uniforms > 0) + param_size[0] = num_direct_uniforms; if (shader->num_uniforms > num_direct_uniforms) param_size[num_direct_uniforms] = shader->num_uniforms - num_direct_uniforms; - if (dispatch_width != 8) - return; + uniforms = shader->num_uniforms; if (shader_prog) { foreach_list_typed(nir_variable, var, node, &shader->uniforms) { @@ -233,17 +238,26 @@ fs_visitor::nir_setup_uniform(nir_variable *var) continue; } - unsigned slots = storage->type->component_slots(); - if (storage->array_elements) - slots *= storage->array_elements; + if (storage->type->is_image()) { + /* Images don't get a valid location assigned by nir_lower_io() + * because their size is driver-specific, so we need to allocate + * space for them here at the end of the parameter array. + */ + var->data.driver_location = uniforms; + param_size[uniforms] = + BRW_IMAGE_PARAM_SIZE * MAX2(storage->array_elements, 1); - for (unsigned i = 0; i < slots; i++) { - stage_prog_data->param[index++] = &storage->storage[i]; + setup_image_uniform_values(storage); + } else { + unsigned slots = storage->type->component_slots(); + if (storage->array_elements) + slots *= storage->array_elements; + + for (unsigned i = 0; i < slots; i++) { + stage_prog_data->param[index++] = &storage->storage[i]; + } } } - - /* Make sure we actually initialized the right amount of stuff here. */ - assert(var->data.driver_location + var->type->component_slots() == index); } void @@ -366,6 +380,9 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl) nir_locals[reg->index] = bld.vgrf(BRW_REGISTER_TYPE_F, size); } + nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg, + impl->ssa_alloc); + nir_emit_cf_list(&impl->body); } @@ -413,18 +430,12 @@ fs_visitor::nir_emit_if(nir_if *if_stmt) bld.emit(BRW_OPCODE_ENDIF); - if (!try_replace_with_sel() && devinfo->gen < 6) { - no16("Can't support (non-uniform) control flow on SIMD16\n"); - } + try_replace_with_sel(); } void fs_visitor::nir_emit_loop(nir_loop *loop) { - if (devinfo->gen < 6) { - no16("Can't support (non-uniform) control flow on SIMD16\n"); - } - bld.emit(BRW_OPCODE_DO); nir_emit_cf_list(&loop->body); @@ -459,9 +470,11 @@ fs_visitor::nir_emit_instr(nir_instr *instr) break; case nir_instr_type_load_const: - /* We can hit these, but we do nothing now and use them as - * immediates later. 
- */ + nir_emit_load_const(abld, nir_instr_as_load_const(instr)); + break; + + case nir_instr_type_ssa_undef: + nir_emit_undef(abld, nir_instr_as_ssa_undef(instr)); break; case nir_instr_type_jump: @@ -473,39 +486,16 @@ fs_visitor::nir_emit_instr(nir_instr *instr) } } -static brw_reg_type -brw_type_for_nir_type(nir_alu_type type) -{ - switch (type) { - case nir_type_unsigned: - return BRW_REGISTER_TYPE_UD; - case nir_type_bool: - case nir_type_int: - return BRW_REGISTER_TYPE_D; - case nir_type_float: - return BRW_REGISTER_TYPE_F; - default: - unreachable("unknown type"); - } - - return BRW_REGISTER_TYPE_F; -} - bool fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr, const fs_reg &result) { - if (instr->src[0].src.is_ssa || - !instr->src[0].src.reg.reg || - !instr->src[0].src.reg.reg->parent_instr) - return false; - - if (instr->src[0].src.reg.reg->parent_instr->type != - nir_instr_type_intrinsic) + if (!instr->src[0].src.is_ssa || + instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *src0 = - nir_instr_as_intrinsic(instr->src[0].src.reg.reg->parent_instr); + nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr); if (src0->intrinsic != nir_intrinsic_load_front_face) return false; @@ -618,11 +608,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) continue; if (instr->op == nir_op_imov || instr->op == nir_op_fmov) { - inst = bld.MOV(offset(temp, i), - offset(op[0], instr->src[0].swizzle[i])); + inst = bld.MOV(offset(temp, bld, i), + offset(op[0], bld, instr->src[0].swizzle[i])); } else { - inst = bld.MOV(offset(temp, i), - offset(op[i], instr->src[i].swizzle[0])); + inst = bld.MOV(offset(temp, bld, i), + offset(op[i], bld, instr->src[i].swizzle[0])); } inst->saturate = instr->dest.saturate; } @@ -636,7 +626,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) if (!(instr->dest.write_mask & (1 << i))) continue; - bld.MOV(offset(result, i), offset(temp, i)); + bld.MOV(offset(result, bld, i), offset(temp, bld, i)); } } return; @@ -657,12 +647,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) assert(_mesa_bitcount(instr->dest.write_mask) == 1); channel = ffs(instr->dest.write_mask) - 1; - result = offset(result, channel); + result = offset(result, bld, channel); } for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { assert(nir_op_infos[instr->op].input_sizes[i] < 2); - op[i] = offset(op[i], instr->src[i].swizzle[channel]); + op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]); } switch (instr->op) { @@ -788,67 +778,20 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) break; case nir_op_imul_high: - case nir_op_umul_high: { - if (devinfo->gen >= 7) - no16("SIMD16 explicit accumulator operands unsupported\n"); - - struct brw_reg acc = retype(brw_acc_reg(dispatch_width), result.type); - - fs_inst *mul = bld.MUL(acc, op[0], op[1]); - bld.MACH(result, op[0], op[1]); - - /* Until Gen8, integer multiplies read 32-bits from one source, and - * 16-bits from the other, and relying on the MACH instruction to - * generate the high bits of the result. - * - * On Gen8, the multiply instruction does a full 32x32-bit multiply, - * but in order to do a 64x64-bit multiply we have to simulate the - * previous behavior and then use a MACH instruction. - * - * FINISHME: Don't use source modifiers on src1. 
- */ - if (devinfo->gen >= 8) { - assert(mul->src[1].type == BRW_REGISTER_TYPE_D || - mul->src[1].type == BRW_REGISTER_TYPE_UD); - if (mul->src[1].type == BRW_REGISTER_TYPE_D) { - mul->src[1].type = BRW_REGISTER_TYPE_W; - mul->src[1].stride = 2; - } else { - mul->src[1].type = BRW_REGISTER_TYPE_UW; - mul->src[1].stride = 2; - } - } + case nir_op_umul_high: + bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]); break; - } case nir_op_idiv: case nir_op_udiv: bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]); break; - case nir_op_uadd_carry: { - if (devinfo->gen >= 7) - no16("SIMD16 explicit accumulator operands unsupported\n"); + case nir_op_uadd_carry: + unreachable("Should have been lowered by carry_to_arith()."); - struct brw_reg acc = retype(brw_acc_reg(dispatch_width), - BRW_REGISTER_TYPE_UD); - - bld.ADDC(bld.null_reg_ud(), op[0], op[1]); - bld.MOV(result, fs_reg(acc)); - break; - } - - case nir_op_usub_borrow: { - if (devinfo->gen >= 7) - no16("SIMD16 explicit accumulator operands unsupported\n"); - - struct brw_reg acc = retype(brw_acc_reg(dispatch_width), - BRW_REGISTER_TYPE_UD); - - bld.SUBB(bld.null_reg_ud(), op[0], op[1]); - bld.MOV(result, fs_reg(acc)); - break; - } + case nir_op_usub_borrow: + unreachable("Should have been lowered by borrow_to_arith()."); case nir_op_umod: bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]); @@ -878,28 +821,28 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) case nir_op_inot: if (devinfo->gen >= 8) { - resolve_source_modifiers(&op[0]); + op[0] = resolve_source_modifiers(op[0]); } bld.NOT(result, op[0]); break; case nir_op_ixor: if (devinfo->gen >= 8) { - resolve_source_modifiers(&op[0]); - resolve_source_modifiers(&op[1]); + op[0] = resolve_source_modifiers(op[0]); + op[1] = resolve_source_modifiers(op[1]); } bld.XOR(result, op[0], op[1]); break; case nir_op_ior: if (devinfo->gen >= 8) { - resolve_source_modifiers(&op[0]); - resolve_source_modifiers(&op[1]); + op[0] = resolve_source_modifiers(op[0]); + op[1] = resolve_source_modifiers(op[1]); } bld.OR(result, op[0], op[1]); break; case nir_op_iand: if (devinfo->gen >= 8) { - resolve_source_modifiers(&op[0]); - resolve_source_modifiers(&op[1]); + op[0] = resolve_source_modifiers(op[0]); + op[1] = resolve_source_modifiers(op[1]); } bld.AND(result, op[0], op[1]); break; @@ -959,10 +902,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) break; case nir_op_b2i: - bld.AND(result, op[0], fs_reg(1)); - break; case nir_op_b2f: - bld.AND(retype(result, BRW_REGISTER_TYPE_UD), op[0], fs_reg(0x3f800000u)); + bld.MOV(result, negate(op[0])); break; case nir_op_f2b: @@ -1146,17 +1087,36 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) } } +void +fs_visitor::nir_emit_load_const(const fs_builder &bld, + nir_load_const_instr *instr) +{ + fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_D, instr->def.num_components); + + for (unsigned i = 0; i < instr->def.num_components; i++) + bld.MOV(offset(reg, bld, i), fs_reg(instr->value.i[i])); + + nir_ssa_values[instr->def.index] = reg; +} + +void +fs_visitor::nir_emit_undef(const fs_builder &bld, nir_ssa_undef_instr *instr) +{ + nir_ssa_values[instr->def.index] = bld.vgrf(BRW_REGISTER_TYPE_D, + instr->def.num_components); +} + static fs_reg fs_reg_for_nir_reg(fs_visitor *v, nir_register *nir_reg, unsigned base_offset, nir_src *indirect) { fs_reg reg; - if (nir_reg->is_global) - reg = v->nir_globals[nir_reg->index]; - else - reg = v->nir_locals[nir_reg->index]; - reg = offset(reg, base_offset * 
nir_reg->num_components); + assert(!nir_reg->is_global); + + reg = v->nir_locals[nir_reg->index]; + + reg = offset(reg, v->bld, base_offset * nir_reg->num_components); if (indirect) { int multiplier = nir_reg->num_components * (v->dispatch_width / 8); @@ -1171,34 +1131,77 @@ fs_reg_for_nir_reg(fs_visitor *v, nir_register *nir_reg, fs_reg fs_visitor::get_nir_src(nir_src src) { + fs_reg reg; if (src.is_ssa) { - assert(src.ssa->parent_instr->type == nir_instr_type_load_const); - nir_load_const_instr *load = nir_instr_as_load_const(src.ssa->parent_instr); - fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_D, src.ssa->num_components); - - for (unsigned i = 0; i < src.ssa->num_components; ++i) - bld.MOV(offset(reg, i), fs_reg(load->value.i[i])); - - return reg; + reg = nir_ssa_values[src.ssa->index]; } else { - fs_reg reg = fs_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset, - src.reg.indirect); - - /* to avoid floating-point denorm flushing problems, set the type by - * default to D - instructions that need floating point semantics will set - * this to F if they need to - */ - return retype(reg, BRW_REGISTER_TYPE_D); + reg = fs_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset, + src.reg.indirect); } + + /* to avoid floating-point denorm flushing problems, set the type by + * default to D - instructions that need floating point semantics will set + * this to F if they need to + */ + return retype(reg, BRW_REGISTER_TYPE_D); } fs_reg fs_visitor::get_nir_dest(nir_dest dest) { + if (dest.is_ssa) { + nir_ssa_values[dest.ssa.index] = bld.vgrf(BRW_REGISTER_TYPE_F, + dest.ssa.num_components); + return nir_ssa_values[dest.ssa.index]; + } + return fs_reg_for_nir_reg(this, dest.reg.reg, dest.reg.base_offset, dest.reg.indirect); } +fs_reg +fs_visitor::get_nir_image_deref(const nir_deref_var *deref) +{ + fs_reg image(UNIFORM, deref->var->data.driver_location, + BRW_REGISTER_TYPE_UD); + + if (deref->deref.child) { + const nir_deref_array *deref_array = + nir_deref_as_array(deref->deref.child); + assert(deref->deref.child->deref_type == nir_deref_type_array && + deref_array->deref.child == NULL); + const unsigned size = glsl_get_length(deref->var->type); + const unsigned base = MIN2(deref_array->base_offset, size - 1); + + image = offset(image, bld, base * BRW_IMAGE_PARAM_SIZE); + + if (deref_array->deref_array_type == nir_deref_array_type_indirect) { + fs_reg *tmp = new(mem_ctx) fs_reg(vgrf(glsl_type::int_type)); + + if (devinfo->gen == 7 && !devinfo->is_haswell) { + /* IVB hangs when trying to access an invalid surface index with + * the dataport. According to the spec "if the index used to + * select an individual element is negative or greater than or + * equal to the size of the array, the results of the operation + * are undefined but may not lead to termination" -- which is one + * of the possible outcomes of the hang. Clamp the index to + * prevent access outside of the array bounds. 
+ */ + bld.emit_minmax(*tmp, retype(get_nir_src(deref_array->indirect), + BRW_REGISTER_TYPE_UD), + fs_reg(size - base - 1), BRW_CONDITIONAL_L); + } else { + bld.MOV(*tmp, get_nir_src(deref_array->indirect)); + } + + bld.MUL(*tmp, *tmp, fs_reg(BRW_IMAGE_PARAM_SIZE)); + image.reladdr = tmp; + } + } + + return image; +} + void fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst, unsigned wr_mask) @@ -1208,15 +1211,64 @@ fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst, continue; fs_inst *new_inst = new(mem_ctx) fs_inst(inst); - new_inst->dst = offset(new_inst->dst, i); + new_inst->dst = offset(new_inst->dst, bld, i); for (unsigned j = 0; j < new_inst->sources; j++) if (new_inst->src[j].file == GRF) - new_inst->src[j] = offset(new_inst->src[j], i); + new_inst->src[j] = offset(new_inst->src[j], bld, i); bld.emit(new_inst); } } +/** + * Get the matching channel register datatype for an image intrinsic of the + * specified GLSL image type. + */ +static brw_reg_type +get_image_base_type(const glsl_type *type) +{ + switch ((glsl_base_type)type->sampler_type) { + case GLSL_TYPE_UINT: + return BRW_REGISTER_TYPE_UD; + case GLSL_TYPE_INT: + return BRW_REGISTER_TYPE_D; + case GLSL_TYPE_FLOAT: + return BRW_REGISTER_TYPE_F; + default: + unreachable("Not reached."); + } +} + +/** + * Get the appropriate atomic op for an image atomic intrinsic. + */ +static unsigned +get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type) +{ + switch (op) { + case nir_intrinsic_image_atomic_add: + return BRW_AOP_ADD; + case nir_intrinsic_image_atomic_min: + return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ? + BRW_AOP_IMIN : BRW_AOP_UMIN); + case nir_intrinsic_image_atomic_max: + return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ? + BRW_AOP_IMAX : BRW_AOP_UMAX); + case nir_intrinsic_image_atomic_and: + return BRW_AOP_AND; + case nir_intrinsic_image_atomic_or: + return BRW_AOP_OR; + case nir_intrinsic_image_atomic_xor: + return BRW_AOP_XOR; + case nir_intrinsic_image_atomic_exchange: + return BRW_AOP_MOV; + case nir_intrinsic_image_atomic_comp_swap: + return BRW_AOP_CMPWR; + default: + unreachable("Not reachable."); + } +} + void fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) { @@ -1255,25 +1307,102 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_atomic_counter_inc: case nir_intrinsic_atomic_counter_dec: case nir_intrinsic_atomic_counter_read: { - unsigned surf_index = prog_data->binding_table.abo_start + - (unsigned) instr->const_index[0]; - fs_reg offset = fs_reg(get_nir_src(instr->src[0])); + using namespace surface_access; + /* Get the arguments of the atomic intrinsic. */ + const fs_reg offset = get_nir_src(instr->src[0]); + const unsigned surface = (stage_prog_data->binding_table.abo_start + + instr->const_index[0]); + fs_reg tmp; + + /* Emit a surface read or atomic op. 
*/ switch (instr->intrinsic) { - case nir_intrinsic_atomic_counter_inc: - emit_untyped_atomic(BRW_AOP_INC, surf_index, dest, offset, - fs_reg(), fs_reg()); - break; - case nir_intrinsic_atomic_counter_dec: - emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dest, offset, - fs_reg(), fs_reg()); - break; - case nir_intrinsic_atomic_counter_read: - emit_untyped_surface_read(surf_index, dest, offset); - break; - default: - unreachable("Unreachable"); + case nir_intrinsic_atomic_counter_read: + tmp = emit_untyped_read(bld, fs_reg(surface), offset, 1, 1); + break; + + case nir_intrinsic_atomic_counter_inc: + tmp = emit_untyped_atomic(bld, fs_reg(surface), offset, fs_reg(), + fs_reg(), 1, 1, BRW_AOP_INC); + break; + + case nir_intrinsic_atomic_counter_dec: + tmp = emit_untyped_atomic(bld, fs_reg(surface), offset, fs_reg(), + fs_reg(), 1, 1, BRW_AOP_PREDEC); + break; + + default: + unreachable("Unreachable"); } + + /* Assign the result. */ + bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp); + + /* Mark the surface as used. */ + brw_mark_surface_used(stage_prog_data, surface); + break; + } + + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_image_atomic_add: + case nir_intrinsic_image_atomic_min: + case nir_intrinsic_image_atomic_max: + case nir_intrinsic_image_atomic_and: + case nir_intrinsic_image_atomic_or: + case nir_intrinsic_image_atomic_xor: + case nir_intrinsic_image_atomic_exchange: + case nir_intrinsic_image_atomic_comp_swap: { + using namespace image_access; + + /* Get the referenced image variable and type. */ + const nir_variable *var = instr->variables[0]->var; + const glsl_type *type = var->type->without_array(); + const brw_reg_type base_type = get_image_base_type(type); + + /* Get some metadata from the image intrinsic. */ + const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; + const unsigned arr_dims = type->sampler_array ? 1 : 0; + const unsigned surf_dims = type->coordinate_components() - arr_dims; + const mesa_format format = + (var->data.image.write_only ? MESA_FORMAT_NONE : + _mesa_get_shader_image_format(var->data.image.format)); + + /* Get the arguments of the image intrinsic. */ + const fs_reg image = get_nir_image_deref(instr->variables[0]); + const fs_reg addr = retype(get_nir_src(instr->src[0]), + BRW_REGISTER_TYPE_UD); + const fs_reg src0 = (info->num_srcs >= 3 ? + retype(get_nir_src(instr->src[2]), base_type) : + fs_reg()); + const fs_reg src1 = (info->num_srcs >= 4 ? + retype(get_nir_src(instr->src[3]), base_type) : + fs_reg()); + fs_reg tmp; + + /* Emit an image load, store or atomic op. */ + if (instr->intrinsic == nir_intrinsic_image_load) + tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format); + + else if (instr->intrinsic == nir_intrinsic_image_store) + emit_image_store(bld, image, addr, src0, surf_dims, arr_dims, format); + + else + tmp = emit_image_atomic(bld, image, addr, src0, src1, + surf_dims, arr_dims, info->dest_components, + get_image_atomic_op(instr->intrinsic, type)); + + /* Assign the result. 
*/ + for (unsigned c = 0; c < info->dest_components; ++c) + bld.MOV(offset(retype(dest, base_type), bld, c), + offset(tmp, bld, c)); + break; + } + + case nir_intrinsic_memory_barrier: { + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 16 / dispatch_width); + bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp) + ->regs_written = 2; break; } @@ -1322,7 +1451,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr assert(sample_pos.file != BAD_FILE); dest.type = sample_pos.type; bld.MOV(dest, sample_pos); - bld.MOV(offset(dest, 1), offset(sample_pos, 1)); + bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1)); break; } @@ -1349,13 +1478,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr } for (unsigned j = 0; j < instr->num_components; j++) { - fs_reg src = offset(retype(uniform_reg, dest.type), index); + fs_reg src = offset(retype(uniform_reg, dest.type), bld, index); if (has_indirect) src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0])); index++; bld.MOV(dest, src); - dest = offset(dest, 1); + dest = offset(dest, bld, 1); } break; } @@ -1387,7 +1516,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr surf_index = vgrf(glsl_type::uint_type); bld.ADD(surf_index, get_nir_src(instr->src[0]), fs_reg(stage_prog_data->binding_table.ubo_start)); - bld.emit_uniformize(surf_index, surf_index); + surf_index = bld.emit_uniformize(surf_index); /* Assume this may touch any UBO. It would be nice to provide * a tighter bound, but the array information is already lowered away. @@ -1406,7 +1535,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr unsigned vec4_offset = instr->const_index[1] / 4; for (int i = 0; i < instr->num_components; i++) - VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, i), surf_index, + VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index, base_offset, vec4_offset + i); } else { fs_reg packed_consts = vgrf(glsl_type::float_type); @@ -1425,7 +1554,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr assert(packed_consts.subreg_offset < 32); bld.MOV(dest, packed_consts); - dest = offset(dest, 1); + dest = offset(dest, bld, 1); } } break; @@ -1437,14 +1566,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_load_input: { unsigned index = 0; for (unsigned j = 0; j < instr->num_components; j++) { - fs_reg src = offset(retype(nir_inputs, dest.type), + fs_reg src = offset(retype(nir_inputs, dest.type), bld, instr->const_index[0] + index); if (has_indirect) src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0])); index++; bld.MOV(dest, src); - dest = offset(dest, 1); + dest = offset(dest, bld, 1); } break; } @@ -1470,11 +1599,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_interp_var_at_centroid: case nir_intrinsic_interp_var_at_sample: case nir_intrinsic_interp_var_at_offset: { - /* in SIMD16 mode, the pixel interpolator returns coords interleaved - * 8 channels at a time, same as the barycentric coords presented in - * the FS payload. this requires a bit of extra work to support. 
- */ - no16("interpolate_at_* not yet supported in SIMD16 mode."); + assert(stage == MESA_SHADER_FRAGMENT); + + ((struct brw_wm_prog_data *) prog_data)->pulls_bary = true; fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2); @@ -1517,7 +1644,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr BRW_REGISTER_TYPE_F); for (int i = 0; i < 2; i++) { fs_reg temp = vgrf(glsl_type::float_type); - bld.MUL(temp, offset(offset_src, i), fs_reg(16.0f)); + bld.MUL(temp, offset(offset_src, bld, i), fs_reg(16.0f)); fs_reg itemp = vgrf(glsl_type::int_type); bld.MOV(itemp, temp); /* float to int */ @@ -1537,10 +1664,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr * FRAGMENT_INTERPOLATION_OFFSET_BITS" */ set_condmod(BRW_CONDITIONAL_L, - bld.SEL(offset(src, i), itemp, fs_reg(7))); + bld.SEL(offset(src, bld, i), itemp, fs_reg(7))); } - mlen = 2; + mlen = 2 * dispatch_width / 8; inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src, fs_reg(0u)); } @@ -1552,7 +1679,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr } inst->mlen = mlen; - inst->regs_written = 2; /* 2 floats per slot returned */ + /* 2 floats per slot returned */ + inst->regs_written = 2 * dispatch_width / 8; inst->pi_noperspective = instr->variables[0]->var->data.interpolation == INTERP_QUALIFIER_NOPERSPECTIVE; @@ -1561,7 +1689,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr src.type = dest.type; bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src); - dest = offset(dest, 1); + dest = offset(dest, bld, 1); } break; } @@ -1573,13 +1701,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr fs_reg src = get_nir_src(instr->src[0]); unsigned index = 0; for (unsigned j = 0; j < instr->num_components; j++) { - fs_reg new_dest = offset(retype(nir_outputs, src.type), + fs_reg new_dest = offset(retype(nir_outputs, src.type), bld, instr->const_index[0] + index); if (has_indirect) src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[1])); index++; bld.MOV(new_dest, src); - src = offset(src, 1); + src = offset(src, bld, 1); } break; } @@ -1689,7 +1817,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) /* Emit code to evaluate the actual indexing expression */ sampler_reg = vgrf(glsl_type::uint_type); bld.ADD(sampler_reg, src, fs_reg(sampler)); - bld.emit_uniformize(sampler_reg, sampler_reg); + sampler_reg = bld.emit_uniformize(sampler_reg); break; } @@ -1715,20 +1843,8 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) } } - enum glsl_base_type dest_base_type; - switch (instr->dest_type) { - case nir_type_float: - dest_base_type = GLSL_TYPE_FLOAT; - break; - case nir_type_int: - dest_base_type = GLSL_TYPE_INT; - break; - case nir_type_unsigned: - dest_base_type = GLSL_TYPE_UINT; - break; - default: - unreachable("bad type"); - } + enum glsl_base_type dest_base_type = + brw_glsl_base_type_for_nir_type (instr->dest_type); const glsl_type *dest_type = glsl_type::get_instance(dest_base_type, nir_tex_instr_dest_size(instr), @@ -1758,7 +1874,8 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) fs_reg dest = get_nir_dest(instr->dest); dest.type = this->result.type; unsigned num_components = nir_tex_instr_dest_size(instr); - emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, dest, this->result), + emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(), + dest, this->result), (1 << num_components) - 1); } 
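A minimal worked example of the image intrinsic path added in the brw_fs_nir.cpp hunks above, showing what the generic code reduces to for an atomic add on a non-array 2D uint image. The free function is hypothetical, and it assumes brw_fs_surface_builder.h and the brw::image_access helpers introduced by this series:

static fs_reg
example_uimage2D_atomic_add(const fs_builder &bld, const fs_reg &image,
                            const fs_reg &addr, const fs_reg &data)
{
   using namespace brw::image_access;

   /* For a non-array 2D image the address has two surface coordinates
    * and no array index; 'data' is assumed to already be retyped to the
    * image base type (UD for a uint image), as nir_emit_intrinsic()
    * does above.
    */
   const unsigned surf_dims = 2;
   const unsigned arr_dims = 0;
   return emit_image_atomic(bld, image, addr, data, fs_reg(),
                            surf_dims, arr_dims, 1 /* dest components */,
                            BRW_AOP_ADD);
}

The second data operand is only needed by compare-and-swap, so an empty fs_reg() is passed here, matching the call site in nir_emit_intrinsic().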
diff --git a/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp b/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp index d92d4bbd81d..b75f40ba5a1 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp @@ -24,6 +24,8 @@ #include "brw_fs.h" #include "brw_cfg.h" +using namespace brw; + /** @file brw_fs_peephole_predicated_break.cpp * * Loops are often structured as @@ -85,9 +87,9 @@ fs_visitor::opt_peephole_predicated_break() * instruction to set the flag register. */ if (devinfo->gen == 6 && if_inst->conditional_mod) { - bld.at(if_block, if_inst) - .CMP(bld.null_reg_d(), if_inst->src[0], if_inst->src[1], - if_inst->conditional_mod); + const fs_builder ibld(this, if_block, if_inst); + ibld.CMP(ibld.null_reg_d(), if_inst->src[0], if_inst->src[1], + if_inst->conditional_mod); jump_inst->predicate = BRW_PREDICATE_NORMAL; } else { jump_inst->predicate = if_inst->predicate; diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index 364fc4a5ad2..b70895ec2ff 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -73,11 +73,20 @@ fs_visitor::assign_regs_trivial() } static void -brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width) +brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width) { const struct brw_device_info *devinfo = compiler->devinfo; int base_reg_count = BRW_MAX_GRF; - int index = reg_width - 1; + int index = (dispatch_width / 8) - 1; + + if (dispatch_width > 8 && devinfo->gen >= 7) { + /* For IVB+, we don't need the PLN hacks or the even-reg alignment in + * SIMD16. Therefore, we can use the exact same register sets for + * SIMD16 as we do for SIMD8 and we don't need to recalculate them. + */ + compiler->fs_reg_sets[index] = compiler->fs_reg_sets[0]; + return; + } /* The registers used to make up almost all values handled in the compiler * are a scalar value occupying a single register (or 2 registers in the @@ -121,7 +130,7 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width) /* Compute the total number of registers across all classes. */ int ra_reg_count = 0; for (int i = 0; i < class_count; i++) { - if (devinfo->gen <= 5 && reg_width == 2) { + if (devinfo->gen <= 5 && dispatch_width == 16) { /* From the G45 PRM: * * In order to reduce the hardware complexity, the following @@ -168,7 +177,7 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width) int pairs_reg_count = 0; for (int i = 0; i < class_count; i++) { int class_reg_count; - if (devinfo->gen <= 5 && reg_width == 2) { + if (devinfo->gen <= 5 && dispatch_width == 16) { class_reg_count = (base_reg_count - (class_sizes[i] - 1)) / 2; /* See comment below. The only difference here is that we are @@ -214,7 +223,7 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width) pairs_reg_count = class_reg_count; } - if (devinfo->gen <= 5 && reg_width == 2) { + if (devinfo->gen <= 5 && dispatch_width == 16) { for (int j = 0; j < class_reg_count; j++) { ra_class_add_reg(regs, classes[i], reg); @@ -249,7 +258,7 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width) /* Add a special class for aligned pairs, which we'll put delta_xy * in on Gen <= 6 so that we can do PLN. 
*/ - if (devinfo->has_pln && reg_width == 1 && devinfo->gen <= 6) { + if (devinfo->has_pln && dispatch_width == 8 && devinfo->gen <= 6) { aligned_pairs_class = ra_alloc_reg_class(regs); for (int i = 0; i < pairs_reg_count; i++) { @@ -287,8 +296,8 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width) void brw_fs_alloc_reg_sets(struct brw_compiler *compiler) { - brw_alloc_reg_set(compiler, 1); - brw_alloc_reg_set(compiler, 2); + brw_alloc_reg_set(compiler, 8); + brw_alloc_reg_set(compiler, 16); } static int @@ -341,7 +350,9 @@ fs_visitor::setup_payload_interference(struct ra_graph *g, int loop_end_ip = 0; int payload_last_use_ip[payload_node_count]; - memset(payload_last_use_ip, 0, sizeof(payload_last_use_ip)); + for (int i = 0; i < payload_node_count; i++) + payload_last_use_ip[i] = -1; + int ip = 0; foreach_block_and_inst(block, fs_inst, inst, cfg) { switch (inst->opcode) { @@ -380,32 +391,15 @@ fs_visitor::setup_payload_interference(struct ra_graph *g, if (node_nr >= payload_node_count) continue; - payload_last_use_ip[node_nr] = use_ip; + for (int j = 0; j < inst->regs_read(i); j++) { + payload_last_use_ip[node_nr + j] = use_ip; + assert(node_nr + j < payload_node_count); + } } } /* Special case instructions which have extra implied registers used. */ switch (inst->opcode) { - case FS_OPCODE_LINTERP: - /* On gen6+ in SIMD16, there are 4 adjacent registers used by - * PLN's sourcing of the deltas, while we list only the first one - * in the arguments. Pre-gen6, the deltas are computed in normal - * VGRFs. - */ - if (devinfo->gen >= 6) { - int delta_x_arg = 0; - if (inst->src[delta_x_arg].file == HW_REG && - inst->src[delta_x_arg].fixed_hw_reg.file == - BRW_GENERAL_REGISTER_FILE) { - for (int i = 1; i < 4; ++i) { - int node = inst->src[delta_x_arg].fixed_hw_reg.nr + i; - assert(node < payload_node_count); - payload_last_use_ip[node] = use_ip; - } - } - } - break; - case CS_OPCODE_CS_TERMINATE: payload_last_use_ip[0] = use_ip; break; @@ -428,6 +422,9 @@ fs_visitor::setup_payload_interference(struct ra_graph *g, } for (int i = 0; i < payload_node_count; i++) { + if (payload_last_use_ip[i] == -1) + continue; + /* Mark the payload node as interfering with any virtual grf that is * live between the start of the program and our last use of the payload * node. 
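A small sketch (hypothetical helper, not in the patch) of the bookkeeping the setup_payload_interference() hunk above now performs when an instruction source reads one or more payload registers:

#include <assert.h>

static void
example_mark_payload_use(int *payload_last_use_ip, int payload_node_count,
                         int node_nr, int regs_read, int use_ip)
{
   /* A source spanning several registers keeps every one of them live
    * up to this IP; entries left at the -1 sentinel were never read and
    * are skipped entirely when interference edges are added later.
    */
   for (int j = 0; j < regs_read; j++) {
      assert(node_nr + j < payload_node_count);
      payload_last_use_ip[node_nr + j] = use_ip;
   }
}

Seeding the array with -1 instead of 0 is what lets completely unused payload registers drop out of the interference graph.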
@@ -706,10 +703,8 @@ fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst, uint32_t spill_offset, int count) { int reg_size = 1; - if (dispatch_width == 16 && count % 2 == 0) { + if (dispatch_width == 16 && count % 2 == 0) reg_size = 2; - dst.width = 16; - } const fs_builder ibld = bld.annotate(inst->annotation, inst->ir) .group(reg_size * 8, 0) @@ -752,7 +747,7 @@ fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src, for (int i = 0; i < count / reg_size; i++) { fs_inst *spill_inst = - ibld.emit(SHADER_OPCODE_GEN4_SCRATCH_WRITE, bld.null_reg_f(), src); + ibld.emit(SHADER_OPCODE_GEN4_SCRATCH_WRITE, ibld.null_reg_f(), src); src.reg_offset += reg_size; spill_inst->offset = spill_offset + i * reg_size * REG_SIZE; spill_inst->mlen = 1 + reg_size; /* header, value */ diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp index 2ad7079bdf8..72e873857ce 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp @@ -167,7 +167,6 @@ fs_visitor::register_coalesce() src_size = alloc.sizes[inst->src[0].reg]; assert(src_size <= MAX_VGRF_SIZE); - assert(inst->src[0].width % 8 == 0); channels_remaining = src_size; memset(mov, 0, sizeof(mov)); @@ -196,7 +195,7 @@ fs_visitor::register_coalesce() continue; } reg_to_offset[offset] = inst->dst.reg_offset; - if (inst->src[0].width == 16) + if (inst->regs_written > 1) reg_to_offset[offset + 1] = inst->dst.reg_offset + 1; mov[offset] = inst; channels_remaining -= inst->regs_written; @@ -229,7 +228,6 @@ fs_visitor::register_coalesce() continue; progress = true; - bool was_load_payload = inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD; for (int i = 0; i < src_size; i++) { if (mov[i]) { @@ -243,22 +241,19 @@ fs_visitor::register_coalesce() } foreach_block_and_inst(block, fs_inst, scan_inst, cfg) { - for (int i = 0; i < src_size; i++) { - if (mov[i] || was_load_payload) { - if (scan_inst->dst.file == GRF && - scan_inst->dst.reg == reg_from && - scan_inst->dst.reg_offset == i) { - scan_inst->dst.reg = reg_to; - scan_inst->dst.reg_offset = reg_to_offset[i]; - } - for (int j = 0; j < scan_inst->sources; j++) { - if (scan_inst->src[j].file == GRF && - scan_inst->src[j].reg == reg_from && - scan_inst->src[j].reg_offset == i) { - scan_inst->src[j].reg = reg_to; - scan_inst->src[j].reg_offset = reg_to_offset[i]; - } - } + if (scan_inst->dst.file == GRF && + scan_inst->dst.reg == reg_from) { + scan_inst->dst.reg = reg_to; + scan_inst->dst.reg_offset = + reg_to_offset[scan_inst->dst.reg_offset]; + } + + for (int j = 0; j < scan_inst->sources; j++) { + if (scan_inst->src[j].file == GRF && + scan_inst->src[j].reg == reg_from) { + scan_inst->src[j].reg = reg_to; + scan_inst->src[j].reg_offset = + reg_to_offset[scan_inst->src[j].reg_offset]; } } } diff --git a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp index 8660ec08b8f..d190d8eb6b4 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp @@ -174,6 +174,9 @@ fs_visitor::opt_peephole_sel() /* Check that the MOVs are the right form. 
*/ if (!then_mov[i]->dst.equals(else_mov[i]->dst) || + then_mov[i]->exec_size != else_mov[i]->exec_size || + then_mov[i]->force_sechalf != else_mov[i]->force_sechalf || + then_mov[i]->force_writemask_all != else_mov[i]->force_writemask_all || then_mov[i]->is_partial_write() || else_mov[i]->is_partial_write() || then_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE || @@ -192,14 +195,17 @@ fs_visitor::opt_peephole_sel() if (movs == 0) continue; - const fs_builder ibld = bld.at(block, if_inst); - /* Emit a CMP if our IF used the embedded comparison */ - if (devinfo->gen == 6 && if_inst->conditional_mod) + if (devinfo->gen == 6 && if_inst->conditional_mod) { + const fs_builder ibld(this, block, if_inst); ibld.CMP(ibld.null_reg_d(), if_inst->src[0], if_inst->src[1], if_inst->conditional_mod); + } for (int i = 0; i < movs; i++) { + const fs_builder ibld = fs_builder(this, then_block, then_mov[i]) + .at(block, if_inst); + if (then_mov[i]->src[0].equals(else_mov[i]->src[0])) { ibld.MOV(then_mov[i]->dst, then_mov[i]->src[0]); } else { diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp new file mode 100644 index 00000000000..50e0acd05f5 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp @@ -0,0 +1,1096 @@ +/* + * Copyright © 2013-2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_fs_surface_builder.h" +#include "brw_fs.h" + +using namespace brw; + +namespace brw { + namespace surface_access { + namespace { + /** + * Generate a logical send opcode for a surface message and return + * the result. + */ + fs_reg + emit_send(const fs_builder &bld, enum opcode opcode, + const fs_reg &addr, const fs_reg &src, const fs_reg &surface, + unsigned dims, unsigned arg, unsigned rsize, + brw_predicate pred = BRW_PREDICATE_NONE) + { + /* Reduce the dynamically uniform surface index to a single + * scalar. + */ + const fs_reg usurface = bld.emit_uniformize(surface); + const fs_reg srcs[] = { + addr, src, usurface, fs_reg(dims), fs_reg(arg) + }; + const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize); + fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs)); + + inst->regs_written = rsize * bld.dispatch_width() / 8; + inst->predicate = pred; + return dst; + } + } + + /** + * Emit an untyped surface read opcode. 
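[Annotation] emit_send() above sizes the destination in whole GRFs: each returned 32-bit component occupies one channel per SIMD lane, so rsize components at dispatch width 8 or 16 cover rsize or 2 * rsize registers. A one-liner capturing that sizing (assuming 32-byte GRFs holding eight dwords):

/* GRFs written by a surface message that returns 'rsize' 32-bit components
 * per channel at the given SIMD width, as in emit_send() above. */
constexpr unsigned surface_regs_written(unsigned rsize, unsigned dispatch_width)
{
   return rsize * dispatch_width / 8;   /* one 32-byte GRF holds 8 dwords */
}

static_assert(surface_regs_written(4, 16) == 8,
              "a SIMD16 RGBA read spans eight GRFs");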
\p dims determines the number + * of components of the address and \p size the number of components of + * the returned value. + */ + fs_reg + emit_untyped_read(const fs_builder &bld, + const fs_reg &surface, const fs_reg &addr, + unsigned dims, unsigned size, + brw_predicate pred) + { + return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, + addr, fs_reg(), surface, dims, size, size, pred); + } + + /** + * Emit an untyped surface write opcode. \p dims determines the number + * of components of the address and \p size the number of components of + * the argument. + */ + void + emit_untyped_write(const fs_builder &bld, const fs_reg &surface, + const fs_reg &addr, const fs_reg &src, + unsigned dims, unsigned size, + brw_predicate pred) + { + emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, + addr, src, surface, dims, size, 0, pred); + } + + /** + * Emit an untyped surface atomic opcode. \p dims determines the number + * of components of the address and \p rsize the number of components of + * the returned value (either zero or one). + */ + fs_reg + emit_untyped_atomic(const fs_builder &bld, + const fs_reg &surface, const fs_reg &addr, + const fs_reg &src0, const fs_reg &src1, + unsigned dims, unsigned rsize, unsigned op, + brw_predicate pred) + { + /* FINISHME: Factor out this frequently recurring pattern into a + * helper function. + */ + const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE); + const fs_reg srcs[] = { src0, src1 }; + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n); + bld.LOAD_PAYLOAD(tmp, srcs, n, 0); + + return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, + addr, tmp, surface, dims, op, rsize, pred); + } + + /** + * Emit a typed surface read opcode. \p dims determines the number of + * components of the address and \p size the number of components of the + * returned value. + */ + fs_reg + emit_typed_read(const fs_builder &bld, const fs_reg &surface, + const fs_reg &addr, unsigned dims, unsigned size) + { + return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL, + addr, fs_reg(), surface, dims, size, size); + } + + /** + * Emit a typed surface write opcode. \p dims determines the number of + * components of the address and \p size the number of components of the + * argument. + */ + void + emit_typed_write(const fs_builder &bld, const fs_reg &surface, + const fs_reg &addr, const fs_reg &src, + unsigned dims, unsigned size) + { + emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, + addr, src, surface, dims, size, 0); + } + + /** + * Emit a typed surface atomic opcode. \p dims determines the number of + * components of the address and \p rsize the number of components of + * the returned value (either zero or one). + */ + fs_reg + emit_typed_atomic(const fs_builder &bld, const fs_reg &surface, + const fs_reg &addr, + const fs_reg &src0, const fs_reg &src1, + unsigned dims, unsigned rsize, unsigned op, + brw_predicate pred) + { + /* FINISHME: Factor out this frequently recurring pattern into a + * helper function. + */ + const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE); + const fs_reg srcs[] = { src0, src1 }; + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n); + bld.LOAD_PAYLOAD(tmp, srcs, n, 0); + + return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL, + addr, tmp, surface, dims, op, rsize); + } + } +} + +namespace { + namespace image_format_info { + /** + * Simple 4-tuple of scalars used to pass around per-color component + * values. 
+ */ + struct color_u { + color_u(unsigned x = 0) : r(x), g(x), b(x), a(x) + { + } + + color_u(unsigned r, unsigned g, unsigned b, unsigned a) : + r(r), g(g), b(b), a(a) + { + } + + unsigned + operator[](unsigned i) const + { + const unsigned xs[] = { r, g, b, a }; + return xs[i]; + } + + unsigned r, g, b, a; + }; + + /** + * Return the per-channel bitfield widths for a given image format. + */ + inline color_u + get_bit_widths(mesa_format format) + { + return color_u(_mesa_get_format_bits(format, GL_RED_BITS), + _mesa_get_format_bits(format, GL_GREEN_BITS), + _mesa_get_format_bits(format, GL_BLUE_BITS), + _mesa_get_format_bits(format, GL_ALPHA_BITS)); + } + + /** + * Return the per-channel bitfield shifts for a given image format. + */ + inline color_u + get_bit_shifts(mesa_format format) + { + const color_u widths = get_bit_widths(format); + return color_u(0, widths.r, widths.r + widths.g, + widths.r + widths.g + widths.b); + } + + /** + * Return true if all present components have the same bit width. + */ + inline bool + is_homogeneous(mesa_format format) + { + const color_u widths = get_bit_widths(format); + return ((widths.g == 0 || widths.g == widths.r) && + (widths.b == 0 || widths.b == widths.r) && + (widths.a == 0 || widths.a == widths.r)); + } + + /** + * Return true if the format conversion boils down to a trivial copy. + */ + inline bool + is_conversion_trivial(const brw_device_info *devinfo, mesa_format format) + { + return (get_bit_widths(format).r == 32 && is_homogeneous(format)) || + format == brw_lower_mesa_image_format(devinfo, format); + } + + /** + * Return true if the hardware natively supports some format with + * compatible bitfield layout, but possibly different data types. + */ + inline bool + has_supported_bit_layout(const brw_device_info *devinfo, + mesa_format format) + { + const color_u widths = get_bit_widths(format); + const color_u lower_widths = get_bit_widths( + brw_lower_mesa_image_format(devinfo, format)); + + return (widths.r == lower_widths.r && + widths.g == lower_widths.g && + widths.b == lower_widths.b && + widths.a == lower_widths.a); + } + + /** + * Return true if we are required to spread individual components over + * several components of the format used by the hardware (RG32 and + * friends implemented as RGBA16UI). + */ + inline bool + has_split_bit_layout(const brw_device_info *devinfo, mesa_format format) + { + const mesa_format lower_format = + brw_lower_mesa_image_format(devinfo, format); + + return (_mesa_format_num_components(format) < + _mesa_format_num_components(lower_format)); + } + + /** + * Return true unless we have to fall back to untyped surface access. + * Fail! + */ + inline bool + has_matching_typed_format(const brw_device_info *devinfo, + mesa_format format) + { + return (_mesa_get_format_bytes(format) <= 4 || + (_mesa_get_format_bytes(format) <= 8 && + (devinfo->gen >= 8 || devinfo->is_haswell)) || + devinfo->gen >= 9); + } + + /** + * Return true if the hardware returns garbage in the unused high bits + * of each component. This may happen on IVB because we rely on the + * undocumented behavior that typed reads from surfaces of the + * unsupported R8 and R16 formats return useful data in their least + * significant bits. 
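[Annotation] get_bit_shifts() above derives each channel's starting bit by accumulating the widths of the channels before it (R at bit 0, G after R, and so on). The same accumulation on a hypothetical 10/10/10/2 layout, using a local copy of the color_u idea rather than the driver type:

/* Four per-channel values, mirroring the color_u helper above. */
struct color4 { unsigned r, g, b, a; };

/* Channel start positions follow from the widths of the preceding channels. */
constexpr color4 bit_shifts(color4 w)
{
   return { 0, w.r, w.r + w.g, w.r + w.g + w.b };
}

static_assert(bit_shifts({10, 10, 10, 2}).a == 30,
              "alpha of a 10/10/10/2 layout starts at bit 30");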
+ */ + inline bool + has_undefined_high_bits(const brw_device_info *devinfo, + mesa_format format) + { + const mesa_format lower_format = + brw_lower_mesa_image_format(devinfo, format); + + return (devinfo->gen == 7 && !devinfo->is_haswell && + (lower_format == MESA_FORMAT_R_UINT16 || + lower_format == MESA_FORMAT_R_UINT8)); + } + + /** + * Return true if the format represents values as signed integers + * requiring sign extension when unpacking. + */ + inline bool + needs_sign_extension(mesa_format format) + { + return (_mesa_get_format_datatype(format) == GL_SIGNED_NORMALIZED || + _mesa_get_format_datatype(format) == GL_INT); + } + } + + namespace image_validity { + /** + * Check whether there is an image bound at the given index and write + * the comparison result to f0.0. Returns an appropriate predication + * mode to use on subsequent image operations. + */ + brw_predicate + emit_surface_check(const fs_builder &bld, const fs_reg &image) + { + const brw_device_info *devinfo = bld.shader->devinfo; + const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET); + + if (devinfo->gen == 7 && !devinfo->is_haswell) { + /* Check the first component of the size field to find out if the + * image is bound. Necessary on IVB for typed atomics because + * they don't seem to respect null surfaces and will happily + * corrupt or read random memory when no image is bound. + */ + bld.CMP(bld.null_reg_ud(), + retype(size, BRW_REGISTER_TYPE_UD), + fs_reg(0), BRW_CONDITIONAL_NZ); + + return BRW_PREDICATE_NORMAL; + } else { + /* More recent platforms implement compliant behavior when a null + * surface is bound. + */ + return BRW_PREDICATE_NONE; + } + } + + /** + * Check whether the provided coordinates are within the image bounds + * and write the comparison result to f0.0. Returns an appropriate + * predication mode to use on subsequent image operations. + */ + brw_predicate + emit_bounds_check(const fs_builder &bld, const fs_reg &image, + const fs_reg &addr, unsigned dims) + { + const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET); + + for (unsigned c = 0; c < dims; ++c) + set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL, + bld.CMP(bld.null_reg_ud(), + offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c), + offset(size, bld, c), + BRW_CONDITIONAL_L)); + + return BRW_PREDICATE_NORMAL; + } + } + + namespace image_coordinates { + /** + * Return the total number of coordinates needed to address a texel of + * the surface, which may be more than the sum of \p surf_dims and \p + * arr_dims if padding is required. + */ + unsigned + num_image_coordinates(const fs_builder &bld, + unsigned surf_dims, unsigned arr_dims, + mesa_format format) + { + /* HSW in vec4 mode and our software coordinate handling for untyped + * reads want the array index to be at the Z component. + */ + const bool array_index_at_z = + !image_format_info::has_matching_typed_format( + bld.shader->devinfo, format); + const unsigned zero_dims = + ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0); + + return surf_dims + zero_dims + arr_dims; + } + + /** + * Transform image coordinates into the form expected by the + * implementation. 
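[Annotation] emit_bounds_check() above builds one flag result out of an unsigned "coordinate < size" comparison per dimension, predicating every CMP after the first on the previous one so the conditions are effectively ANDed. The scalar meaning of the predicate it leaves in f0.0 (illustrative only):

/* True only if every coordinate lies inside the image; the driver then
 * predicates the untyped surface access on this so out-of-bounds texels are
 * never actually read or written. */
bool in_bounds(const unsigned *coord, const unsigned *size, unsigned dims)
{
   bool ok = true;                      /* first CMP runs unpredicated */
   for (unsigned c = 0; c < dims; ++c)
      ok = ok && (coord[c] < size[c]);  /* later CMPs predicated on 'ok' */
   return ok;
}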
+ */ + fs_reg + emit_image_coordinates(const fs_builder &bld, const fs_reg &addr, + unsigned surf_dims, unsigned arr_dims, + mesa_format format) + { + const unsigned dims = + num_image_coordinates(bld, surf_dims, arr_dims, format); + + if (dims > surf_dims + arr_dims) { + assert(surf_dims == 1 && arr_dims == 1 && dims == 3); + /* The array index is required to be passed in as the Z component, + * insert a zero at the Y component to shift it to the right + * position. + * + * FINISHME: Factor out this frequently recurring pattern into a + * helper function. + */ + const fs_reg srcs[] = { addr, fs_reg(0), offset(addr, bld, 1) }; + const fs_reg dst = bld.vgrf(addr.type, dims); + bld.LOAD_PAYLOAD(dst, srcs, dims, 0); + return dst; + } else { + return addr; + } + } + + /** + * Calculate the offset in memory of the texel given by \p coord. + * + * This is meant to be used with untyped surface messages to access a + * tiled surface, what involves taking into account the tiling and + * swizzling modes of the surface manually so it will hopefully not + * happen very often. + * + * The tiling algorithm implemented here matches either the X or Y + * tiling layouts supported by the hardware depending on the tiling + * coefficients passed to the program as uniforms. See Volume 1 Part 2 + * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth + * explanation of the hardware tiling format. + */ + fs_reg + emit_address_calculation(const fs_builder &bld, const fs_reg &image, + const fs_reg &coord, unsigned dims) + { + const brw_device_info *devinfo = bld.shader->devinfo; + const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET); + const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET); + const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET); + const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET); + const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD); + + /* Shift the coordinates by the fixed surface offset. It may be + * non-zero if the image is a single slice of a higher-dimensional + * surface, or if a non-zero mipmap level of the surface is bound to + * the pipeline. The offset needs to be applied here rather than at + * surface state set-up time because the desired slice-level may + * start mid-tile, so simply shifting the surface base address + * wouldn't give a well-formed tiled surface in the general case. + */ + for (unsigned c = 0; c < 2; ++c) + bld.ADD(offset(addr, bld, c), offset(off, bld, c), + (c < dims ? + offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) : + fs_reg(0))); + + /* The layout of 3-D textures in memory is sort-of like a tiling + * format. At each miplevel, the slices are arranged in rows of + * 2^level slices per row. The slice row is stored in tmp.y and + * the slice within the row is stored in tmp.x. + * + * The layout of 2-D array textures and cubemaps is much simpler: + * Depending on whether the ARYSPC_LOD0 layout is in use it will be + * stored in memory as an array of slices, each one being a 2-D + * arrangement of miplevels, or as a 2D arrangement of miplevels, + * each one being an array of slices. In either case the separation + * between slices of the same LOD is equal to the qpitch value + * provided as stride.w. 
+ * + * This code can be made to handle either 2D arrays and 3D textures + * by passing in the miplevel as tile.z for 3-D textures and 0 in + * tile.z for 2-D array textures. + * + * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface + * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion + * of the hardware 3D texture and 2D array layouts. + */ + if (dims > 2) { + /* Decompose z into a major (tmp.y) and a minor (tmp.x) + * index. + */ + bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), fs_reg(0), + offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2)); + bld.SHR(offset(tmp, bld, 1), + offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2), + offset(tile, bld, 2)); + + /* Take into account the horizontal (tmp.x) and vertical (tmp.y) + * slice offset. + */ + for (unsigned c = 0; c < 2; ++c) { + bld.MUL(offset(tmp, bld, c), + offset(stride, bld, 2 + c), offset(tmp, bld, c)); + bld.ADD(offset(addr, bld, c), + offset(addr, bld, c), offset(tmp, bld, c)); + } + } + + if (dims > 1) { + /* Calculate the major/minor x and y indices. In order to + * accommodate both X and Y tiling, the Y-major tiling format is + * treated as being a bunch of narrow X-tiles placed next to each + * other. This means that the tile width for Y-tiling is actually + * the width of one sub-column of the Y-major tile where each 4K + * tile has 8 512B sub-columns. + * + * The major Y value is the row of tiles in which the pixel lives. + * The major X value is the tile sub-column in which the pixel + * lives; for X tiling, this is the same as the tile column, for Y + * tiling, each tile has 8 sub-columns. The minor X and Y indices + * are the position within the sub-column. + */ + for (unsigned c = 0; c < 2; ++c) { + /* Calculate the minor x and y indices. */ + bld.BFE(offset(minor, bld, c), offset(tile, bld, c), + fs_reg(0), offset(addr, bld, c)); + + /* Calculate the major x and y indices. */ + bld.SHR(offset(major, bld, c), + offset(addr, bld, c), offset(tile, bld, c)); + } + + /* Calculate the texel index from the start of the tile row and + * the vertical coordinate of the row. + * Equivalent to: + * tmp.x = (major.x << tile.y << tile.x) + + * (minor.y << tile.x) + minor.x + * tmp.y = major.y << tile.y + */ + bld.SHL(tmp, major, offset(tile, bld, 1)); + bld.ADD(tmp, tmp, offset(minor, bld, 1)); + bld.SHL(tmp, tmp, offset(tile, bld, 0)); + bld.ADD(tmp, tmp, minor); + bld.SHL(offset(tmp, bld, 1), + offset(major, bld, 1), offset(tile, bld, 1)); + + /* Add it to the start of the tile row. */ + bld.MUL(offset(tmp, bld, 1), + offset(tmp, bld, 1), offset(stride, bld, 1)); + bld.ADD(tmp, tmp, offset(tmp, bld, 1)); + + /* Multiply by the Bpp value. */ + bld.MUL(dst, tmp, stride); + + if (devinfo->gen < 8 && !devinfo->is_baytrail) { + /* Take into account the two dynamically specified shifts. + * Both need are used to implement swizzling of X-tiled + * surfaces. For Y-tiled surfaces only one bit needs to be + * XOR-ed with bit 6 of the memory address, so a swz value of + * 0xff (actually interpreted as 31 by the hardware) will be + * provided to cause the relevant bit of tmp.y to be zero and + * turn the first XOR into the identity. For linear surfaces + * or platforms lacking address swizzling both shifts will be + * 0xff causing the relevant bits of both tmp.x and .y to be + * zero, what effectively disables swizzling. + */ + for (unsigned c = 0; c < 2; ++c) + bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c)); + + /* XOR tmp.x and tmp.y with bit 6 of the memory address. 
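[Annotation] The tiling arithmetic above splits each coordinate into a minor part (position inside a tile sub-column, via BFE) and a major part (the sub-column or tile-row index, via SHR), then linearizes them exactly as the in-code comment states: tmp.x = (major.x << tile.y << tile.x) + (minor.y << tile.x) + minor.x and tmp.y = major.y << tile.y. A scalar model of that computation, assuming tile_x/tile_y are the log2 sub-column width and tile height and pitch/cpp stand in for the stride uniforms:

#include <cstdint>

/* Hypothetical standalone helper, not the driver code: byte offset of texel
 * (x, y) in an X-tiled-style layout.  tile_x/tile_y are log2 of the tile
 * sub-column width and tile height; pitch and cpp model stride.y/stride.x. */
uint32_t tiled_texel_offset(uint32_t x, uint32_t y,
                            unsigned tile_x, unsigned tile_y,
                            uint32_t pitch, uint32_t cpp)
{
   const uint32_t minor_x = x & ((1u << tile_x) - 1);   /* BFE(tile.x, 0, x) */
   const uint32_t minor_y = y & ((1u << tile_y) - 1);   /* BFE(tile.y, 0, y) */
   const uint32_t major_x = x >> tile_x;                /* SHR(x, tile.x)    */
   const uint32_t major_y = y >> tile_y;                /* SHR(y, tile.y)    */

   /* tmp.x = (major.x << tile.y << tile.x) + (minor.y << tile.x) + minor.x */
   const uint32_t in_row = (((major_x << tile_y) + minor_y) << tile_x) + minor_x;
   /* tmp.y = major.y << tile.y, then scaled by the row pitch */
   const uint32_t row = (major_y << tile_y) * pitch;

   return (in_row + row) * cpp;        /* final multiplies by the Bpp value */
}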
*/ + bld.XOR(tmp, tmp, offset(tmp, bld, 1)); + bld.AND(tmp, tmp, fs_reg(1 << 6)); + bld.XOR(dst, dst, tmp); + } + + } else { + /* Multiply by the Bpp/stride value. Note that the addr.y may be + * non-zero even if the image is one-dimensional because a + * vertical offset may have been applied above to select a + * non-zero slice or level of a higher-dimensional texture. + */ + bld.MUL(offset(addr, bld, 1), + offset(addr, bld, 1), offset(stride, bld, 1)); + bld.ADD(addr, addr, offset(addr, bld, 1)); + bld.MUL(dst, addr, stride); + } + + return dst; + } + } + + namespace image_format_conversion { + using image_format_info::color_u; + + namespace { + /** + * Maximum representable value in an unsigned integer with the given + * number of bits. + */ + inline unsigned + scale(unsigned n) + { + return (1 << n) - 1; + } + } + + /** + * Pack the vector \p src in a bitfield given the per-component bit + * shifts and widths. Note that bitfield components are not allowed to + * cross 32-bit boundaries. + */ + fs_reg + emit_pack(const fs_builder &bld, const fs_reg &src, + const color_u &shifts, const color_u &widths) + { + const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); + bool seen[4] = {}; + + for (unsigned c = 0; c < 4; ++c) { + if (widths[c]) { + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); + + /* Shift each component left to the correct bitfield position. */ + bld.SHL(tmp, offset(src, bld, c), fs_reg(shifts[c] % 32)); + + /* Add everything up. */ + if (seen[shifts[c] / 32]) { + bld.OR(offset(dst, bld, shifts[c] / 32), + offset(dst, bld, shifts[c] / 32), tmp); + } else { + bld.MOV(offset(dst, bld, shifts[c] / 32), tmp); + seen[shifts[c] / 32] = true; + } + } + } + + return dst; + } + + /** + * Unpack a vector from the bitfield \p src given the per-component bit + * shifts and widths. Note that bitfield components are not allowed to + * cross 32-bit boundaries. + */ + fs_reg + emit_unpack(const fs_builder &bld, const fs_reg &src, + const color_u &shifts, const color_u &widths) + { + const fs_reg dst = bld.vgrf(src.type, 4); + + for (unsigned c = 0; c < 4; ++c) { + if (widths[c]) { + /* Shift left to discard the most significant bits. */ + bld.SHL(offset(dst, bld, c), + offset(src, bld, shifts[c] / 32), + fs_reg(32 - shifts[c] % 32 - widths[c])); + + /* Shift back to the least significant bits using an arithmetic + * shift to get sign extension on signed types. + */ + bld.ASR(offset(dst, bld, c), + offset(dst, bld, c), fs_reg(32 - widths[c])); + } + } + + return dst; + } + + /** + * Convert an integer vector into another integer vector of the + * specified bit widths, properly handling overflow. + */ + fs_reg + emit_convert_to_integer(const fs_builder &bld, const fs_reg &src, + const color_u &widths, bool is_signed) + { + const unsigned s = (is_signed ? 1 : 0); + const fs_reg dst = bld.vgrf( + is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4); + assert(src.type == dst.type); + + for (unsigned c = 0; c < 4; ++c) { + if (widths[c]) { + /* Clamp to the maximum value. */ + bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c), + fs_reg((int)scale(widths[c] - s)), + BRW_CONDITIONAL_L); + + /* Clamp to the minimum value. */ + if (is_signed) + bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c), + fs_reg(-(int)scale(widths[c] - s) - 1), + BRW_CONDITIONAL_G); + } + } + + return dst; + } + + /** + * Convert a normalized fixed-point vector of the specified signedness + * and bit widths into a floating point vector. 
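[Annotation] emit_unpack() above recovers each component by shifting it up to the top of a 32-bit word and arithmetic-shifting it back down, which gives sign extension for free on signed formats; emit_pack() relies on the values already being clamped to their widths and simply shifts and ORs. The same per-component operations on scalars (illustrative; assumes a component never crosses a 32-bit boundary, as the comments above require):

#include <cstdint>

/* SHL up to the top of the word, then ASR back down: extracts 'width' bits
 * starting at 'shift' with sign extension, as emit_unpack() does per channel. */
int32_t unpack_component(uint32_t word, unsigned shift, unsigned width)
{
   const int32_t up = (int32_t)(word << (32u - shift - width));
   return up >> (32u - width);          /* arithmetic shift sign-extends */
}

/* Shift into position and OR, as emit_pack() does; 'value' is assumed to be
 * clamped to 'width' bits already. */
uint32_t pack_component(uint32_t packed, uint32_t value, unsigned shift)
{
   return packed | (value << shift);
}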
+ */ + fs_reg + emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src, + const color_u &widths, bool is_signed) + { + const unsigned s = (is_signed ? 1 : 0); + const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4); + + for (unsigned c = 0; c < 4; ++c) { + if (widths[c]) { + /* Convert to float. */ + bld.MOV(offset(dst, bld, c), offset(src, bld, c)); + + /* Divide by the normalization constants. */ + bld.MUL(offset(dst, bld, c), offset(dst, bld, c), + fs_reg(1.0f / scale(widths[c] - s))); + + /* Clamp to the minimum value. */ + if (is_signed) + bld.emit_minmax(offset(dst, bld, c), + offset(dst, bld, c), fs_reg(-1.0f), + BRW_CONDITIONAL_G); + } + } + return dst; + } + + /** + * Convert a floating-point vector into a normalized fixed-point vector + * of the specified signedness and bit widths. + */ + fs_reg + emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src, + const color_u &widths, bool is_signed) + { + const unsigned s = (is_signed ? 1 : 0); + const fs_reg dst = bld.vgrf( + is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4); + const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F); + + for (unsigned c = 0; c < 4; ++c) { + if (widths[c]) { + /* Clamp the normalized floating-point argument. */ + if (is_signed) { + bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c), + fs_reg(-1.0f), BRW_CONDITIONAL_G); + + bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c), + fs_reg(1.0f), BRW_CONDITIONAL_L); + } else { + set_saturate(true, bld.MOV(offset(fdst, bld, c), + offset(src, bld, c))); + } + + /* Multiply by the normalization constants. */ + bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c), + fs_reg((float)scale(widths[c] - s))); + + /* Convert to integer. */ + bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c)); + bld.MOV(offset(dst, bld, c), offset(fdst, bld, c)); + } + } + + return dst; + } + + /** + * Convert a floating point vector of the specified bit widths into a + * 32-bit floating point vector. + */ + fs_reg + emit_convert_from_float(const fs_builder &bld, const fs_reg &src, + const color_u &widths) + { + const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); + const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F); + + for (unsigned c = 0; c < 4; ++c) { + if (widths[c]) { + bld.MOV(offset(dst, bld, c), offset(src, bld, c)); + + /* Extend 10-bit and 11-bit floating point numbers to 15 bits. + * This works because they have a 5-bit exponent just like the + * 16-bit floating point format, and they have no sign bit. + */ + if (widths[c] < 16) + bld.SHL(offset(dst, bld, c), + offset(dst, bld, c), fs_reg(15 - widths[c])); + + /* Convert to 32-bit floating point. */ + bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c)); + } + } + + return fdst; + } + + /** + * Convert a vector into a floating point vector of the specified bit + * widths. + */ + fs_reg + emit_convert_to_float(const fs_builder &bld, const fs_reg &src, + const color_u &widths) + { + const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); + const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F); + + for (unsigned c = 0; c < 4; ++c) { + if (widths[c]) { + bld.MOV(offset(fdst, bld, c), offset(src, bld, c)); + + /* Clamp to the minimum value. */ + if (widths[c] < 16) + bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c), + fs_reg(0.0f), BRW_CONDITIONAL_G); + + /* Convert to 16-bit floating-point. */ + bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c)); + + /* Discard the least significant bits to get floating point + * numbers of the requested width. 
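[Annotation] The *_scaled conversions above follow the usual UNORM rules: divide by 2^n - 1 when unpacking, and clamp, scale and round-to-nearest-even (RNDE) when packing. A scalar version for one unsigned normalized component (illustrative; the signed case additionally clamps at -1.0 as in the code above):

#include <algorithm>
#include <cmath>
#include <cstdint>

/* Largest value of an n-bit unsigned field, matching scale() above. */
static inline float unorm_scale(unsigned n) { return (float)((1u << n) - 1u); }

/* emit_convert_from_scaled(): integer -> float in [0, 1]. */
float unorm_to_float(uint32_t v, unsigned bits)
{
   return (float)v * (1.0f / unorm_scale(bits));
}

/* emit_convert_to_scaled(): saturate (the saturating MOV), scale, RNDE. */
uint32_t float_to_unorm(float f, unsigned bits)
{
   const float clamped = std::min(std::max(f, 0.0f), 1.0f);
   return (uint32_t)std::nearbyint(clamped * unorm_scale(bits));
}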
This works because the + * 10-bit and 11-bit floating point formats have a 5-bit + * exponent just like the 16-bit format, and they have no sign + * bit. + */ + if (widths[c] < 16) + bld.SHR(offset(dst, bld, c), offset(dst, bld, c), + fs_reg(15 - widths[c])); + } + } + + return dst; + } + + /** + * Fill missing components of a vector with 0, 0, 0, 1. + */ + fs_reg + emit_pad(const fs_builder &bld, const fs_reg &src, + const color_u &widths) + { + const fs_reg dst = bld.vgrf(src.type, 4); + const unsigned pad[] = { 0, 0, 0, 1 }; + + for (unsigned c = 0; c < 4; ++c) + bld.MOV(offset(dst, bld, c), + widths[c] ? offset(src, bld, c) : fs_reg(pad[c])); + + return dst; + } + } +} + +namespace brw { + namespace image_access { + /** + * Load a vector from a surface of the given format and dimensionality + * at the given coordinates. \p surf_dims and \p arr_dims give the + * number of non-array and array coordinates of the image respectively. + */ + fs_reg + emit_image_load(const fs_builder &bld, + const fs_reg &image, const fs_reg &addr, + unsigned surf_dims, unsigned arr_dims, + mesa_format format) + { + using namespace image_format_info; + using namespace image_format_conversion; + using namespace image_validity; + using namespace image_coordinates; + using namespace surface_access; + const brw_device_info *devinfo = bld.shader->devinfo; + const mesa_format lower_format = + brw_lower_mesa_image_format(devinfo, format); + fs_reg tmp; + + /* Transform the image coordinates into actual surface coordinates. */ + const fs_reg saddr = + emit_image_coordinates(bld, addr, surf_dims, arr_dims, format); + const unsigned dims = + num_image_coordinates(bld, surf_dims, arr_dims, format); + + if (has_matching_typed_format(devinfo, format)) { + /* Hopefully we get here most of the time... */ + tmp = emit_typed_read(bld, image, saddr, dims, + _mesa_format_num_components(lower_format)); + } else { + /* Untyped surface reads return 32 bits of the surface per + * component, without any sort of unpacking or type conversion, + */ + const unsigned size = _mesa_get_format_bytes(format) / 4; + + /* they don't properly handle out of bounds access, so we have to + * check manually if the coordinates are valid and predicate the + * surface read on the result, + */ + const brw_predicate pred = + emit_bounds_check(bld, image, saddr, dims); + + /* and they don't know about surface coordinates, we need to + * convert them to a raw memory offset. + */ + const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims); + + tmp = emit_untyped_read(bld, image, laddr, 1, size, pred); + + /* An out of bounds surface access should give zero as result. */ + for (unsigned c = 0; c < 4; ++c) + set_predicate(pred, bld.SEL(offset(tmp, bld, c), + offset(tmp, bld, c), fs_reg(0))); + } + + /* Set the register type to D instead of UD if the data type is + * represented as a signed integer in memory so that sign extension + * is handled correctly by unpack. + */ + if (needs_sign_extension(format)) + tmp = retype(tmp, BRW_REGISTER_TYPE_D); + + if (!has_supported_bit_layout(devinfo, format)) { + /* Unpack individual vector components from the bitfield if the + * hardware is unable to do it for us. 
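[Annotation] Two small rules round off the untyped fallback in emit_image_load() above: an out-of-bounds access must return zero (the predicated SEL), and channels the format does not store are filled with the usual (0, 0, 0, 1) defaults by emit_pad(). Per channel, that amounts to (illustrative only):

/* Scalar summary of the post-read fixups in emit_image_load() above. */
float load_channel(bool in_bounds, bool channel_present,
                   float fetched, unsigned channel)
{
   static const float pad[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
   if (!channel_present)
      return pad[channel];              /* emit_pad(): missing channel default */
   return in_bounds ? fetched : 0.0f;   /* predicated SEL: OOB reads give 0   */
}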
+ */ + if (has_split_bit_layout(devinfo, format)) + tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format), + get_bit_widths(lower_format)); + else + tmp = emit_unpack(bld, tmp, get_bit_shifts(format), + get_bit_widths(format)); + + } else if ((needs_sign_extension(format) && + !is_conversion_trivial(devinfo, format)) || + has_undefined_high_bits(devinfo, format)) { + /* Perform a trivial unpack even though the bit layout matches in + * order to get the most significant bits of each component + * initialized properly. + */ + tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96), + get_bit_widths(format)); + } + + if (!_mesa_is_format_integer(format)) { + if (is_conversion_trivial(devinfo, format)) { + /* Just need to cast the vector to the target type. */ + tmp = retype(tmp, BRW_REGISTER_TYPE_F); + } else { + /* Do the right sort of type conversion to float. */ + if (_mesa_get_format_datatype(format) == GL_FLOAT) + tmp = emit_convert_from_float( + bld, tmp, get_bit_widths(format)); + else + tmp = emit_convert_from_scaled( + bld, tmp, get_bit_widths(format), + _mesa_is_format_signed(format)); + } + } + + /* Initialize missing components of the result. */ + return emit_pad(bld, tmp, get_bit_widths(format)); + } + + /** + * Store a vector in a surface of the given format and dimensionality at + * the given coordinates. \p surf_dims and \p arr_dims give the number + * of non-array and array coordinates of the image respectively. + */ + void + emit_image_store(const fs_builder &bld, const fs_reg &image, + const fs_reg &addr, const fs_reg &src, + unsigned surf_dims, unsigned arr_dims, + mesa_format format) + { + using namespace image_format_info; + using namespace image_format_conversion; + using namespace image_validity; + using namespace image_coordinates; + using namespace surface_access; + const brw_device_info *devinfo = bld.shader->devinfo; + + /* Transform the image coordinates into actual surface coordinates. */ + const fs_reg saddr = + emit_image_coordinates(bld, addr, surf_dims, arr_dims, format); + const unsigned dims = + num_image_coordinates(bld, surf_dims, arr_dims, format); + + if (format == MESA_FORMAT_NONE) { + /* We don't know what the format is, but that's fine because it + * implies write-only access, and typed surface writes are always + * able to take care of type conversion and packing for us. + */ + emit_typed_write(bld, image, saddr, src, dims, 4); + + } else { + const mesa_format lower_format = + brw_lower_mesa_image_format(devinfo, format); + fs_reg tmp = src; + + if (!is_conversion_trivial(devinfo, format)) { + /* Do the right sort of type conversion. */ + if (_mesa_get_format_datatype(format) == GL_FLOAT) + tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format)); + + else if (_mesa_is_format_integer(format)) + tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format), + _mesa_is_format_signed(format)); + + else + tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format), + _mesa_is_format_signed(format)); + } + + /* We're down to bit manipulation at this point. */ + tmp = retype(tmp, BRW_REGISTER_TYPE_UD); + + if (!has_supported_bit_layout(devinfo, format)) { + /* Pack the vector components into a bitfield if the hardware + * is unable to do it for us. 
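[Annotation] Before packing, emit_image_store() above picks its conversion by the format's datatype: float data goes through the small-float path, integer data is clamped to the destination width, and everything else is treated as normalized fixed-point; trivial formats skip conversion entirely. A sketch of that dispatch with hypothetical stand-in enums (the real code queries _mesa_get_format_datatype and friends):

/* Hypothetical stand-ins for the Mesa format queries used above. */
enum class datatype { floating, integer, normalized };
enum class conversion { none, to_float, to_integer, to_scaled };

/* Mirrors the conversion choice in emit_image_store() above. */
conversion store_conversion(bool conversion_trivial, datatype t)
{
   if (conversion_trivial)
      return conversion::none;
   switch (t) {
   case datatype::floating: return conversion::to_float;
   case datatype::integer:  return conversion::to_integer;
   default:                 return conversion::to_scaled;
   }
}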
+ */ + if (has_split_bit_layout(devinfo, format)) + tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format), + get_bit_widths(lower_format)); + + else + tmp = emit_pack(bld, tmp, get_bit_shifts(format), + get_bit_widths(format)); + } + + if (has_matching_typed_format(devinfo, format)) { + /* Hopefully we get here most of the time... */ + emit_typed_write(bld, image, saddr, tmp, dims, + _mesa_format_num_components(lower_format)); + + } else { + /* Untyped surface writes store 32 bits of the surface per + * component, without any sort of packing or type conversion, + */ + const unsigned size = _mesa_get_format_bytes(format) / 4; + + /* they don't properly handle out of bounds access, so we have + * to check manually if the coordinates are valid and predicate + * the surface write on the result, + */ + const brw_predicate pred = + emit_bounds_check(bld, image, saddr, dims); + + /* and, phew, they don't know about surface coordinates, we + * need to convert them to a raw memory offset. + */ + const fs_reg laddr = emit_address_calculation( + bld, image, saddr, dims); + + emit_untyped_write(bld, image, laddr, tmp, 1, size, pred); + } + } + } + + /** + * Perform an atomic read-modify-write operation in a surface of the + * given dimensionality at the given coordinates. \p surf_dims and \p + * arr_dims give the number of non-array and array coordinates of the + * image respectively. Main building block of the imageAtomic GLSL + * built-ins. + */ + fs_reg + emit_image_atomic(const fs_builder &bld, + const fs_reg &image, const fs_reg &addr, + const fs_reg &src0, const fs_reg &src1, + unsigned surf_dims, unsigned arr_dims, + unsigned rsize, unsigned op) + { + using namespace image_validity; + using namespace image_coordinates; + using namespace surface_access; + /* Avoid performing an atomic operation on an unbound surface. */ + const brw_predicate pred = emit_surface_check(bld, image); + + /* Transform the image coordinates into actual surface coordinates. */ + const fs_reg saddr = + emit_image_coordinates(bld, addr, surf_dims, arr_dims, + MESA_FORMAT_R_UINT32); + const unsigned dims = + num_image_coordinates(bld, surf_dims, arr_dims, + MESA_FORMAT_R_UINT32); + + /* Thankfully we can do without untyped atomics here. */ + const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1, + dims, rsize, op, pred); + + /* An unbound surface access should give zero as result. */ + if (rsize) + set_predicate(pred, bld.SEL(tmp, tmp, fs_reg(0))); + + return tmp; + } + } +} diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.h b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.h new file mode 100644 index 00000000000..a3dd839955b --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.h @@ -0,0 +1,89 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2013-2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_FS_SURFACE_BUILDER_H +#define BRW_FS_SURFACE_BUILDER_H + +#include "brw_fs_builder.h" +#include "brw_context.h" + +namespace brw { + namespace surface_access { + fs_reg + emit_untyped_read(const fs_builder &bld, + const fs_reg &surface, const fs_reg &addr, + unsigned dims, unsigned size, + brw_predicate pred = BRW_PREDICATE_NONE); + + void + emit_untyped_write(const fs_builder &bld, const fs_reg &surface, + const fs_reg &addr, const fs_reg &src, + unsigned dims, unsigned size, + brw_predicate pred = BRW_PREDICATE_NONE); + + fs_reg + emit_untyped_atomic(const fs_builder &bld, + const fs_reg &surface, const fs_reg &addr, + const fs_reg &src0, const fs_reg &src1, + unsigned dims, unsigned rsize, unsigned op, + brw_predicate pred = BRW_PREDICATE_NONE); + + fs_reg + emit_typed_read(const fs_builder &bld, const fs_reg &surface, + const fs_reg &addr, unsigned dims, unsigned size); + + void + emit_typed_write(const fs_builder &bld, const fs_reg &surface, + const fs_reg &addr, const fs_reg &src, + unsigned dims, unsigned size); + + fs_reg + emit_typed_atomic(const fs_builder &bld, const fs_reg &surface, + const fs_reg &addr, + const fs_reg &src0, const fs_reg &src1, + unsigned dims, unsigned rsize, unsigned op, + brw_predicate pred = BRW_PREDICATE_NONE); + } + + namespace image_access { + fs_reg + emit_image_load(const fs_builder &bld, + const fs_reg &image, const fs_reg &addr, + unsigned surf_dims, unsigned arr_dims, + mesa_format format); + + void + emit_image_store(const fs_builder &bld, const fs_reg &image, + const fs_reg &addr, const fs_reg &src, + unsigned surf_dims, unsigned arr_dims, + mesa_format format); + fs_reg + emit_image_atomic(const fs_builder &bld, + const fs_reg &image, const fs_reg &addr, + const fs_reg &src0, const fs_reg &src1, + unsigned surf_dims, unsigned arr_dims, + unsigned rsize, unsigned op); + } +} +#endif diff --git a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp b/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp index 01d3a569858..96d4f375da2 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp @@ -173,7 +173,7 @@ ir_vector_reference_visitor::visit_enter(ir_assignment *ir) return visit_continue_with_parent; } if (ir->lhs->as_dereference_variable() && - is_power_of_two(ir->write_mask) && + _mesa_is_pow_two(ir->write_mask) && !ir->condition) { /* If we're writing just a channel, then channel-splitting the LHS is OK. 
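[Annotation] The vector-splitting hunk above only swaps the local is_power_of_two() helper for the shared _mesa_is_pow_two(); the condition still means "this assignment writes exactly one channel", i.e. the write mask has a single bit set. The classic bit trick behind such a check:

#include <cstdint>

/* A write mask names exactly one channel when it is a power of two,
 * i.e. it has a single bit set. */
bool writes_single_channel(uint32_t write_mask)
{
   return write_mask != 0 && (write_mask & (write_mask - 1)) == 0;
}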
*/ diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 9a4bad6bcf5..111db8c4323 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -77,612 +77,6 @@ fs_visitor::emit_vs_system_value(int location) return reg; } -fs_inst * -fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst, - fs_reg coordinate, int coord_components, - fs_reg shadow_c, - fs_reg lod, fs_reg dPdy, int grad_components, - uint32_t sampler) -{ - int mlen; - int base_mrf = 1; - bool simd16 = false; - fs_reg orig_dst; - - /* g0 header. */ - mlen = 1; - - if (shadow_c.file != BAD_FILE) { - for (int i = 0; i < coord_components; i++) { - bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate); - coordinate = offset(coordinate, 1); - } - - /* gen4's SIMD8 sampler always has the slots for u,v,r present. - * the unused slots must be zeroed. - */ - for (int i = coord_components; i < 3; i++) { - bld.MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)); - } - mlen += 3; - - if (op == ir_tex) { - /* There's no plain shadow compare message, so we use shadow - * compare with a bias of 0.0. - */ - bld.MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)); - mlen++; - } else if (op == ir_txb || op == ir_txl) { - bld.MOV(fs_reg(MRF, base_mrf + mlen), lod); - mlen++; - } else { - unreachable("Should not get here."); - } - - bld.MOV(fs_reg(MRF, base_mrf + mlen), shadow_c); - mlen++; - } else if (op == ir_tex) { - for (int i = 0; i < coord_components; i++) { - bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate); - coordinate = offset(coordinate, 1); - } - /* zero the others. */ - for (int i = coord_components; i<3; i++) { - bld.MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)); - } - /* gen4's SIMD8 sampler always has the slots for u,v,r present. */ - mlen += 3; - } else if (op == ir_txd) { - fs_reg &dPdx = lod; - - for (int i = 0; i < coord_components; i++) { - bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate); - coordinate = offset(coordinate, 1); - } - /* the slots for u and v are always present, but r is optional */ - mlen += MAX2(coord_components, 2); - - /* P = u, v, r - * dPdx = dudx, dvdx, drdx - * dPdy = dudy, dvdy, drdy - * - * 1-arg: Does not exist. - * - * 2-arg: dudx dvdx dudy dvdy - * dPdx.x dPdx.y dPdy.x dPdy.y - * m4 m5 m6 m7 - * - * 3-arg: dudx dvdx drdx dudy dvdy drdy - * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z - * m5 m6 m7 m8 m9 m10 - */ - for (int i = 0; i < grad_components; i++) { - bld.MOV(fs_reg(MRF, base_mrf + mlen), dPdx); - dPdx = offset(dPdx, 1); - } - mlen += MAX2(grad_components, 2); - - for (int i = 0; i < grad_components; i++) { - bld.MOV(fs_reg(MRF, base_mrf + mlen), dPdy); - dPdy = offset(dPdy, 1); - } - mlen += MAX2(grad_components, 2); - } else if (op == ir_txs) { - /* There's no SIMD8 resinfo message on Gen4. Use SIMD16 instead. */ - simd16 = true; - bld.MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod); - mlen += 2; - } else { - /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod - * instructions. We'll need to do SIMD16 here. - */ - simd16 = true; - assert(op == ir_txb || op == ir_txl || op == ir_txf); - - for (int i = 0; i < coord_components; i++) { - bld.MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type), - coordinate); - coordinate = offset(coordinate, 1); - } - - /* Initialize the rest of u/v/r with 0.0. Empirically, this seems to - * be necessary for TXF (ld), but seems wise to do for all messages. 
- */ - for (int i = coord_components; i < 3; i++) { - bld.MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)); - } - - /* lod/bias appears after u/v/r. */ - mlen += 6; - - bld.MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod); - mlen++; - - /* The unused upper half. */ - mlen++; - } - - if (simd16) { - /* Now, since we're doing simd16, the return is 2 interleaved - * vec4s where the odd-indexed ones are junk. We'll need to move - * this weirdness around to the expected layout. - */ - orig_dst = dst; - dst = fs_reg(GRF, alloc.allocate(8), orig_dst.type); - } - - enum opcode opcode; - switch (op) { - case ir_tex: opcode = SHADER_OPCODE_TEX; break; - case ir_txb: opcode = FS_OPCODE_TXB; break; - case ir_txl: opcode = SHADER_OPCODE_TXL; break; - case ir_txd: opcode = SHADER_OPCODE_TXD; break; - case ir_txs: opcode = SHADER_OPCODE_TXS; break; - case ir_txf: opcode = SHADER_OPCODE_TXF; break; - default: - unreachable("not reached"); - } - - fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler)); - inst->base_mrf = base_mrf; - inst->mlen = mlen; - inst->header_size = 1; - inst->regs_written = simd16 ? 8 : 4; - - if (simd16) { - for (int i = 0; i < 4; i++) { - bld.MOV(orig_dst, dst); - orig_dst = offset(orig_dst, 1); - dst = offset(dst, 2); - } - } - - return inst; -} - -fs_inst * -fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst, - fs_reg coordinate, int vector_elements, - fs_reg shadow_c, fs_reg lod, - uint32_t sampler) -{ - fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width); - bool has_lod = op == ir_txl || op == ir_txb || op == ir_txf; - - if (has_lod && shadow_c.file != BAD_FILE) - no16("TXB and TXL with shadow comparison unsupported in SIMD16."); - - if (op == ir_txd) - no16("textureGrad unsupported in SIMD16."); - - /* Copy the coordinates. */ - for (int i = 0; i < vector_elements; i++) { - bld.MOV(retype(offset(message, i), coordinate.type), coordinate); - coordinate = offset(coordinate, 1); - } - - fs_reg msg_end = offset(message, vector_elements); - - /* Messages other than sample and ld require all three components */ - if (has_lod || shadow_c.file != BAD_FILE) { - for (int i = vector_elements; i < 3; i++) { - bld.MOV(offset(message, i), fs_reg(0.0f)); - } - } - - if (has_lod) { - fs_reg msg_lod = retype(offset(message, 3), op == ir_txf ? - BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F); - bld.MOV(msg_lod, lod); - msg_end = offset(msg_lod, 1); - } - - if (shadow_c.file != BAD_FILE) { - fs_reg msg_ref = offset(message, 3 + has_lod); - bld.MOV(msg_ref, shadow_c); - msg_end = offset(msg_ref, 1); - } - - enum opcode opcode; - switch (op) { - case ir_tex: opcode = SHADER_OPCODE_TEX; break; - case ir_txb: opcode = FS_OPCODE_TXB; break; - case ir_txd: opcode = SHADER_OPCODE_TXD; break; - case ir_txl: opcode = SHADER_OPCODE_TXL; break; - case ir_txs: opcode = SHADER_OPCODE_TXS; break; - case ir_txf: opcode = SHADER_OPCODE_TXF; break; - default: unreachable("not reached"); - } - - fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler)); - inst->base_mrf = message.reg - 1; - inst->mlen = msg_end.reg - inst->base_mrf; - inst->header_size = 1; - inst->regs_written = 8; - - return inst; -} - -/* gen5's sampler has slots for u, v, r, array index, then optional - * parameters like shadow comparitor or LOD bias. If optional - * parameters aren't present, those base slots are optional and don't - * need to be included in the message. - * - * We don't fill in the unnecessary slots regardless, which may look - * surprising in the disassembly. 
- */ -fs_inst * -fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst, - fs_reg coordinate, int vector_elements, - fs_reg shadow_c, - fs_reg lod, fs_reg lod2, int grad_components, - fs_reg sample_index, uint32_t sampler, - bool has_offset) -{ - int reg_width = dispatch_width / 8; - unsigned header_size = 0; - - fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width); - fs_reg msg_coords = message; - - if (has_offset) { - /* The offsets set up by the ir_texture visitor are in the - * m1 header, so we can't go headerless. - */ - header_size = 1; - message.reg--; - } - - for (int i = 0; i < vector_elements; i++) { - bld.MOV(retype(offset(msg_coords, i), coordinate.type), coordinate); - coordinate = offset(coordinate, 1); - } - fs_reg msg_end = offset(msg_coords, vector_elements); - fs_reg msg_lod = offset(msg_coords, 4); - - if (shadow_c.file != BAD_FILE) { - fs_reg msg_shadow = msg_lod; - bld.MOV(msg_shadow, shadow_c); - msg_lod = offset(msg_shadow, 1); - msg_end = msg_lod; - } - - enum opcode opcode; - switch (op) { - case ir_tex: - opcode = SHADER_OPCODE_TEX; - break; - case ir_txb: - bld.MOV(msg_lod, lod); - msg_end = offset(msg_lod, 1); - - opcode = FS_OPCODE_TXB; - break; - case ir_txl: - bld.MOV(msg_lod, lod); - msg_end = offset(msg_lod, 1); - - opcode = SHADER_OPCODE_TXL; - break; - case ir_txd: { - /** - * P = u, v, r - * dPdx = dudx, dvdx, drdx - * dPdy = dudy, dvdy, drdy - * - * Load up these values: - * - dudx dudy dvdx dvdy drdx drdy - * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z - */ - msg_end = msg_lod; - for (int i = 0; i < grad_components; i++) { - bld.MOV(msg_end, lod); - lod = offset(lod, 1); - msg_end = offset(msg_end, 1); - - bld.MOV(msg_end, lod2); - lod2 = offset(lod2, 1); - msg_end = offset(msg_end, 1); - } - - opcode = SHADER_OPCODE_TXD; - break; - } - case ir_txs: - msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD); - bld.MOV(msg_lod, lod); - msg_end = offset(msg_lod, 1); - - opcode = SHADER_OPCODE_TXS; - break; - case ir_query_levels: - msg_lod = msg_end; - bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)); - msg_end = offset(msg_lod, 1); - - opcode = SHADER_OPCODE_TXS; - break; - case ir_txf: - msg_lod = offset(msg_coords, 3); - bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod); - msg_end = offset(msg_lod, 1); - - opcode = SHADER_OPCODE_TXF; - break; - case ir_txf_ms: - msg_lod = offset(msg_coords, 3); - /* lod */ - bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)); - /* sample index */ - bld.MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index); - msg_end = offset(msg_lod, 2); - - opcode = SHADER_OPCODE_TXF_CMS; - break; - case ir_lod: - opcode = SHADER_OPCODE_LOD; - break; - case ir_tg4: - opcode = SHADER_OPCODE_TG4; - break; - default: - unreachable("not reached"); - } - - fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler)); - inst->base_mrf = message.reg; - inst->mlen = msg_end.reg - message.reg; - inst->header_size = header_size; - inst->regs_written = 4 * reg_width; - - if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) { - fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE) - " disallowed by hardware\n"); - } - - return inst; -} - -static bool -is_high_sampler(const struct brw_device_info *devinfo, fs_reg sampler) -{ - if (devinfo->gen < 8 && !devinfo->is_haswell) - return false; - - return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16; -} - -fs_inst * -fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst, - fs_reg coordinate, int coord_components, - fs_reg shadow_c, - fs_reg 
lod, fs_reg lod2, int grad_components, - fs_reg sample_index, fs_reg mcs, fs_reg sampler, - fs_reg offset_value) -{ - int reg_width = dispatch_width / 8; - unsigned header_size = 0; - - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, MAX_SAMPLER_MESSAGE_SIZE); - for (int i = 0; i < MAX_SAMPLER_MESSAGE_SIZE; i++) { - sources[i] = vgrf(glsl_type::float_type); - } - int length = 0; - - if (op == ir_tg4 || offset_value.file != BAD_FILE || - is_high_sampler(devinfo, sampler)) { - /* For general texture offsets (no txf workaround), we need a header to - * put them in. Note that for SIMD16 we're making space for two actual - * hardware registers here, so the emit will have to fix up for this. - * - * * ir4_tg4 needs to place its channel select in the header, - * for interaction with ARB_texture_swizzle - * - * The sampler index is only 4-bits, so for larger sampler numbers we - * need to offset the Sampler State Pointer in the header. - */ - header_size = 1; - sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); - length++; - } - - if (shadow_c.file != BAD_FILE) { - bld.MOV(sources[length], shadow_c); - length++; - } - - bool has_nonconstant_offset = - offset_value.file != BAD_FILE && offset_value.file != IMM; - bool coordinate_done = false; - - /* The sampler can only meaningfully compute LOD for fragment shader - * messages. For all other stages, we change the opcode to ir_txl and - * hardcode the LOD to 0. - */ - if (stage != MESA_SHADER_FRAGMENT && op == ir_tex) { - op = ir_txl; - lod = fs_reg(0.0f); - } - - /* Set up the LOD info */ - switch (op) { - case ir_tex: - case ir_lod: - break; - case ir_txb: - bld.MOV(sources[length], lod); - length++; - break; - case ir_txl: - bld.MOV(sources[length], lod); - length++; - break; - case ir_txd: { - no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode."); - - /* Load dPdx and the coordinate together: - * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z - */ - for (int i = 0; i < coord_components; i++) { - bld.MOV(sources[length], coordinate); - coordinate = offset(coordinate, 1); - length++; - - /* For cube map array, the coordinate is (u,v,r,ai) but there are - * only derivatives for (u, v, r). - */ - if (i < grad_components) { - bld.MOV(sources[length], lod); - lod = offset(lod, 1); - length++; - - bld.MOV(sources[length], lod2); - lod2 = offset(lod2, 1); - length++; - } - } - - coordinate_done = true; - break; - } - case ir_txs: - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod); - length++; - break; - case ir_query_levels: - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u)); - length++; - break; - case ir_txf: - /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. - * On Gen9 they are u, v, lod, r - */ - - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate); - coordinate = offset(coordinate, 1); - length++; - - if (devinfo->gen >= 9) { - if (coord_components >= 2) { - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate); - coordinate = offset(coordinate, 1); - } - length++; - } - - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod); - length++; - - for (int i = devinfo->gen >= 9 ? 
2 : 1; i < coord_components; i++) { - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate); - coordinate = offset(coordinate, 1); - length++; - } - - coordinate_done = true; - break; - case ir_txf_ms: - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index); - length++; - - /* data from the multisample control surface */ - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs); - length++; - - /* there is no offsetting for this message; just copy in the integer - * texture coordinates - */ - for (int i = 0; i < coord_components; i++) { - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate); - coordinate = offset(coordinate, 1); - length++; - } - - coordinate_done = true; - break; - case ir_tg4: - if (has_nonconstant_offset) { - if (shadow_c.file != BAD_FILE) - no16("Gen7 does not support gather4_po_c in SIMD16 mode."); - - /* More crazy intermixing */ - for (int i = 0; i < 2; i++) { /* u, v */ - bld.MOV(sources[length], coordinate); - coordinate = offset(coordinate, 1); - length++; - } - - for (int i = 0; i < 2; i++) { /* offu, offv */ - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value); - offset_value = offset(offset_value, 1); - length++; - } - - if (coord_components == 3) { /* r if present */ - bld.MOV(sources[length], coordinate); - coordinate = offset(coordinate, 1); - length++; - } - - coordinate_done = true; - } - break; - } - - /* Set up the coordinate (except for cases where it was done above) */ - if (!coordinate_done) { - for (int i = 0; i < coord_components; i++) { - bld.MOV(sources[length], coordinate); - coordinate = offset(coordinate, 1); - length++; - } - } - - int mlen; - if (reg_width == 2) - mlen = length * reg_width - header_size; - else - mlen = length * reg_width; - - fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen), - BRW_REGISTER_TYPE_F, dispatch_width); - bld.LOAD_PAYLOAD(src_payload, sources, length, header_size); - - /* Generate the SEND */ - enum opcode opcode; - switch (op) { - case ir_tex: opcode = SHADER_OPCODE_TEX; break; - case ir_txb: opcode = FS_OPCODE_TXB; break; - case ir_txl: opcode = SHADER_OPCODE_TXL; break; - case ir_txd: opcode = SHADER_OPCODE_TXD; break; - case ir_txf: opcode = SHADER_OPCODE_TXF; break; - case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break; - case ir_txs: opcode = SHADER_OPCODE_TXS; break; - case ir_query_levels: opcode = SHADER_OPCODE_TXS; break; - case ir_lod: opcode = SHADER_OPCODE_LOD; break; - case ir_tg4: - if (has_nonconstant_offset) - opcode = SHADER_OPCODE_TG4_OFFSET; - else - opcode = SHADER_OPCODE_TG4; - break; - default: - unreachable("not reached"); - } - fs_inst *inst = bld.emit(opcode, dst, src_payload, sampler); - inst->base_mrf = -1; - inst->mlen = mlen; - inst->header_size = header_size; - inst->regs_written = 4 * reg_width; - - if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) { - fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE) - " disallowed by hardware\n"); - } - - return inst; -} - fs_reg fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components, bool is_rect, uint32_t sampler, int texunit) @@ -746,8 +140,8 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components, coordinate = dst; bld.MUL(dst, src, scale_x); - dst = offset(dst, 1); - src = offset(src, 1); + dst = offset(dst, bld, 1); + src = offset(src, bld, 1); bld.MUL(dst, src, scale_y); } else if (is_rect) { /* On gen6+, the sampler handles the rectangle coordinates @@ -760,7 +154,7 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components, 
for (int i = 0; i < 2; i++) { if (key_tex->gl_clamp_mask[i] & (1 << sampler)) { fs_reg chan = coordinate; - chan = offset(chan, i); + chan = offset(chan, bld, i); set_condmod(BRW_CONDITIONAL_GE, bld.emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f))); @@ -785,7 +179,7 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components, for (int i = 0; i < MIN2(coord_components, 3); i++) { if (key_tex->gl_clamp_mask[i] & (1 << sampler)) { fs_reg chan = coordinate; - chan = offset(chan, i); + chan = offset(chan, bld, i); set_saturate(true, bld.MOV(chan, chan)); } } @@ -795,31 +189,21 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components, /* Sample from the MCS surface attached to this multisample texture. */ fs_reg -fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler) +fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components, + const fs_reg &sampler) { - int reg_width = dispatch_width / 8; - fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width), - BRW_REGISTER_TYPE_F, dispatch_width); - fs_reg dest = vgrf(glsl_type::uvec4_type); - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components); - - /* parameters are: u, v, r; missing parameters are treated as zero */ - for (int i = 0; i < components; i++) { - sources[i] = vgrf(glsl_type::float_type); - bld.MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate); - coordinate = offset(coordinate, 1); - } - - bld.LOAD_PAYLOAD(payload, sources, components, 0); + const fs_reg dest = vgrf(glsl_type::uvec4_type); + const fs_reg srcs[] = { + coordinate, fs_reg(), fs_reg(), fs_reg(), fs_reg(), fs_reg(), + sampler, fs_reg(), fs_reg(components), fs_reg(0) + }; + fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs, + ARRAY_SIZE(srcs)); - fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler); - inst->base_mrf = -1; - inst->mlen = components * reg_width; - inst->header_size = 0; - inst->regs_written = 4 * reg_width; /* we only care about one reg of - * response, but the sampler always - * writes 4/8 - */ + /* We only care about one reg of response, but the sampler always writes + * 4/8. + */ + inst->regs_written = 4 * dispatch_width / 8; return dest; } @@ -853,12 +237,20 @@ fs_visitor::emit_texture(ir_texture_opcode op, for (int i=0; i<4; i++) { bld.MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)); - res = offset(res, 1); + res = offset(res, bld, 1); } return; } } + if (op == ir_query_levels) { + /* textureQueryLevels() is implemented in terms of TXS so we need to + * pass a valid LOD argument. + */ + assert(lod.file == BAD_FILE); + lod = fs_reg(0u); + } + if (coordinate.file != BAD_FILE) { /* FINISHME: Texture coordinate rescaling doesn't work with non-constant * samplers. This should only be a problem with GL_CLAMP on Gen7. @@ -871,26 +263,50 @@ fs_visitor::emit_texture(ir_texture_opcode op, * samples, so don't worry about them. 
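A scalar-view sketch of what the GL_CLAMP lowering in rescale_texcoord() above does per coordinate channel, assuming the usual semantics of a saturating MOV and of SEL with the GE conditional; illustrative only, not part of the patch.

#include <math.h>

/* Non-rectangle textures: the saturating MOV clamps to [0, 1]. */
static float
clamp_normalized_coord(float coord)
{
   return fminf(fmaxf(coord, 0.0f), 1.0f);
}

/* Rectangle textures: SEL.ge against 0.0 clamps only the negative
 * range to zero; the positive range is left untouched by this step. */
static float
clamp_rect_coord(float coord)
{
   return fmaxf(coord, 0.0f);
}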
*/ fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1)); + const fs_reg srcs[] = { + coordinate, shadow_c, lod, lod2, + sample_index, mcs, sampler_reg, offset_value, + fs_reg(coord_components), fs_reg(grad_components) + }; + enum opcode opcode; - if (devinfo->gen >= 7) { - inst = emit_texture_gen7(op, dst, coordinate, coord_components, - shadow_c, lod, lod2, grad_components, - sample_index, mcs, sampler_reg, - offset_value); - } else if (devinfo->gen >= 5) { - inst = emit_texture_gen5(op, dst, coordinate, coord_components, - shadow_c, lod, lod2, grad_components, - sample_index, sampler, - offset_value.file != BAD_FILE); - } else if (dispatch_width == 16) { - inst = emit_texture_gen4_simd16(op, dst, coordinate, coord_components, - shadow_c, lod, sampler); - } else { - inst = emit_texture_gen4(op, dst, coordinate, coord_components, - shadow_c, lod, lod2, grad_components, - sampler); + switch (op) { + case ir_tex: + opcode = SHADER_OPCODE_TEX_LOGICAL; + break; + case ir_txb: + opcode = FS_OPCODE_TXB_LOGICAL; + break; + case ir_txl: + opcode = SHADER_OPCODE_TXL_LOGICAL; + break; + case ir_txd: + opcode = SHADER_OPCODE_TXD_LOGICAL; + break; + case ir_txf: + opcode = SHADER_OPCODE_TXF_LOGICAL; + break; + case ir_txf_ms: + opcode = SHADER_OPCODE_TXF_CMS_LOGICAL; + break; + case ir_txs: + case ir_query_levels: + opcode = SHADER_OPCODE_TXS_LOGICAL; + break; + case ir_lod: + opcode = SHADER_OPCODE_LOD_LOGICAL; + break; + case ir_tg4: + opcode = (offset_value.file != BAD_FILE && offset_value.file != IMM ? + SHADER_OPCODE_TG4_OFFSET_LOGICAL : SHADER_OPCODE_TG4_LOGICAL); + break; + default: + unreachable("Invalid texture opcode."); } + inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs)); + inst->regs_written = 4 * dispatch_width / 8; + if (shadow_c.file != BAD_FILE) inst->shadow_compare = true; @@ -907,17 +323,17 @@ fs_visitor::emit_texture(ir_texture_opcode op, /* fixup #layers for cube map arrays */ if (op == ir_txs && is_cube_array) { - fs_reg depth = offset(dst, 2); + fs_reg depth = offset(dst, bld, 2); fs_reg fixed_depth = vgrf(glsl_type::int_type); bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6)); fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written); - int components = inst->regs_written / (dst.width / 8); + int components = inst->regs_written / (inst->exec_size / 8); for (int i = 0; i < components; i++) { if (i == 2) { fixed_payload[i] = fixed_depth; } else { - fixed_payload[i] = offset(dst, i); + fixed_payload[i] = offset(dst, bld, i); } } bld.LOAD_PAYLOAD(dst, fixed_payload, components, 0); @@ -952,7 +368,7 @@ fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst) bld.ASR(dst, dst, fs_reg(32 - width)); } - dst = offset(dst, 1); + dst = offset(dst, bld, 1); } } @@ -989,7 +405,7 @@ fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components, { if (op == ir_query_levels) { /* # levels is in .w */ - this->result = offset(orig_val, 3); + this->result = offset(orig_val, bld, 3); return; } @@ -1010,15 +426,15 @@ fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components, for (int i = 0; i < 4; i++) { int swiz = GET_SWZ(key_tex->swizzles[sampler], i); fs_reg l = swizzled_result; - l = offset(l, i); + l = offset(l, bld, i); if (swiz == SWIZZLE_ZERO) { bld.MOV(l, fs_reg(0.0f)); } else if (swiz == SWIZZLE_ONE) { bld.MOV(l, fs_reg(1.0f)); } else { - bld.MOV(l, offset(orig_val, - GET_SWZ(key_tex->swizzles[sampler], i))); + bld.MOV(l, offset(orig_val, bld, + GET_SWZ(key_tex->swizzles[sampler], i))); } } this->result = 
swizzled_result; @@ -1114,118 +530,6 @@ fs_visitor::try_replace_with_sel() return false; } -void -fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index, - fs_reg dst, fs_reg offset, fs_reg src0, - fs_reg src1) -{ - int reg_width = dispatch_width / 8; - int length = 0; - - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 4); - - sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); - /* Initialize the sample mask in the message header. */ - bld.exec_all().MOV(sources[0], fs_reg(0u)); - - if (stage == MESA_SHADER_FRAGMENT) { - if (((brw_wm_prog_data*)this->prog_data)->uses_kill) { - bld.exec_all() - .MOV(component(sources[0], 7), brw_flag_reg(0, 1)); - } else { - bld.exec_all() - .MOV(component(sources[0], 7), - retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)); - } - } else { - /* The execution mask is part of the side-band information sent together with - * the message payload to the data port. It's implicitly ANDed with the sample - * mask sent in the header to compute the actual set of channels that execute - * the atomic operation. - */ - assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE); - bld.exec_all() - .MOV(component(sources[0], 7), fs_reg(0xffffu)); - } - length++; - - /* Set the atomic operation offset. */ - sources[1] = vgrf(glsl_type::uint_type); - bld.MOV(sources[1], offset); - length++; - - /* Set the atomic operation arguments. */ - if (src0.file != BAD_FILE) { - sources[length] = vgrf(glsl_type::uint_type); - bld.MOV(sources[length], src0); - length++; - } - - if (src1.file != BAD_FILE) { - sources[length] = vgrf(glsl_type::uint_type); - bld.MOV(sources[length], src1); - length++; - } - - int mlen = 1 + (length - 1) * reg_width; - fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen), - BRW_REGISTER_TYPE_UD, dispatch_width); - bld.LOAD_PAYLOAD(src_payload, sources, length, 1); - - /* Emit the instruction. */ - fs_inst *inst = bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload, - fs_reg(surf_index), fs_reg(atomic_op)); - inst->mlen = mlen; -} - -void -fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst, - fs_reg offset) -{ - int reg_width = dispatch_width / 8; - - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2); - - sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); - /* Initialize the sample mask in the message header. */ - bld.exec_all() - .MOV(sources[0], fs_reg(0u)); - - if (stage == MESA_SHADER_FRAGMENT) { - if (((brw_wm_prog_data*)this->prog_data)->uses_kill) { - bld.exec_all() - .MOV(component(sources[0], 7), brw_flag_reg(0, 1)); - } else { - bld.exec_all() - .MOV(component(sources[0], 7), - retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)); - } - } else { - /* The execution mask is part of the side-band information sent together with - * the message payload to the data port. It's implicitly ANDed with the sample - * mask sent in the header to compute the actual set of channels that execute - * the atomic operation. - */ - assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE); - bld.exec_all() - .MOV(component(sources[0], 7), fs_reg(0xffffu)); - } - - /* Set the surface read offset. */ - sources[1] = vgrf(glsl_type::uint_type); - bld.MOV(sources[1], offset); - - int mlen = 1 + reg_width; - fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen), - BRW_REGISTER_TYPE_UD, dispatch_width); - fs_inst *inst = bld.LOAD_PAYLOAD(src_payload, sources, 2, 1); - - /* Emit the instruction. 
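The removed untyped-atomic/surface-read helpers above both seed DWord 7 of the message header with a sample mask; a condensed sketch of that selection follows. Parameter names are illustrative, and the two value arguments stand in for the flag register f0.1 and payload register g1.7.

#include <stdbool.h>
#include <stdint.h>

static uint32_t
untyped_message_sample_mask(bool is_fragment_shader, bool uses_kill,
                            uint32_t flag_f0_1, uint32_t payload_g1_7)
{
   if (is_fragment_shader) {
      /* With discard in play, f0.1 tracks the channels still alive;
       * otherwise use the mask delivered in g1.7 of the FS payload. */
      return uses_kill ? flag_f0_1 : payload_g1_7;
   }

   /* VS/CS: the real channel mask arrives as execution-mask side-band
    * information, so the header simply carries all ones. */
   return 0xffff;
}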
*/ - inst = bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload, - fs_reg(surf_index), fs_reg(1)); - inst->mlen = mlen; -} - /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */ void fs_visitor::emit_dummy_fs() @@ -1235,8 +539,8 @@ fs_visitor::emit_dummy_fs() /* Everyone's favorite color. */ const float color[4] = { 1.0, 0.0, 1.0, 0.0 }; for (int i = 0; i < 4; i++) { - bld.MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F, - dispatch_width), fs_reg(color[i])); + bld.MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F), + fs_reg(color[i])); } fs_inst *write; @@ -1315,14 +619,14 @@ fs_visitor::emit_interpolation_setup_gen4() if (devinfo->has_pln && dispatch_width == 16) { for (unsigned i = 0; i < 2; i++) { - abld.half(i).ADD(half(offset(delta_xy, i), 0), + abld.half(i).ADD(half(offset(delta_xy, abld, i), 0), half(this->pixel_x, i), xstart); - abld.half(i).ADD(half(offset(delta_xy, i), 1), + abld.half(i).ADD(half(offset(delta_xy, abld, i), 1), half(this->pixel_y, i), ystart); } } else { - abld.ADD(offset(delta_xy, 0), this->pixel_x, xstart); - abld.ADD(offset(delta_xy, 1), this->pixel_y, ystart); + abld.ADD(offset(delta_xy, abld, 0), this->pixel_x, xstart); + abld.ADD(offset(delta_xy, abld, 1), this->pixel_y, ystart); } abld = bld.annotate("compute pos.w and 1/pos.w"); @@ -1356,9 +660,10 @@ fs_visitor::emit_interpolation_setup_gen6() * compute our pixel centers. */ fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8), - BRW_REGISTER_TYPE_UW, dispatch_width * 2); - abld.exec_all() - .ADD(int_pixel_xy, + BRW_REGISTER_TYPE_UW); + + const fs_builder dbld = abld.exec_all().group(dispatch_width * 2, 0); + dbld.ADD(int_pixel_xy, fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)), fs_reg(brw_imm_v(0x11001010))); @@ -1407,33 +712,6 @@ fs_visitor::emit_interpolation_setup_gen6() } } -void -fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components, - unsigned exec_size, bool use_2nd_half) -{ - brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; - fs_inst *inst; - - if (key->clamp_fragment_color) { - fs_reg tmp = vgrf(glsl_type::vec4_type); - assert(color.type == BRW_REGISTER_TYPE_F); - for (unsigned i = 0; i < components; i++) { - inst = bld.MOV(offset(tmp, i), offset(color, i)); - inst->saturate = true; - } - color = tmp; - } - - if (exec_size < dispatch_width) { - unsigned half_idx = use_2nd_half ? 1 : 0; - for (unsigned i = 0; i < components; i++) - dst[i] = half(offset(color, i), half_idx); - } else { - for (unsigned i = 0; i < components; i++) - dst[i] = offset(color, i); - } -} - static enum brw_conditional_mod cond_for_alpha_func(GLenum func) { @@ -1478,7 +756,7 @@ fs_visitor::emit_alpha_test() BRW_CONDITIONAL_NEQ); } else { /* RT0 alpha */ - fs_reg color = offset(outputs[0], 3); + fs_reg color = offset(outputs[0], bld, 3); /* f0.1 &= func(color, ref) */ cmp = abld.CMP(bld.null_reg_f(), color, fs_reg(key->alpha_test_ref), @@ -1491,152 +769,36 @@ fs_visitor::emit_alpha_test() fs_inst * fs_visitor::emit_single_fb_write(const fs_builder &bld, fs_reg color0, fs_reg color1, - fs_reg src0_alpha, unsigned components, - unsigned exec_size, bool use_2nd_half) + fs_reg src0_alpha, unsigned components) { assert(stage == MESA_SHADER_FRAGMENT); brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; - brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; - int header_size = 2, payload_header_size; - - /* We can potentially have a message length of up to 15, so we have to set - * base_mrf to either 0 or 1 in order to fit in m0..m15. 
- */ - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 15); - int length = 0; - - /* From the Sandy Bridge PRM, volume 4, page 198: - * - * "Dispatched Pixel Enables. One bit per pixel indicating - * which pixels were originally enabled when the thread was - * dispatched. This field is only required for the end-of- - * thread message and on all dual-source messages." - */ - if (devinfo->gen >= 6 && - (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) && - color1.file == BAD_FILE && - key->nr_color_regions == 1) { - header_size = 0; - } - - if (header_size != 0) { - assert(header_size == 2); - /* Allocate 2 registers for a header */ - length += 2; - } - if (payload.aa_dest_stencil_reg) { - sources[length] = fs_reg(GRF, alloc.allocate(1)); - bld.exec_all().annotate("FB write stencil/AA alpha") - .MOV(sources[length], - fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))); - length++; - } - - prog_data->uses_omask = - prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK); - if (prog_data->uses_omask) { - assert(this->sample_mask.file != BAD_FILE); - /* Hand over gl_SampleMask. Only lower 16 bits are relevant. Since - * it's unsinged single words, one vgrf is always 16-wide. - */ - sources[length] = fs_reg(GRF, alloc.allocate(1), - BRW_REGISTER_TYPE_UW, 16); - bld.exec_all().annotate("FB write oMask") - .emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask); - length++; - } - - payload_header_size = length; - - if (color0.file == BAD_FILE) { - /* Even if there's no color buffers enabled, we still need to send - * alpha out the pipeline to our null renderbuffer to support - * alpha-testing, alpha-to-coverage, and so on. - */ - if (this->outputs[0].file != BAD_FILE) - setup_color_payload(&sources[length + 3], offset(this->outputs[0], 3), - 1, exec_size, false); - length += 4; - } else if (color1.file == BAD_FILE) { - if (src0_alpha.file != BAD_FILE) { - setup_color_payload(&sources[length], src0_alpha, 1, exec_size, false); - length++; - } - - setup_color_payload(&sources[length], color0, components, - exec_size, use_2nd_half); - length += 4; - } else { - setup_color_payload(&sources[length], color0, components, - exec_size, use_2nd_half); - length += 4; - setup_color_payload(&sources[length], color1, components, - exec_size, use_2nd_half); - length += 4; - } + /* Hand over gl_FragDepth or the payload depth. */ + const fs_reg dst_depth = (payload.dest_depth_reg ? + fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0)) : + fs_reg()); + fs_reg src_depth; if (source_depth_to_render_target) { - if (devinfo->gen == 6) { - /* For outputting oDepth on gen6, SIMD8 writes have to be - * used. This would require SIMD8 moves of each half to - * message regs, kind of like pre-gen5 SIMD16 FB writes. - * Just bail on doing so for now. - */ - no16("Missing support for simd16 depth writes on gen6\n"); - } - - if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { - /* Hand over gl_FragDepth. */ - assert(this->frag_depth.file != BAD_FILE); - if (exec_size < dispatch_width) { - sources[length] = half(this->frag_depth, use_2nd_half); - } else { - sources[length] = this->frag_depth; - } - } else { - /* Pass through the payload depth. 
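A compact restatement of the header-size rule from the Sandy Bridge PRM quote above, as a sketch with an illustrative helper name: the two-register FB write header is only needed when the dispatched-pixel-enables matter, i.e. for dual-source messages, multiple render targets, or a pre-Haswell/pre-gen8 shader that uses discard.

#include <stdbool.h>

static unsigned
fb_write_header_size(int gen, bool is_haswell, bool uses_kill,
                     bool dual_source_blend, int nr_color_regions)
{
   if (gen >= 6 &&
       (is_haswell || gen >= 8 || !uses_kill) &&
       !dual_source_blend &&
       nr_color_regions == 1)
      return 0;

   return 2;
}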
*/ - sources[length] = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)); - } - length++; + if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) + src_depth = frag_depth; + else + src_depth = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)); } - if (payload.dest_depth_reg) - sources[length++] = fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0)); - - const fs_builder ubld = bld.group(exec_size, use_2nd_half); - fs_inst *load; - fs_inst *write; - if (devinfo->gen >= 7) { - /* Send from the GRF */ - fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F, exec_size); - load = ubld.LOAD_PAYLOAD(payload, sources, length, payload_header_size); - payload.reg = alloc.allocate(load->regs_written); - load->dst = payload; - write = ubld.emit(FS_OPCODE_FB_WRITE, reg_undef, payload); - write->base_mrf = -1; - } else { - /* Send from the MRF */ - load = ubld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F, exec_size), - sources, length, payload_header_size); - - /* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD - * will do this for us if we just give it a COMPR4 destination. - */ - if (devinfo->gen < 6 && exec_size == 16) - load->dst.reg |= BRW_MRF_COMPR4; - - write = ubld.emit(FS_OPCODE_FB_WRITE); - write->exec_size = exec_size; - write->base_mrf = 1; - } + const fs_reg sources[] = { + color0, color1, src0_alpha, src_depth, dst_depth, sample_mask, + fs_reg(components) + }; + fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(), + sources, ARRAY_SIZE(sources)); - write->mlen = load->regs_written; - write->header_size = header_size; if (prog_data->uses_kill) { write->predicate = BRW_PREDICATE_NORMAL; write->flag_subreg = 1; } + return write; } @@ -1648,37 +810,24 @@ fs_visitor::emit_fb_writes() brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; fs_inst *inst = NULL; + + if (source_depth_to_render_target && devinfo->gen == 6) { + /* For outputting oDepth on gen6, SIMD8 writes have to be used. This + * would require SIMD8 moves of each half to message regs, e.g. by using + * the SIMD lowering pass. Unfortunately this is more difficult than it + * sounds because the SIMD8 single-source message lacks channel selects + * for the second and third subspans. + */ + no16("Missing support for simd16 depth writes on gen6\n"); + } + if (do_dual_src) { const fs_builder abld = bld.annotate("FB dual-source write"); inst = emit_single_fb_write(abld, this->outputs[0], - this->dual_src_output, reg_undef, 4, 8); + this->dual_src_output, reg_undef, 4); inst->target = 0; - /* SIMD16 dual source blending requires to send two SIMD8 dual source - * messages, where each message contains color data for 8 pixels. Color - * data for the first group of pixels is stored in the "lower" half of - * the color registers, so in SIMD16, the previous message did: - * m + 0: r0 - * m + 1: g0 - * m + 2: b0 - * m + 3: a0 - * - * Here goes the second message, which packs color data for the - * remaining 8 pixels. 
Color data for these pixels is stored in the - * "upper" half of the color registers, so we need to do: - * m + 0: r1 - * m + 1: g1 - * m + 2: b1 - * m + 3: a1 - */ - if (dispatch_width == 16) { - inst = emit_single_fb_write(abld, this->outputs[0], - this->dual_src_output, reg_undef, 4, 8, - true); - inst->target = 0; - } - prog_data->dual_src_blend = true; } else { for (int target = 0; target < key->nr_color_regions; target++) { @@ -1691,12 +840,11 @@ fs_visitor::emit_fb_writes() fs_reg src0_alpha; if (devinfo->gen >= 6 && key->replicate_alpha && target != 0) - src0_alpha = offset(outputs[0], 3); + src0_alpha = offset(outputs[0], bld, 3); inst = emit_single_fb_write(abld, this->outputs[target], reg_undef, src0_alpha, - this->output_components[target], - dispatch_width); + this->output_components[target]); inst->target = target; } } @@ -1706,8 +854,15 @@ fs_visitor::emit_fb_writes() * alpha out the pipeline to our null renderbuffer to support * alpha-testing, alpha-to-coverage, and so on. */ - inst = emit_single_fb_write(bld, reg_undef, reg_undef, reg_undef, 0, - dispatch_width); + /* FINISHME: Factor out this frequently recurring pattern into a + * helper function. + */ + const fs_reg srcs[] = { reg_undef, reg_undef, + reg_undef, offset(this->outputs[0], bld, 3) }; + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); + bld.LOAD_PAYLOAD(tmp, srcs, 4, 0); + + inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4); inst->target = 0; } @@ -1730,6 +885,12 @@ fs_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes) } } +/** + * Lower legacy fixed-function and gl_ClipVertex clipping to clip distances. + * + * This does nothing if the shader uses gl_ClipDistance or user clipping is + * disabled altogether. + */ void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes) { struct brw_vue_prog_data *vue_prog_data = @@ -1737,6 +898,10 @@ void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes) const struct brw_vue_prog_key *key = (const struct brw_vue_prog_key *) this->key; + /* Bail unless some sort of legacy clipping is enabled */ + if (!key->userclip_active || prog->UsesClipDistanceOut) + return; + /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables): * * "If a linked set of shaders forming the vertex stage contains no @@ -1774,13 +939,13 @@ void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes) abld.MUL(output, outputs[clip_vertex], u); for (int j = 1; j < 4; j++) { u.reg = userplane[i].reg + j; - abld.MAD(output, output, offset(outputs[clip_vertex], j), u); + abld.MAD(output, output, offset(outputs[clip_vertex], bld, j), u); } } } void -fs_visitor::emit_urb_writes(gl_clip_plane *clip_planes) +fs_visitor::emit_urb_writes() { int slot, urb_offset, length; struct brw_vs_prog_data *vs_prog_data = @@ -1793,21 +958,24 @@ fs_visitor::emit_urb_writes(gl_clip_plane *clip_planes) bool flush; fs_reg sources[8]; - /* Lower legacy ff and ClipVertex clipping to clip distances */ - if (key->base.userclip_active && !prog->UsesClipDistanceOut) - compute_clip_distance(clip_planes); - /* If we don't have any valid slots to write, just do a minimal urb write - * send to terminate the shader. */ + * send to terminate the shader. 
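For each enabled user clip plane, the MUL plus three MADs in compute_clip_distance() above amount to a plain dot product of the clip vertex with the plane equation; a scalar sketch, illustrative only:

static float
clip_distance(const float clip_vertex[4], const float plane[4])
{
   float d = clip_vertex[0] * plane[0];      /* MUL           */

   for (int j = 1; j < 4; j++)
      d += clip_vertex[j] * plane[j];        /* MAD, j = 1..3 */

   return d;
}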
This includes 1 slot of undefined data, + * because it's invalid to write 0 data: + * + * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions - + * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read > + * Write Data Payload: + * + * "The write data payload can be between 1 and 8 message phases long." + */ if (vue_map->slots_valid == 0) { - - fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + fs_reg payload = fs_reg(GRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD); bld.exec_all().MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD))); fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload); inst->eot = true; - inst->mlen = 1; + inst->mlen = 2; inst->offset = 1; return; } @@ -1888,13 +1056,13 @@ fs_visitor::emit_urb_writes(gl_clip_plane *clip_planes) */ for (int i = 0; i < 4; i++) { reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type); - src = offset(this->outputs[varying], i); + src = offset(this->outputs[varying], bld, i); set_saturate(true, bld.MOV(reg, src)); sources[length++] = reg; } } else { for (int i = 0; i < 4; i++) - sources[length++] = offset(this->outputs[varying], i); + sources[length++] = offset(this->outputs[varying], bld, i); } break; } @@ -1911,7 +1079,7 @@ fs_visitor::emit_urb_writes(gl_clip_plane *clip_planes) if (flush) { fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1); fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1), - BRW_REGISTER_TYPE_F, dispatch_width); + BRW_REGISTER_TYPE_F); payload_sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); @@ -1944,7 +1112,7 @@ fs_visitor::emit_cs_terminate() */ struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD); fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); - bld.exec_all().MOV(payload, g0); + bld.group(8, 0).exec_all().MOV(payload, g0); /* Send a message to the thread spawner to terminate the thread. */ fs_inst *inst = bld.exec_all() @@ -2012,7 +1180,7 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data, this->no16_msg = NULL; this->nir_locals = NULL; - this->nir_globals = NULL; + this->nir_ssa_values = NULL; memset(&this->payload, 0, sizeof(this->payload)); memset(this->outputs, 0, sizeof(this->outputs)); diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c index 45c132b4a9e..4ad65215756 100644 --- a/src/mesa/drivers/dri/i965/brw_gs.c +++ b/src/mesa/drivers/dri/i965/brw_gs.c @@ -68,12 +68,16 @@ brw_compile_gs_prog(struct brw_context *brw, /* We also upload clip plane data as uniforms */ param_count += MAX_CLIP_PLANES * 4; + param_count += gs->NumImages * BRW_IMAGE_PARAM_SIZE; c.prog_data.base.base.param = rzalloc_array(NULL, const gl_constant_value *, param_count); c.prog_data.base.base.pull_param = rzalloc_array(NULL, const gl_constant_value *, param_count); + c.prog_data.base.base.image_param = + rzalloc_array(NULL, struct brw_image_param, gs->NumImages); c.prog_data.base.base.nr_params = param_count; + c.prog_data.base.base.nr_image_params = gs->NumImages; if (brw->gen >= 7) { if (gp->program.OutputType == GL_POINTS) { @@ -270,16 +274,6 @@ brw_compile_gs_prog(struct brw_context *brw, return false; } - /* Scratch space is used for register spilling */ - if (c.base.last_scratch) { - perf_debug("Geometry shader triggered register spilling. 
" - "Try reducing the number of live vec4 values to " - "improve performance.\n"); - - c.prog_data.base.base.total_scratch - = brw_get_scratch_size(c.base.last_scratch*REG_SIZE); - } - output->mem_ctx = mem_ctx; output->program = program; output->program_size = program_size; diff --git a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c index 0b8bfc3d9bd..0bb307432d0 100644 --- a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c @@ -119,3 +119,28 @@ const struct brw_tracked_state brw_gs_abo_surfaces = { }, .emit = brw_upload_gs_abo_surfaces, }; + +static void +brw_upload_gs_image_surfaces(struct brw_context *brw) +{ + struct gl_context *ctx = &brw->ctx; + /* BRW_NEW_GEOMETRY_PROGRAM */ + struct gl_shader_program *prog = + ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY]; + + if (prog) { + /* BRW_NEW_GS_PROG_DATA, BRW_NEW_IMAGE_UNITS */ + brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_GEOMETRY], + &brw->gs.base, &brw->gs.prog_data->base.base); + } +} + +const struct brw_tracked_state brw_gs_image_surfaces = { + .dirty = { + .brw = BRW_NEW_BATCH | + BRW_NEW_GEOMETRY_PROGRAM | + BRW_NEW_GS_PROG_DATA | + BRW_NEW_IMAGE_UNITS, + }, + .emit = brw_upload_gs_image_surfaces, +}; diff --git a/src/mesa/drivers/dri/i965/brw_inst.h b/src/mesa/drivers/dri/i965/brw_inst.h index 7a8c210118c..46eff1dd381 100644 --- a/src/mesa/drivers/dri/i965/brw_inst.h +++ b/src/mesa/drivers/dri/i965/brw_inst.h @@ -683,9 +683,9 @@ brw_inst_bits(const brw_inst *inst, unsigned high, unsigned low) high %= 64; low %= 64; - const uint64_t mask = (((1ull << (high - low + 1)) - 1) << low); + const uint64_t mask = (1ull << (high - low + 1)) - 1; - return (inst->data[word] & mask) >> low; + return (inst->data[word] >> low) & mask; } /** @@ -702,12 +702,12 @@ brw_inst_set_bits(brw_inst *inst, unsigned high, unsigned low, uint64_t value) high %= 64; low %= 64; - const uint64_t mask = (((1ull << (high - low + 1)) - 1) << low); + const uint64_t mask = ((1ull << (high - low + 1)) - 1) << low; /* Make sure the supplied value actually fits in the given bitfield. */ assert((value & (mask >> low)) == value); - inst->data[word] = (inst->data[word] & ~mask) | ((value << low) & mask); + inst->data[word] = (inst->data[word] & ~mask) | (value << low); } #undef BRW_IA16_ADDR_IMM @@ -731,9 +731,9 @@ typedef struct { static inline unsigned brw_compact_inst_bits(brw_compact_inst *inst, unsigned high, unsigned low) { - const uint64_t mask = (((1ull << (high - low + 1)) - 1) << low); + const uint64_t mask = (1ull << (high - low + 1)) - 1; - return (inst->data & mask) >> low; + return (inst->data >> low) & mask; } /** @@ -745,12 +745,12 @@ static inline void brw_compact_inst_set_bits(brw_compact_inst *inst, unsigned high, unsigned low, uint64_t value) { - const uint64_t mask = (((1ull << (high - low + 1)) - 1) << low); + const uint64_t mask = ((1ull << (high - low + 1)) - 1) << low; /* Make sure the supplied value actually fits in the given bitfield. 
*/ assert((value & (mask >> low)) == value); - inst->data = (inst->data & ~mask) | ((value << low) & mask); + inst->data = (inst->data & ~mask) | (value << low); } #define F(name, high, low) \ diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h index 96dc20da3cf..97c6f8b2500 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_fs.h +++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h @@ -44,11 +44,16 @@ public: fs_reg(struct brw_reg fixed_hw_reg); fs_reg(enum register_file file, int reg); fs_reg(enum register_file file, int reg, enum brw_reg_type type); - fs_reg(enum register_file file, int reg, enum brw_reg_type type, uint8_t width); bool equals(const fs_reg &r) const; bool is_contiguous() const; + /** + * Return the size in bytes of a single logical component of the + * register assuming the given execution width. + */ + unsigned component_size(unsigned width) const; + /** Smear a channel of the reg to all channels. */ fs_reg &set_smear(unsigned subreg); @@ -60,14 +65,6 @@ public: fs_reg *reladdr; - /** - * The register width. This indicates how many hardware values are - * represented by each virtual value. Valid values are 1, 8, or 16. - * For immediate values, this is 1. Most of the rest of the time, it - * will be equal to the dispatch width. - */ - uint8_t width; - /** Register region horizontal stride */ uint8_t stride; }; @@ -129,33 +126,10 @@ horiz_offset(fs_reg reg, unsigned delta) } static inline fs_reg -offset(fs_reg reg, unsigned delta) -{ - switch (reg.file) { - case BAD_FILE: - break; - case GRF: - case MRF: - case ATTR: - return byte_offset(reg, - delta * MAX2(reg.width * reg.stride, 1) * - type_sz(reg.type)); - case UNIFORM: - reg.reg_offset += delta; - break; - default: - assert(delta == 0); - } - return reg; -} - -static inline fs_reg component(fs_reg reg, unsigned idx) { assert(reg.subreg_offset == 0); - assert(idx < reg.width); reg.subreg_offset = idx * type_sz(reg.type); - reg.width = 1; reg.stride = 0; return reg; } @@ -163,7 +137,7 @@ component(fs_reg reg, unsigned idx) static inline bool is_uniform(const fs_reg ®) { - return (reg.width == 1 || reg.stride == 0 || reg.is_null()) && + return (reg.stride == 0 || reg.is_null()) && (!reg.reladdr || is_uniform(*reg.reladdr)); } @@ -185,8 +159,6 @@ half(fs_reg reg, unsigned idx) case GRF: case MRF: - assert(reg.width == 16); - reg.width = 8; return horiz_offset(reg, 8 * idx); case ATTR: @@ -210,20 +182,13 @@ public: fs_inst(); fs_inst(enum opcode opcode, uint8_t exec_size); - fs_inst(enum opcode opcode, const fs_reg &dst); + fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst); fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, const fs_reg &src0); - fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0); fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, const fs_reg &src0, const fs_reg &src1); - fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0, - const fs_reg &src1); fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, const fs_reg &src0, const fs_reg &src1, const fs_reg &src2); - fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0, - const fs_reg &src1, const fs_reg &src2); - fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg src[], - unsigned sources); fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, const fs_reg src[], unsigned sources); fs_inst(const fs_inst &that); @@ -236,6 +201,7 @@ public: bool is_send_from_grf() const; bool is_partial_write() const; bool 
is_copy_payload(const brw::simple_allocator &grf_alloc) const; + unsigned components_read(unsigned i) const; int regs_read(int arg) const; bool can_do_source_mods(const struct brw_device_info *devinfo); bool has_side_effects() const; diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h index fceacae0e51..966a410a15d 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h @@ -113,6 +113,8 @@ public: dst_reg(register_file file, int reg); dst_reg(register_file file, int reg, const glsl_type *type, unsigned writemask); + dst_reg(register_file file, int reg, brw_reg_type type, + unsigned writemask); dst_reg(struct brw_reg reg); dst_reg(class vec4_visitor *v, const struct glsl_type *type); diff --git a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c index 49f2e3e498c..f5ecbb54989 100644 --- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c +++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c @@ -128,7 +128,7 @@ brw_bind_rep_write_shader(struct brw_context *brw, float *color) _mesa_AttachShader(clear->shader_prog, vs); _mesa_DeleteShader(vs); _mesa_BindAttribLocation(clear->shader_prog, 0, "position"); - _mesa_ObjectLabel(GL_PROGRAM, clear->shader_prog, -1, "meta clear"); + _mesa_ObjectLabel(GL_PROGRAM, clear->shader_prog, -1, "meta repclear"); _mesa_LinkProgram(clear->shader_prog); clear->color_location = @@ -200,7 +200,7 @@ brw_draw_rectlist(struct gl_context *ctx, struct rect *rect, int num_instances) brw_draw_prims(ctx, &prim, 1, NULL, GL_TRUE, start, start + count - 1, - NULL, NULL); + NULL, 0, NULL); } static void @@ -348,7 +348,7 @@ is_color_fast_clear_compatible(struct brw_context *brw, } for (int i = 0; i < 4; i++) { - if (color->f[i] != 0.0 && color->f[i] != 1.0 && + if (color->f[i] != 0.0f && color->f[i] != 1.0f && _mesa_format_has_color_component(format, i)) { return false; } @@ -366,7 +366,7 @@ compute_fast_clear_color_bits(const union gl_color_union *color) uint32_t bits = 0; for (int i = 0; i < 4; i++) { /* Testing for non-0 works for integer and float colors */ - if (color->f[i] != 0.0) + if (color->f[i] != 0.0f) bits |= 1 << (GEN7_SURFACE_CLEAR_COLOR_SHIFT + (3 - i)); } return bits; @@ -623,7 +623,7 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb, * write-flush must be issued before sending any DRAW commands on that * render target. */ - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); /* If we had to fall back to plain clear for any buffers, clear those now * by calling into meta. 
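compute_fast_clear_color_bits() above packs one bit per RGBA channel into the surface state, set when the channel clears to a non-zero value (which, after the is_color_fast_clear_compatible() check, is normally 1.0). A standalone sketch, with the shift passed in rather than using the GEN7_SURFACE_CLEAR_COLOR_SHIFT define:

#include <stdint.h>

static uint32_t
fast_clear_color_bits(const float color[4], unsigned clear_color_shift)
{
   uint32_t bits = 0;

   for (int i = 0; i < 4; i++) {
      /* Testing for non-zero works for both integer and float colors. */
      if (color[i] != 0.0f)
         bits |= 1u << (clear_color_shift + (3 - i));
   }

   return bits;
}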
@@ -677,7 +677,7 @@ brw_meta_resolve_color(struct brw_context *brw, GLuint fbo, rbo; struct rect rect; - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); _mesa_meta_begin(ctx, MESA_META_ALL); diff --git a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c index d079197a2a9..aa6df16eb04 100644 --- a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c +++ b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c @@ -239,10 +239,10 @@ setup_coord_coeff(GLuint prog, GLuint multiplier, GLuint offset, if (mirror) { _mesa_Uniform1f(multiplier, -scale); - _mesa_Uniform1f(offset, src_0 + (dst_1 - 0.5) * scale); + _mesa_Uniform1f(offset, src_0 + (dst_1 - 0.5f) * scale); } else { _mesa_Uniform1f(multiplier, scale); - _mesa_Uniform1f(offset, src_0 + (-dst_0 + 0.5) * scale); + _mesa_Uniform1f(offset, src_0 + (-dst_0 + 0.5f) * scale); } } @@ -500,11 +500,11 @@ brw_meta_fbo_stencil_blit(struct brw_context *brw, .mirror_x = mirror_x, .mirror_y = mirror_y }; adjust_mip_level(dst_mt, dst_irb->mt_level, dst_irb->mt_layer, &dims); - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); _mesa_meta_begin(ctx, MESA_META_ALL); brw_meta_stencil_blit(brw, dst_mt, dst_irb->mt_level, dst_irb->mt_layer, &dims); - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); } void @@ -524,7 +524,7 @@ brw_meta_stencil_updownsample(struct brw_context *brw, if (dst->stencil_mt) dst = dst->stencil_mt; - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); _mesa_meta_begin(ctx, MESA_META_ALL); _mesa_GenFramebuffers(1, &fbo); @@ -535,7 +535,7 @@ brw_meta_stencil_updownsample(struct brw_context *brw, GL_RENDERBUFFER, rbo); brw_meta_stencil_blit(brw, dst, 0, 0, &dims); - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); _mesa_DeleteRenderbuffers(1, &rbo); _mesa_DeleteFramebuffers(1, &fbo); diff --git a/src/mesa/drivers/dri/i965/brw_meta_updownsample.c b/src/mesa/drivers/dri/i965/brw_meta_updownsample.c index 21507b1ad2a..f39d50a69e6 100644 --- a/src/mesa/drivers/dri/i965/brw_meta_updownsample.c +++ b/src/mesa/drivers/dri/i965/brw_meta_updownsample.c @@ -116,7 +116,7 @@ brw_meta_updownsample(struct brw_context *brw, blit_bit = GL_COLOR_BUFFER_BIT; } - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); _mesa_meta_begin(ctx, MESA_META_ALL); _mesa_GenFramebuffers(2, fbos); @@ -147,5 +147,5 @@ brw_meta_updownsample(struct brw_context *brw, _mesa_meta_end(ctx); - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); } diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c index 5a4515b582d..e9d9467d330 100644 --- a/src/mesa/drivers/dri/i965/brw_misc_state.c +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c @@ -44,7 +44,8 @@ #include "main/glformats.h" /* Constant single cliprect for framebuffer object or DRI2 drawing */ -static void upload_drawing_rect(struct brw_context *brw) +static void +upload_drawing_rect(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; const struct gl_framebuffer *fb = ctx->DrawBuffer; @@ -73,7 +74,8 @@ const struct brw_tracked_state brw_drawing_rect = { * The state pointers in this packet are all relative to the general state * base address set by CMD_STATE_BASE_ADDRESS, which is 0. */ -static void upload_pipelined_state_pointers(struct brw_context *brw ) +static void +upload_pipelined_state_pointers(struct brw_context *brw) { if (brw->gen == 5) { /* Need to flush before changing clip max threads for errata. 
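The multiplier/offset uniforms programmed by setup_coord_coeff() above implement src = multiplier * dst + offset; expanding the two cases gives the mapping below. This is a sketch of the algebra only, not driver code.

#include <stdbool.h>

static float
blit_src_coord(float dst, float scale, float src_0,
               float dst_0, float dst_1, bool mirror)
{
   if (mirror)
      /* -scale * dst + (src_0 + (dst_1 - 0.5f) * scale) */
      return src_0 + (dst_1 - 0.5f - dst) * scale;
   else
      /*  scale * dst + (src_0 + (-dst_0 + 0.5f) * scale) */
      return src_0 + (dst - dst_0 + 0.5f) * scale;
}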
*/ @@ -104,7 +106,8 @@ static void upload_pipelined_state_pointers(struct brw_context *brw ) brw->ctx.NewDriverState |= BRW_NEW_PSP; } -static void upload_psp_urb_cbs(struct brw_context *brw ) +static void +upload_psp_urb_cbs(struct brw_context *brw) { upload_pipelined_state_pointers(brw); brw_upload_urb_fence(brw); @@ -580,7 +583,7 @@ brw_emit_depth_stencil_hiz(struct brw_context *brw, * non-pipelined state that will need the PIPE_CONTROL workaround. */ if (brw->gen == 6) { - intel_emit_depth_stall_flushes(brw); + brw_emit_depth_stall_flushes(brw); } unsigned int len; @@ -700,13 +703,11 @@ const struct brw_tracked_state brw_depthbuffer = { .emit = brw_emit_depthbuffer, }; - - -/*********************************************************************** +/** * Polygon stipple packet */ - -static void upload_polygon_stipple(struct brw_context *brw) +static void +upload_polygon_stipple(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; GLuint i; @@ -728,8 +729,7 @@ static void upload_polygon_stipple(struct brw_context *brw) if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) { for (i = 0; i < 32; i++) OUT_BATCH(ctx->PolygonStipple[31 - i]); /* invert */ - } - else { + } else { for (i = 0; i < 32; i++) OUT_BATCH(ctx->PolygonStipple[i]); } @@ -745,12 +745,11 @@ const struct brw_tracked_state brw_polygon_stipple = { .emit = upload_polygon_stipple }; - -/*********************************************************************** +/** * Polygon stipple offset packet */ - -static void upload_polygon_stipple_offset(struct brw_context *brw) +static void +upload_polygon_stipple_offset(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; @@ -785,10 +784,11 @@ const struct brw_tracked_state brw_polygon_stipple_offset = { .emit = upload_polygon_stipple_offset }; -/********************************************************************** +/** * AA Line parameters */ -static void upload_aa_line_parameters(struct brw_context *brw) +static void +upload_aa_line_parameters(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; @@ -815,11 +815,11 @@ const struct brw_tracked_state brw_aa_line_parameters = { .emit = upload_aa_line_parameters }; -/*********************************************************************** +/** * Line stipple packet */ - -static void upload_line_stipple(struct brw_context *brw) +static void +upload_line_stipple(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; GLfloat tmp; @@ -834,13 +834,12 @@ static void upload_line_stipple(struct brw_context *brw) if (brw->gen >= 7) { /* in U1.16 */ - tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor; + tmp = 1.0f / ctx->Line.StippleFactor; tmpi = tmp * (1<<16); OUT_BATCH(tmpi << 15 | ctx->Line.StippleFactor); - } - else { + } else { /* in U1.13 */ - tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor; + tmp = 1.0f / ctx->Line.StippleFactor; tmpi = tmp * (1<<13); OUT_BATCH(tmpi << 16 | ctx->Line.StippleFactor); } @@ -856,7 +855,6 @@ const struct brw_tracked_state brw_line_stipple = { .emit = upload_line_stipple }; - void brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline) { @@ -872,11 +870,9 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline) ADVANCE_BATCH(); } - -/*********************************************************************** +/** * Misc invariant state packets */ - void brw_upload_invariant_state(struct brw_context *brw) { @@ -930,7 +926,8 @@ const struct brw_tracked_state brw_invariant_state = { * surface state objects, but not the surfaces that the surface state * 
objects point to. */ -static void upload_state_base_address( struct brw_context *brw ) +static void +upload_state_base_address(struct brw_context *brw) { /* FINISHME: According to section 3.6.1 "STATE_BASE_ADDRESS" of * vol1a of the G45 PRM, MI_FLUSH with the ISC invalidate should be diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index e7e16b6686a..79e31d86759 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -27,19 +27,27 @@ #include "program/prog_to_nir.h" static void -nir_optimize(nir_shader *nir) +nir_optimize(nir_shader *nir, bool is_scalar) { bool progress; do { progress = false; nir_lower_vars_to_ssa(nir); nir_validate_shader(nir); - nir_lower_alu_to_scalar(nir); - nir_validate_shader(nir); + + if (is_scalar) { + nir_lower_alu_to_scalar(nir); + nir_validate_shader(nir); + } + progress |= nir_copy_prop(nir); nir_validate_shader(nir); - nir_lower_phis_to_scalar(nir); - nir_validate_shader(nir); + + if (is_scalar) { + nir_lower_phis_to_scalar(nir); + nir_validate_shader(nir); + } + progress |= nir_copy_prop(nir); nir_validate_shader(nir); progress |= nir_opt_dce(nir); @@ -57,33 +65,12 @@ nir_optimize(nir_shader *nir) } while (progress); } -static bool -count_nir_instrs_in_block(nir_block *block, void *state) -{ - int *count = (int *) state; - nir_foreach_instr(block, instr) { - *count = *count + 1; - } - return true; -} - -static int -count_nir_instrs(nir_shader *nir) -{ - int count = 0; - nir_foreach_overload(nir, overload) { - if (!overload->impl) - continue; - nir_foreach_block(overload->impl, count_nir_instrs_in_block, &count); - } - return count; -} - nir_shader * brw_create_nir(struct brw_context *brw, const struct gl_shader_program *shader_prog, const struct gl_program *prog, - gl_shader_stage stage) + gl_shader_stage stage, + bool is_scalar) { struct gl_context *ctx = &brw->ctx; const nir_shader_compiler_options *options = @@ -100,16 +87,15 @@ brw_create_nir(struct brw_context *brw, } nir_validate_shader(nir); - brw_process_nir(nir, brw->intelScreen->devinfo, shader_prog, stage); + brw_process_nir(nir, brw->intelScreen->devinfo, shader_prog, stage, is_scalar); static GLuint msg_id = 0; _mesa_gl_debug(&brw->ctx, &msg_id, MESA_DEBUG_SOURCE_SHADER_COMPILER, MESA_DEBUG_TYPE_OTHER, MESA_DEBUG_SEVERITY_NOTIFICATION, - "%s NIR shader: %d inst\n", - _mesa_shader_stage_to_abbrev(stage), - count_nir_instrs(nir)); + "%s NIR shader:\n", + _mesa_shader_stage_to_abbrev(stage)); return nir; } @@ -118,7 +104,7 @@ void brw_process_nir(nir_shader *nir, const struct brw_device_info *devinfo, const struct gl_shader_program *shader_prog, - gl_shader_stage stage) + gl_shader_stage stage, bool is_scalar) { bool debug_enabled = INTEL_DEBUG & intel_debug_flag_for_shader_stage(stage); @@ -134,22 +120,33 @@ brw_process_nir(nir_shader *nir, nir_split_var_copies(nir); nir_validate_shader(nir); - nir_optimize(nir); + nir_optimize(nir, is_scalar); /* Lower a bunch of stuff */ nir_lower_var_copies(nir); nir_validate_shader(nir); /* Get rid of split copies */ - nir_optimize(nir); + nir_optimize(nir, is_scalar); + + if (is_scalar) { + nir_assign_var_locations_direct_first(nir, &nir->uniforms, + &nir->num_direct_uniforms, + &nir->num_uniforms, + is_scalar); + nir_assign_var_locations(&nir->outputs, &nir->num_outputs, is_scalar); + } else { + nir_assign_var_locations(&nir->uniforms, + &nir->num_uniforms, + is_scalar); - nir_assign_var_locations_scalar_direct_first(nir, &nir->uniforms, - &nir->num_direct_uniforms, - 
&nir->num_uniforms); - nir_assign_var_locations_scalar(&nir->inputs, &nir->num_inputs); - nir_assign_var_locations_scalar(&nir->outputs, &nir->num_outputs); + foreach_list_typed(nir_variable, var, node, &nir->outputs) + var->data.driver_location = var->data.location; + } + nir_assign_var_locations(&nir->inputs, &nir->num_inputs, is_scalar); + + nir_lower_io(nir, is_scalar); - nir_lower_io(nir); nir_validate_shader(nir); nir_remove_dead_variables(nir); @@ -168,7 +165,7 @@ brw_process_nir(nir_shader *nir, nir_lower_atomics(nir); nir_validate_shader(nir); - nir_optimize(nir); + nir_optimize(nir, is_scalar); if (devinfo->gen >= 6) { /* Try and fuse multiply-adds */ @@ -201,9 +198,14 @@ brw_process_nir(nir_shader *nir, nir_print_shader(nir, stderr); } - nir_convert_from_ssa(nir); + nir_convert_from_ssa(nir, is_scalar); nir_validate_shader(nir); + if (!is_scalar) { + nir_lower_vec_to_movs(nir); + nir_validate_shader(nir); + } + /* This is the last pass we run before we start emitting stuff. It * determines when we need to insert boolean resolves on Gen <= 5. We * run it last because it stashes data in instr->pass_flags and we don't @@ -220,3 +222,42 @@ brw_process_nir(nir_shader *nir, nir_print_shader(nir, stderr); } } + +enum brw_reg_type +brw_type_for_nir_type(nir_alu_type type) +{ + switch (type) { + case nir_type_unsigned: + return BRW_REGISTER_TYPE_UD; + case nir_type_bool: + case nir_type_int: + return BRW_REGISTER_TYPE_D; + case nir_type_float: + return BRW_REGISTER_TYPE_F; + default: + unreachable("unknown type"); + } + + return BRW_REGISTER_TYPE_F; +} + +/* Returns the glsl_base_type corresponding to a nir_alu_type. + * This is used by both brw_vec4_nir and brw_fs_nir. + */ +enum glsl_base_type +brw_glsl_base_type_for_nir_type(nir_alu_type type) +{ + switch (type) { + case nir_type_float: + return GLSL_TYPE_FLOAT; + + case nir_type_int: + return GLSL_TYPE_INT; + + case nir_type_unsigned: + return GLSL_TYPE_UINT; + + default: + unreachable("bad type"); + } +} diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h index 8487cef0901..5a1358890cc 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.h +++ b/src/mesa/drivers/dri/i965/brw_nir.h @@ -24,6 +24,7 @@ #pragma once #include "brw_context.h" +#include "brw_reg.h" #include "glsl/nir/nir.h" #ifdef __cplusplus @@ -77,13 +78,18 @@ void brw_nir_analyze_boolean_resolves(nir_shader *nir); nir_shader *brw_create_nir(struct brw_context *brw, const struct gl_shader_program *shader_prog, const struct gl_program *prog, - gl_shader_stage stage); + gl_shader_stage stage, + bool is_scalar); + +enum brw_reg_type brw_type_for_nir_type(nir_alu_type type); + +enum glsl_base_type brw_glsl_base_type_for_nir_type(nir_alu_type type); void brw_process_nir(nir_shader *nir, const struct brw_device_info *devinfo, const struct gl_shader_program *shader_prog, - gl_shader_stage stage); + gl_shader_stage stage, bool is_scalar); #ifdef __cplusplus } diff --git a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c index f0b018cf84a..c995d2b7e2d 100644 --- a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c +++ b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c @@ -43,8 +43,8 @@ static uint8_t get_resolve_status_for_src(nir_src *src) { - nir_instr *src_instr = nir_src_get_parent_instr(src); - if (src_instr) { + if (src->is_ssa) { + nir_instr *src_instr = src->ssa->parent_instr; uint8_t resolve_status = src_instr->pass_flags & BRW_NIR_BOOLEAN_MASK; /* 
If the source instruction needs resolve, then from the perspective @@ -66,8 +66,8 @@ get_resolve_status_for_src(nir_src *src) static bool src_mark_needs_resolve(nir_src *src, void *void_state) { - nir_instr *src_instr = nir_src_get_parent_instr(src); - if (src_instr) { + if (src->is_ssa) { + nir_instr *src_instr = src->ssa->parent_instr; uint8_t resolve_status = src_instr->pass_flags & BRW_NIR_BOOLEAN_MASK; /* If the source instruction is unresolved, then mark it as needing @@ -109,28 +109,27 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state) uint8_t resolve_status; nir_alu_instr *alu = nir_instr_as_alu(instr); switch (alu->op) { - case nir_op_flt: - case nir_op_ilt: - case nir_op_ult: - case nir_op_fge: - case nir_op_ige: - case nir_op_uge: - case nir_op_feq: - case nir_op_ieq: - case nir_op_fne: - case nir_op_ine: - case nir_op_f2b: - case nir_op_i2b: - /* This instruction will turn into a CMP when we actually emit - * so the result will have to be resolved before it can be used. + case nir_op_bany2: + case nir_op_bany3: + case nir_op_bany4: + case nir_op_ball_fequal2: + case nir_op_ball_iequal2: + case nir_op_ball_fequal3: + case nir_op_ball_iequal3: + case nir_op_ball_fequal4: + case nir_op_ball_iequal4: + case nir_op_bany_fnequal2: + case nir_op_bany_inequal2: + case nir_op_bany_fnequal3: + case nir_op_bany_inequal3: + case nir_op_bany_fnequal4: + case nir_op_bany_inequal4: + /* These are only implemented by the vec4 backend and its + * implementation emits resolved booleans. At some point in the + * future, this may change and we'll have to remove some of the + * above cases. */ - resolve_status = BRW_NIR_BOOLEAN_UNRESOLVED; - - /* Even though the destination is allowed to be left unresolved, - * the sources are treated as regular integers or floats so - * they need to be resolved. - */ - nir_foreach_src(instr, src_mark_needs_resolve, NULL); + resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE; break; case nir_op_imov: @@ -169,14 +168,28 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state) } default: - resolve_status = BRW_NIR_NON_BOOLEAN; + if (nir_op_infos[alu->op].output_type == nir_type_bool) { + /* This instructions will turn into a CMP when we actually emit + * them so the result will have to be resolved before it can be + * used. + */ + resolve_status = BRW_NIR_BOOLEAN_UNRESOLVED; + + /* Even though the destination is allowed to be left + * unresolved, the sources are treated as regular integers or + * floats so they need to be resolved. + */ + nir_foreach_src(instr, src_mark_needs_resolve, NULL); + } else { + resolve_status = BRW_NIR_NON_BOOLEAN; + } } - /* If the destination is SSA-like, go ahead allow unresolved booleans. + /* If the destination is SSA, go ahead allow unresolved booleans. * If the destination register doesn't have a well-defined parent_instr * we need to resolve immediately. 
*/ - if (alu->dest.dest.reg.reg->parent_instr == NULL && + if (!alu->dest.dest.is_ssa && resolve_status == BRW_NIR_BOOLEAN_UNRESOLVED) { resolve_status = BRW_NIR_BOOLEAN_NEEDS_RESOLVE; } diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c index 2c8cd491a8e..7e90e8a8fa1 100644 --- a/src/mesa/drivers/dri/i965/brw_performance_monitor.c +++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c @@ -581,7 +581,7 @@ snapshot_statistics_registers(struct brw_context *brw, const int group = PIPELINE_STATS_COUNTERS; const int num_counters = ctx->PerfMonitor.Groups[group].NumCounters; - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); for (int i = 0; i < num_counters; i++) { if (BITSET_TEST(monitor->base.ActiveCounters[group], i)) { @@ -687,7 +687,7 @@ stop_oa_counters(struct brw_context *brw) * The amount of batch space it takes to emit an MI_REPORT_PERF_COUNT snapshot, * including the required PIPE_CONTROL flushes. * - * Sandybridge is the worst case scenario: intel_batchbuffer_emit_mi_flush + * Sandybridge is the worst case scenario: brw_emit_mi_flush * expands to three PIPE_CONTROLs which are 4 DWords each. We have to flush * before and after MI_REPORT_PERF_COUNT, so multiply by two. Finally, add * the 3 DWords for MI_REPORT_PERF_COUNT itself. @@ -710,10 +710,10 @@ emit_mi_report_perf_count(struct brw_context *brw, /* Make sure the commands to take a snapshot fits in a single batch. */ intel_batchbuffer_require_space(brw, MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4, RENDER_RING); - int batch_used = brw->batch.used; + int batch_used = USED_BATCH(brw->batch); /* Reports apparently don't always get written unless we flush first. */ - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); if (brw->gen == 5) { /* Ironlake requires two MI_REPORT_PERF_COUNT commands to write all @@ -751,10 +751,10 @@ emit_mi_report_perf_count(struct brw_context *brw, } /* Reports apparently don't always get written unless we flush after. */ - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); (void) batch_used; - assert(brw->batch.used - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4); + assert(USED_BATCH(brw->batch) - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4); } /** @@ -1386,7 +1386,7 @@ void brw_perf_monitor_new_batch(struct brw_context *brw) { assert(brw->batch.ring == RENDER_RING); - assert(brw->gen < 6 || brw->batch.used == 0); + assert(brw->gen < 6 || USED_BATCH(brw->batch) == 0); if (brw->perfmon.oa_users == 0) return; diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c new file mode 100644 index 00000000000..7ee3cb680f7 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c @@ -0,0 +1,359 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_context.h" +#include "intel_batchbuffer.h" +#include "intel_fbo.h" +#include "intel_reg.h" + +/** + * According to the latest documentation, any PIPE_CONTROL with the + * "Command Streamer Stall" bit set must also have another bit set, + * with five different options: + * + * - Render Target Cache Flush + * - Depth Cache Flush + * - Stall at Pixel Scoreboard + * - Post-Sync Operation + * - Depth Stall + * + * I chose "Stall at Pixel Scoreboard" since we've used it effectively + * in the past, but the choice is fairly arbitrary. + */ +static void +gen8_add_cs_stall_workaround_bits(uint32_t *flags) +{ + uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_WRITE_IMMEDIATE | + PIPE_CONTROL_WRITE_DEPTH_COUNT | + PIPE_CONTROL_WRITE_TIMESTAMP | + PIPE_CONTROL_STALL_AT_SCOREBOARD | + PIPE_CONTROL_DEPTH_STALL; + + /* If we're doing a CS stall, and don't already have one of the + * workaround bits set, add "Stall at Pixel Scoreboard." + */ + if ((*flags & PIPE_CONTROL_CS_STALL) != 0 && (*flags & wa_bits) == 0) + *flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD; +} + +/* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT: + * + * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with + * only read-cache-invalidate bit(s) set, must have a CS_STALL bit set." + * + * Note that the kernel does CS stalls between batches, so we only need + * to count them within a batch. + */ +static uint32_t +gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags) +{ + if (brw->gen == 7 && !brw->is_haswell) { + if (flags & PIPE_CONTROL_CS_STALL) { + /* If we're doing a CS stall, reset the counter and carry on. */ + brw->pipe_controls_since_last_cs_stall = 0; + return 0; + } + + /* If this is the fourth pipe control without a CS stall, do one now. */ + if (++brw->pipe_controls_since_last_cs_stall == 4) { + brw->pipe_controls_since_last_cs_stall = 0; + return PIPE_CONTROL_CS_STALL; + } + } + return 0; +} + +/** + * Emit a PIPE_CONTROL with various flushing flags. + * + * The caller is responsible for deciding what flags are appropriate for the + * given generation. + */ +void +brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags) +{ + if (brw->gen >= 8) { + gen8_add_cs_stall_workaround_bits(&flags); + + BEGIN_BATCH(6); + OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2)); + OUT_BATCH(flags); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + ADVANCE_BATCH(); + } else if (brw->gen >= 6) { + flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags); + + BEGIN_BATCH(5); + OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2)); + OUT_BATCH(flags); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + ADVANCE_BATCH(); + } else { + BEGIN_BATCH(4); + OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2)); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + ADVANCE_BATCH(); + } +} + +/** + * Emit a PIPE_CONTROL that writes to a buffer object. 
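The WaCsStallAtEveryFourthPipecontrol counter above is easiest to see in isolation; a self-contained sketch of its behaviour, with the CS-stall bit passed in rather than using the PIPE_CONTROL_CS_STALL define and the counter made a file-scope static purely for illustration:

#include <stdint.h>

static unsigned pipe_controls_since_last_cs_stall;

static uint32_t
maybe_add_cs_stall(uint32_t flags, uint32_t cs_stall_bit)
{
   if (flags & cs_stall_bit) {
      /* An explicit CS stall resets the counter. */
      pipe_controls_since_last_cs_stall = 0;
      return 0;
   }

   /* Every fourth PIPE_CONTROL without one gets a CS stall added. */
   if (++pipe_controls_since_last_cs_stall == 4) {
      pipe_controls_since_last_cs_stall = 0;
      return cs_stall_bit;
   }

   return 0;
}

/* Three consecutive calls return 0; the fourth returns the CS stall bit. */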
+ * + * \p flags should contain one of the following items: + * - PIPE_CONTROL_WRITE_IMMEDIATE + * - PIPE_CONTROL_WRITE_TIMESTAMP + * - PIPE_CONTROL_WRITE_DEPTH_COUNT + */ +void +brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags, + drm_intel_bo *bo, uint32_t offset, + uint32_t imm_lower, uint32_t imm_upper) +{ + if (brw->gen >= 8) { + gen8_add_cs_stall_workaround_bits(&flags); + + BEGIN_BATCH(6); + OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2)); + OUT_BATCH(flags); + OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + offset); + OUT_BATCH(imm_lower); + OUT_BATCH(imm_upper); + ADVANCE_BATCH(); + } else if (brw->gen >= 6) { + flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags); + + /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24 + * on later platforms. We always use PPGTT on Gen7+. + */ + unsigned gen6_gtt = brw->gen == 6 ? PIPE_CONTROL_GLOBAL_GTT_WRITE : 0; + + BEGIN_BATCH(5); + OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2)); + OUT_BATCH(flags); + OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + gen6_gtt | offset); + OUT_BATCH(imm_lower); + OUT_BATCH(imm_upper); + ADVANCE_BATCH(); + } else { + BEGIN_BATCH(4); + OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2)); + OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + PIPE_CONTROL_GLOBAL_GTT_WRITE | offset); + OUT_BATCH(imm_lower); + OUT_BATCH(imm_upper); + ADVANCE_BATCH(); + } +} + +/** + * Restriction [DevSNB, DevIVB]: + * + * Prior to changing Depth/Stencil Buffer state (i.e. any combination of + * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER, + * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall + * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth + * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by + * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set), + * unless SW can otherwise guarantee that the pipeline from WM onwards is + * already flushed (e.g., via a preceding MI_FLUSH). + */ +void +brw_emit_depth_stall_flushes(struct brw_context *brw) +{ + assert(brw->gen >= 6 && brw->gen <= 9); + + brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL); + brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH); + brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL); +} + +/** + * From the Ivybridge PRM, Volume 2 Part 1, Section 3.2 (VS Stage Input): + * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth + * stall needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS, + * 3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS, + * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one PIPE_CONTROL needs + * to be sent before any combination of VS associated 3DSTATE." + */ +void +gen7_emit_vs_workaround_flush(struct brw_context *brw) +{ + assert(brw->gen == 7); + brw_emit_pipe_control_write(brw, + PIPE_CONTROL_WRITE_IMMEDIATE + | PIPE_CONTROL_DEPTH_STALL, + brw->workaround_bo, 0, + 0, 0); +} + + +/** + * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set. + */ +void +gen7_emit_cs_stall_flush(struct brw_context *brw) +{ + brw_emit_pipe_control_write(brw, + PIPE_CONTROL_CS_STALL + | PIPE_CONTROL_WRITE_IMMEDIATE, + brw->workaround_bo, 0, + 0, 0); +} + + +/** + * Emits a PIPE_CONTROL with a non-zero post-sync operation, for + * implementing two workarounds on gen6. 
From section 1.4.7.1 + * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1: + * + * [DevSNB-C+{W/A}] Before any depth stall flush (including those + * produced by non-pipelined state commands), software needs to first + * send a PIPE_CONTROL with no bits set except Post-Sync Operation != + * 0. + * + * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable + * =1, a PIPE_CONTROL with any non-zero post-sync-op is required. + * + * And the workaround for these two requires this workaround first: + * + * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent + * BEFORE the pipe-control with a post-sync op and no write-cache + * flushes. + * + * And this last workaround is tricky because of the requirements on + * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM + * volume 2 part 1: + * + * "1 of the following must also be set: + * - Render Target Cache Flush Enable ([12] of DW1) + * - Depth Cache Flush Enable ([0] of DW1) + * - Stall at Pixel Scoreboard ([1] of DW1) + * - Depth Stall ([13] of DW1) + * - Post-Sync Operation ([13] of DW1) + * - Notify Enable ([8] of DW1)" + * + * The cache flushes require the workaround flush that triggered this + * one, so we can't use it. Depth stall would trigger the same. + * Post-sync nonzero is what triggered this second workaround, so we + * can't use that one either. Notify enable is IRQs, which aren't + * really our business. That leaves only stall at scoreboard. + */ +void +brw_emit_post_sync_nonzero_flush(struct brw_context *brw) +{ + brw_emit_pipe_control_flush(brw, + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_STALL_AT_SCOREBOARD); + + brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE, + brw->workaround_bo, 0, 0, 0); +} + +/* Emit a pipelined flush to either flush render and texture cache for + * reading from a FBO-drawn texture, or flush so that frontbuffer + * render appears on the screen in DRI1. + * + * This is also used for the always_flush_cache driconf debug option. + */ +void +brw_emit_mi_flush(struct brw_context *brw) +{ + if (brw->batch.ring == BLT_RING && brw->gen >= 6) { + BEGIN_BATCH_BLT(4); + OUT_BATCH(MI_FLUSH_DW); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + ADVANCE_BATCH(); + } else { + int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_RENDER_TARGET_FLUSH; + if (brw->gen >= 6) { + if (brw->gen == 9) { + /* Hardware workaround: SKL + * + * Emit Pipe Control with all bits set to zero before emitting + * a Pipe Control with VF Cache Invalidate set. + */ + brw_emit_pipe_control_flush(brw, 0); + } + + flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_VF_CACHE_INVALIDATE | + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CS_STALL; + + if (brw->gen == 6) { + /* Hardware workaround: SNB B-Spec says: + * + * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache + * Flush Enable =1, a PIPE_CONTROL with any non-zero + * post-sync-op is required. + */ + brw_emit_post_sync_nonzero_flush(brw); + } + } + brw_emit_pipe_control_flush(brw, flags); + } + + brw_render_cache_set_clear(brw); +} + +int +brw_init_pipe_control(struct brw_context *brw, + const struct brw_device_info *devinfo) +{ + if (devinfo->gen < 6) + return 0; + + /* We can't just use brw_state_batch to get a chunk of space for + * the gen6 workaround because it involves actually writing to + * the buffer, and the kernel doesn't let us write to the batch. 
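For reference, the gen9 render-ring path of brw_emit_mi_flush() above boils down to a two-step sequence; this is a sketch of what the function emits, using only the flags listed in that hunk:

    /* SKL workaround: a fully zeroed PIPE_CONTROL first, then the real
     * flush with VF Cache Invalidate (and the other flush/invalidate
     * bits) set.
     */
    brw_emit_pipe_control_flush(brw, 0);
    brw_emit_pipe_control_flush(brw, PIPE_CONTROL_NO_WRITE |
                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                     PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                     PIPE_CONTROL_VF_CACHE_INVALIDATE |
                                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                     PIPE_CONTROL_CS_STALL);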
+ */ + brw->workaround_bo = drm_intel_bo_alloc(brw->bufmgr, + "pipe_control workaround", + 4096, 4096); + if (brw->workaround_bo == NULL) + return -ENOMEM; + + brw->pipe_controls_since_last_cs_stall = 0; + + return 0; +} + +void +brw_fini_pipe_control(struct brw_context *brw) +{ + drm_intel_bo_unreference(brw->workaround_bo); +} diff --git a/src/mesa/drivers/dri/i965/brw_primitive_restart.c b/src/mesa/drivers/dri/i965/brw_primitive_restart.c index 2c7a7e8b8dd..6ed79d7cb75 100644 --- a/src/mesa/drivers/dri/i965/brw_primitive_restart.c +++ b/src/mesa/drivers/dri/i965/brw_primitive_restart.c @@ -161,7 +161,8 @@ brw_handle_primitive_restart(struct gl_context *ctx, /* Cut index should work for primitive restart, so use it */ brw->prim_restart.enable_cut_index = true; - brw_draw_prims(ctx, prims, nr_prims, ib, GL_FALSE, -1, -1, NULL, indirect); + brw_draw_prims(ctx, prims, nr_prims, ib, GL_FALSE, -1, -1, NULL, 0, + indirect); brw->prim_restart.enable_cut_index = false; } else { /* Not all the primitive draw modes are supported by the cut index, diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index ea128ccb670..5a54cd39076 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -143,7 +143,7 @@ brwProgramStringNotify(struct gl_context *ctx, brw_add_texrect_params(prog); if (ctx->Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) { - prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT); + prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true); } brw_fs_precompile(ctx, NULL, prog); @@ -169,7 +169,8 @@ brwProgramStringNotify(struct gl_context *ctx, brw_add_texrect_params(prog); if (ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) { - prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX); + prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX, + brw->intelScreen->compiler->scalar_vs); } brw_vs_precompile(ctx, NULL, prog); @@ -196,7 +197,7 @@ brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers) unsigned bits = (PIPE_CONTROL_DATA_CACHE_INVALIDATE | PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_CS_STALL); - assert(brw->gen >= 7 && brw->gen <= 8); + assert(brw->gen >= 7 && brw->gen <= 9); if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT | GL_ELEMENT_ARRAY_BARRIER_BIT | @@ -574,10 +575,13 @@ brw_dump_ir(const char *stage, struct gl_shader_program *shader_prog, struct gl_shader *shader, struct gl_program *prog) { if (shader_prog) { - fprintf(stderr, - "GLSL IR for native %s shader %d:\n", stage, shader_prog->Name); - _mesa_print_ir(stderr, shader->ir, NULL); - fprintf(stderr, "\n\n"); + if (shader->ir) { + fprintf(stderr, + "GLSL IR for native %s shader %d:\n", + stage, shader_prog->Name); + _mesa_print_ir(stderr, shader->ir, NULL); + fprintf(stderr, "\n\n"); + } } else { fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n", stage, prog->Id, stage); diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c index aea4d9b77d3..d6b012c392e 100644 --- a/src/mesa/drivers/dri/i965/brw_queryobj.c +++ b/src/mesa/drivers/dri/i965/brw_queryobj.c @@ -497,13 +497,22 @@ brw_get_timestamp(struct gl_context *ctx) struct brw_context *brw = brw_context(ctx); uint64_t result = 0; - drm_intel_reg_read(brw->bufmgr, TIMESTAMP, &result); + switch (brw->intelScreen->hw_has_timestamp) { + case 3: /* New kernel, always full 36bit accuracy */ + drm_intel_reg_read(brw->bufmgr, TIMESTAMP | 1, &result); + break; + 
case 2: /* 64bit kernel, result is left-shifted by 32bits, losing 4bits */ + drm_intel_reg_read(brw->bufmgr, TIMESTAMP, &result); + result = result >> 32; + break; + case 1: /* 32bit kernel, result is 36bit wide but may be inaccurate! */ + drm_intel_reg_read(brw->bufmgr, TIMESTAMP, &result); + break; + } /* See logic in brw_queryobj_get_results() */ - result = result >> 32; result *= 80; result &= (1ull << 36) - 1; - return result; } diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h index c8b134103bb..31806f769bd 100644 --- a/src/mesa/drivers/dri/i965/brw_reg.h +++ b/src/mesa/drivers/dri/i965/brw_reg.h @@ -853,7 +853,7 @@ static inline struct brw_reg spread(struct brw_reg reg, unsigned s) { if (s) { - assert(is_power_of_two(s)); + assert(_mesa_is_pow_two(s)); if (reg.hstride) reg.hstride += cvt(s) - 1; @@ -950,6 +950,12 @@ brw_set_writemask(struct brw_reg reg, unsigned mask) return reg; } +static inline unsigned +brw_writemask_for_size(unsigned n) +{ + return (1 << n) - 1; +} + static inline struct brw_reg negate(struct brw_reg reg) { diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c index 22ccbfe8461..2021bb3b460 100644 --- a/src/mesa/drivers/dri/i965/brw_sampler_state.c +++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c @@ -425,11 +425,11 @@ brw_update_sampler_state(struct brw_context *brw, /* Enable anisotropic filtering if desired. */ unsigned max_anisotropy = BRW_ANISORATIO_2; - if (sampler->MaxAnisotropy > 1.0) { + if (sampler->MaxAnisotropy > 1.0f) { min_filter = BRW_MAPFILTER_ANISOTROPIC; mag_filter = BRW_MAPFILTER_ANISOTROPIC; - if (sampler->MaxAnisotropy > 2.0) { + if (sampler->MaxAnisotropy > 2.0f) { max_anisotropy = MIN2((sampler->MaxAnisotropy - 2) / 2, BRW_ANISORATIO_16); } diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index ee0add5d765..b49961fff68 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -1314,8 +1314,8 @@ fs_instruction_scheduler::choose_instruction_to_schedule() * single-result send is probably actually reducing register * pressure. 
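A quick arithmetic check on the brw_get_timestamp() change above: the raw register counts 80 ns ticks, and after the multiply the value handed back to GL is masked to 36 bits of nanoseconds, so it wraps fairly quickly. A standalone sketch of that conversion (not part of the patch):

    #include <stdint.h>

    static uint64_t timestamp_ns(uint64_t ticks)
    {
       uint64_t ns = ticks * 80;        /* 80 ns per tick */
       /* (1ull << 36) ns = 68,719,476,736 ns, so the value returned to
        * GL_TIMESTAMP queries wraps roughly every 68.7 seconds.
        */
       return ns & ((1ull << 36) - 1);
    }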
*/ - if (inst->regs_written <= inst->dst.width / 8 && - chosen_inst->regs_written > chosen_inst->dst.width / 8) { + if (inst->regs_written <= inst->exec_size / 8 && + chosen_inst->regs_written > chosen_inst->exec_size / 8) { chosen = n; continue; } else if (inst->regs_written > chosen_inst->regs_written) { diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c index 5d9892214a9..b126f82ebbf 100644 --- a/src/mesa/drivers/dri/i965/brw_sf_state.c +++ b/src/mesa/drivers/dri/i965/brw_sf_state.c @@ -45,7 +45,7 @@ static void upload_sf_vp(struct brw_context *brw) struct gl_context *ctx = &brw->ctx; struct brw_sf_viewport *sfv; GLfloat y_scale, y_bias; - double scale[3], translate[3]; + float scale[3], translate[3]; const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer); sfv = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE, @@ -220,7 +220,7 @@ static void upload_sf_unit( struct brw_context *brw ) /* _NEW_LINE */ sf->sf6.line_width = - CLAMP(ctx->Line.Width, 1.0, ctx->Const.MaxLineWidth) * (1<<1); + CLAMP(ctx->Line.Width, 1.0f, ctx->Const.MaxLineWidth) * (1<<1); sf->sf6.line_endcap_aa_region_width = 1; if (ctx->Line.SmoothFlag) @@ -259,9 +259,10 @@ static void upload_sf_unit( struct brw_context *brw ) /* _NEW_POINT */ sf->sf7.sprite_point = ctx->Point.PointSprite; - sf->sf7.point_size = CLAMP(rint(CLAMP(ctx->Point.Size, - ctx->Point.MinSize, - ctx->Point.MaxSize)), 1, 255) * (1<<3); + sf->sf7.point_size = CLAMP(rintf(CLAMP(ctx->Point.Size, + ctx->Point.MinSize, + ctx->Point.MaxSize)), 1.0f, 255.0f) * + (1<<3); /* _NEW_PROGRAM | _NEW_POINT */ sf->sf7.use_point_size_state = !(ctx->VertexProgram.PointSizeEnabled || ctx->Point._Attenuated); diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index 06393c8ff2b..67b8dde7cc8 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -113,22 +113,32 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo) (i == MESA_SHADER_FRAGMENT); compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false; compiler->glsl_compiler_options[i].LowerClipDistance = true; + + /* !ARB_gpu_shader5 */ + if (devinfo->gen < 7) + compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true; } compiler->glsl_compiler_options[MESA_SHADER_VERTEX].OptimizeForAOS = true; compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].OptimizeForAOS = true; - if (compiler->scalar_vs) { - /* If we're using the scalar backend for vertex shaders, we need to - * configure these accordingly. - */ - compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectOutput = true; - compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectTemp = true; - compiler->glsl_compiler_options[MESA_SHADER_VERTEX].OptimizeForAOS = false; + if (compiler->scalar_vs || brw_env_var_as_boolean("INTEL_USE_NIR", false)) { + if (compiler->scalar_vs) { + /* If we're using the scalar backend for vertex shaders, we need to + * configure these accordingly. 
+ */ + compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectOutput = true; + compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectTemp = true; + compiler->glsl_compiler_options[MESA_SHADER_VERTEX].OptimizeForAOS = false; + } compiler->glsl_compiler_options[MESA_SHADER_VERTEX].NirOptions = nir_options; } + if (brw_env_var_as_boolean("INTEL_USE_NIR", false)) { + compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].NirOptions = nir_options; + } + compiler->glsl_compiler_options[MESA_SHADER_FRAGMENT].NirOptions = nir_options; compiler->glsl_compiler_options[MESA_SHADER_COMPUTE].NirOptions = nir_options; @@ -229,7 +239,8 @@ brw_lower_packing_builtins(struct brw_context *brw, } static void -process_glsl_ir(struct brw_context *brw, +process_glsl_ir(gl_shader_stage stage, + struct brw_context *brw, struct gl_shader_program *shader_prog, struct gl_shader *shader) { @@ -255,7 +266,9 @@ process_glsl_ir(struct brw_context *brw, EXP_TO_EXP2 | LOG_TO_LOG2 | bitfield_insert | - LDEXP_TO_ARITH); + LDEXP_TO_ARITH | + CARRY_TO_ARITH | + BORROW_TO_ARITH); /* Pre-gen6 HW can only nest if-statements 16 deep. Beyond this, * if-statements need to be flattened. @@ -275,15 +288,17 @@ process_glsl_ir(struct brw_context *brw, lower_quadop_vector(shader->ir, false); bool lowered_variable_indexing = - lower_variable_index_to_cond_assign(shader->ir, + lower_variable_index_to_cond_assign((gl_shader_stage)stage, + shader->ir, options->EmitNoIndirectInput, options->EmitNoIndirectOutput, options->EmitNoIndirectTemp, options->EmitNoIndirectUniform); if (unlikely(brw->perf_debug && lowered_variable_indexing)) { - perf_debug("Unsupported form of variable indexing in FS; falling " - "back to very inefficient code generation\n"); + perf_debug("Unsupported form of variable indexing in %s; falling " + "back to very inefficient code generation\n", + _mesa_shader_stage_to_abbrev(shader->Stage)); } lower_ubo_reference(shader, shader->ir); @@ -308,7 +323,7 @@ process_glsl_ir(struct brw_context *brw, } while (progress); if (options->NirOptions != NULL) - lower_output_reads(shader->ir); + lower_output_reads(stage, shader->ir); validate_ir_tree(shader->ir); @@ -352,7 +367,7 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg) _mesa_copy_linked_program_data((gl_shader_stage) stage, shProg, prog); - process_glsl_ir(brw, shProg, shader); + process_glsl_ir((gl_shader_stage) stage, brw, shProg, shader); /* Make a pass over the IR to add state references for any built-in * uniforms that are used. This has to be done now (during linking). 
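The new CARRY_TO_ARITH | BORROW_TO_ARITH lowering flags in process_glsl_ir() above presumably expand uaddCarry()/usubBorrow() into plain unsigned arithmetic, roughly the identities below (an assumption based on the flag names; the actual lowering lives in the GLSL compiler, not in this patch):

    #include <stdint.h>

    /* uaddCarry(a, b): the 32-bit sum wraps, so the carry is set exactly
     * when the result is smaller than one of the operands.
     */
    static void uadd_carry(uint32_t a, uint32_t b,
                           uint32_t *sum, uint32_t *carry)
    {
       *sum   = a + b;
       *carry = (*sum < a) ? 1u : 0u;
    }

    /* usubBorrow(a, b): a borrow occurs exactly when a < b. */
    static void usub_borrow(uint32_t a, uint32_t b,
                            uint32_t *diff, uint32_t *borrow)
    {
       *diff   = a - b;
       *borrow = (a < b) ? 1u : 0u;
    }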
@@ -387,8 +402,10 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg) brw_add_texrect_params(prog); - if (options->NirOptions) - prog->nir = brw_create_nir(brw, shProg, prog, (gl_shader_stage) stage); + if (options->NirOptions) { + prog->nir = brw_create_nir(brw, shProg, prog, (gl_shader_stage) stage, + is_scalar_shader_stage(brw, stage)); + } _mesa_reference_program(ctx, &prog, NULL); } @@ -422,6 +439,7 @@ brw_type_for_base_type(const struct glsl_type *type) return BRW_REGISTER_TYPE_F; case GLSL_TYPE_INT: case GLSL_TYPE_BOOL: + case GLSL_TYPE_SUBROUTINE: return BRW_REGISTER_TYPE_D; case GLSL_TYPE_UINT: return BRW_REGISTER_TYPE_UD; @@ -528,6 +546,8 @@ brw_instruction_name(enum opcode op) return opcode_descs[op].name; case FS_OPCODE_FB_WRITE: return "fb_write"; + case FS_OPCODE_FB_WRITE_LOGICAL: + return "fb_write_logical"; case FS_OPCODE_BLORP_FB_WRITE: return "blorp_fb_write"; case FS_OPCODE_REP_FB_WRITE: @@ -556,43 +576,80 @@ brw_instruction_name(enum opcode op) case SHADER_OPCODE_TEX: return "tex"; + case SHADER_OPCODE_TEX_LOGICAL: + return "tex_logical"; case SHADER_OPCODE_TXD: return "txd"; + case SHADER_OPCODE_TXD_LOGICAL: + return "txd_logical"; case SHADER_OPCODE_TXF: return "txf"; + case SHADER_OPCODE_TXF_LOGICAL: + return "txf_logical"; case SHADER_OPCODE_TXL: return "txl"; + case SHADER_OPCODE_TXL_LOGICAL: + return "txl_logical"; case SHADER_OPCODE_TXS: return "txs"; + case SHADER_OPCODE_TXS_LOGICAL: + return "txs_logical"; case FS_OPCODE_TXB: return "txb"; + case FS_OPCODE_TXB_LOGICAL: + return "txb_logical"; case SHADER_OPCODE_TXF_CMS: return "txf_cms"; + case SHADER_OPCODE_TXF_CMS_LOGICAL: + return "txf_cms_logical"; case SHADER_OPCODE_TXF_UMS: return "txf_ums"; + case SHADER_OPCODE_TXF_UMS_LOGICAL: + return "txf_ums_logical"; case SHADER_OPCODE_TXF_MCS: return "txf_mcs"; + case SHADER_OPCODE_TXF_MCS_LOGICAL: + return "txf_mcs_logical"; case SHADER_OPCODE_LOD: return "lod"; + case SHADER_OPCODE_LOD_LOGICAL: + return "lod_logical"; case SHADER_OPCODE_TG4: return "tg4"; + case SHADER_OPCODE_TG4_LOGICAL: + return "tg4_logical"; case SHADER_OPCODE_TG4_OFFSET: return "tg4_offset"; + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + return "tg4_offset_logical"; + case SHADER_OPCODE_SHADER_TIME_ADD: return "shader_time_add"; case SHADER_OPCODE_UNTYPED_ATOMIC: return "untyped_atomic"; + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + return "untyped_atomic_logical"; case SHADER_OPCODE_UNTYPED_SURFACE_READ: return "untyped_surface_read"; + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: + return "untyped_surface_read_logical"; case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: return "untyped_surface_write"; + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + return "untyped_surface_write_logical"; case SHADER_OPCODE_TYPED_ATOMIC: return "typed_atomic"; + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: + return "typed_atomic_logical"; case SHADER_OPCODE_TYPED_SURFACE_READ: return "typed_surface_read"; + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + return "typed_surface_read_logical"; case SHADER_OPCODE_TYPED_SURFACE_WRITE: return "typed_surface_write"; + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + return "typed_surface_write_logical"; case SHADER_OPCODE_MEMORY_FENCE: return "memory_fence"; @@ -653,8 +710,6 @@ brw_instruction_name(enum opcode op) case FS_OPCODE_DISCARD_JUMP: return "discard_jump"; - case FS_OPCODE_SET_OMASK: - return "set_omask"; case FS_OPCODE_SET_SAMPLE_ID: return "set_sample_id"; case FS_OPCODE_SET_SIMD4X2_OFFSET: @@ -724,6 +779,8 @@ 
brw_instruction_name(enum opcode op) return "cs_terminate"; case SHADER_OPCODE_BARRIER: return "barrier"; + case SHADER_OPCODE_MULH: + return "mulh"; } unreachable("not reached"); @@ -942,6 +999,7 @@ backend_instruction::is_commutative() const case BRW_OPCODE_XOR: case BRW_OPCODE_ADD: case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: return true; case BRW_OPCODE_SEL: /* MIN and MAX are commutative. */ @@ -1049,6 +1107,7 @@ backend_instruction::can_do_saturate() const case BRW_OPCODE_MATH: case BRW_OPCODE_MOV: case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: case BRW_OPCODE_PLN: case BRW_OPCODE_RNDD: case BRW_OPCODE_RNDE: @@ -1147,10 +1206,14 @@ backend_instruction::has_side_effects() const { switch (opcode) { case SHADER_OPCODE_UNTYPED_ATOMIC: + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: case SHADER_OPCODE_GEN4_SCRATCH_WRITE: case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: case SHADER_OPCODE_TYPED_ATOMIC: + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: case SHADER_OPCODE_TYPED_SURFACE_WRITE: + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: case SHADER_OPCODE_MEMORY_FENCE: case SHADER_OPCODE_URB_WRITE_SIMD8: case FS_OPCODE_FB_WRITE: @@ -1356,3 +1419,34 @@ backend_shader::assign_common_binding_table_offsets(uint32_t next_binding_table_ /* prog_data->base.binding_table.size will be set by brw_mark_surface_used. */ } + +void +backend_shader::setup_image_uniform_values(const gl_uniform_storage *storage) +{ + const unsigned stage = _mesa_program_enum_to_shader_stage(prog->Target); + + for (unsigned i = 0; i < MAX2(storage->array_elements, 1); i++) { + const unsigned image_idx = storage->image[stage].index + i; + const brw_image_param *param = &stage_prog_data->image_param[image_idx]; + + /* Upload the brw_image_param structure. The order is expected to match + * the BRW_IMAGE_PARAM_*_OFFSET defines. 
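The setup_vector_uniform_values() calls that follow push one small vector per field; read off the component counts, the per-image parameter block looks roughly like this (a hypothetical mirror of the upload order only, not the real brw_image_param definition):

    struct image_param_sketch {
       uint32_t surface_idx;   /* 1 component */
       uint32_t offset[2];     /* 2 components */
       uint32_t size[3];       /* 3 components */
       uint32_t stride[4];     /* 4 components */
       uint32_t tiling[3];     /* 3 components */
       uint32_t swizzling[2];  /* 2 components */
    };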
+ */ + setup_vector_uniform_values( + (const gl_constant_value *)¶m->surface_idx, 1); + setup_vector_uniform_values( + (const gl_constant_value *)param->offset, 2); + setup_vector_uniform_values( + (const gl_constant_value *)param->size, 3); + setup_vector_uniform_values( + (const gl_constant_value *)param->stride, 4); + setup_vector_uniform_values( + (const gl_constant_value *)param->tiling, 3); + setup_vector_uniform_values( + (const gl_constant_value *)param->swizzling, 2); + + brw_mark_surface_used( + stage_prog_data, + stage_prog_data->binding_table.image_start + image_idx); + } +} diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h index b2c1a0b8d69..2cc97f24972 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.h +++ b/src/mesa/drivers/dri/i965/brw_shader.h @@ -26,6 +26,7 @@ #include "brw_defines.h" #include "main/compiler.h" #include "glsl/ir.h" +#include "program/prog_parameter.h" #ifdef __cplusplus #include "brw_ir_allocator.h" @@ -268,6 +269,10 @@ public: void assign_common_binding_table_offsets(uint32_t next_binding_table_offset); virtual void invalidate_live_intervals() = 0; + + virtual void setup_vector_uniform_values(const gl_constant_value *values, + unsigned n) = 0; + void setup_image_uniform_values(const gl_uniform_storage *storage); }; uint32_t brw_texture_offset(int *offsets, unsigned num_components); diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h index 987672f8815..78a1f874b4e 100644 --- a/src/mesa/drivers/dri/i965/brw_state.h +++ b/src/mesa/drivers/dri/i965/brw_state.h @@ -72,8 +72,10 @@ extern const struct brw_tracked_state brw_vs_samplers; extern const struct brw_tracked_state brw_gs_samplers; extern const struct brw_tracked_state brw_vs_ubo_surfaces; extern const struct brw_tracked_state brw_vs_abo_surfaces; +extern const struct brw_tracked_state brw_vs_image_surfaces; extern const struct brw_tracked_state brw_gs_ubo_surfaces; extern const struct brw_tracked_state brw_gs_abo_surfaces; +extern const struct brw_tracked_state brw_gs_image_surfaces; extern const struct brw_tracked_state brw_vs_unit; extern const struct brw_tracked_state brw_gs_prog; extern const struct brw_tracked_state brw_wm_prog; @@ -84,7 +86,9 @@ extern const struct brw_tracked_state brw_gs_binding_table; extern const struct brw_tracked_state brw_vs_binding_table; extern const struct brw_tracked_state brw_wm_ubo_surfaces; extern const struct brw_tracked_state brw_wm_abo_surfaces; +extern const struct brw_tracked_state brw_wm_image_surfaces; extern const struct brw_tracked_state brw_cs_abo_surfaces; +extern const struct brw_tracked_state brw_cs_image_surfaces; extern const struct brw_tracked_state brw_wm_unit; extern const struct brw_tracked_state brw_interpolation_map; @@ -121,7 +125,6 @@ extern const struct brw_tracked_state gen6_wm_state; extern const struct brw_tracked_state gen7_depthbuffer; extern const struct brw_tracked_state gen7_clip_state; extern const struct brw_tracked_state gen7_disable_stages; -extern const struct brw_tracked_state gen7_gs_push_constants; extern const struct brw_tracked_state gen7_gs_state; extern const struct brw_tracked_state gen7_ps_state; extern const struct brw_tracked_state gen7_push_constant_space; @@ -132,6 +135,7 @@ extern const struct brw_tracked_state gen7_sol_state; extern const struct brw_tracked_state gen7_urb; extern const struct brw_tracked_state gen7_vs_state; extern const struct brw_tracked_state gen7_wm_state; +extern const struct brw_tracked_state 
gen7_hw_binding_tables; extern const struct brw_tracked_state haswell_cut_index; extern const struct brw_tracked_state gen8_blend_state; extern const struct brw_tracked_state gen8_disable_stages; @@ -266,15 +270,6 @@ void brw_update_renderbuffer_surfaces(struct brw_context *brw, uint32_t render_target_start, uint32_t *surf_offset); -/* gen7_wm_state.c */ -void -gen7_upload_ps_state(struct brw_context *brw, - const struct gl_fragment_program *fp, - const struct brw_stage_state *stage_state, - const struct brw_wm_prog_data *prog_data, - bool enable_dual_src_blend, unsigned sample_mask, - unsigned fast_clear_op); - /* gen7_wm_surface_state.c */ uint32_t gen7_surface_tiling_mode(uint32_t tiling); uint32_t gen7_surface_msaa_bits(unsigned num_samples, enum intel_msaa_layout l); @@ -372,6 +367,20 @@ gen7_upload_constant_state(struct brw_context *brw, const struct brw_stage_state *stage_state, bool active, unsigned opcode); +void gen7_rs_control(struct brw_context *brw, int enable); + +void gen7_edit_hw_binding_table_entry(struct brw_context *brw, + gl_shader_stage stage, + uint32_t index, + uint32_t surf_offset); +void gen7_update_binding_table_from_array(struct brw_context *brw, + gl_shader_stage stage, + const uint32_t* binding_table, + int num_surfaces); +void gen7_enable_hw_binding_tables(struct brw_context *brw); +void gen7_disable_hw_binding_tables(struct brw_context *brw); +void gen7_reset_hw_bt_pool_offsets(struct brw_context *brw); + #ifdef __cplusplus } #endif diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c index a405a80ef6e..d79e0ea00c7 100644 --- a/src/mesa/drivers/dri/i965/brw_state_batch.c +++ b/src/mesa/drivers/dri/i965/brw_state_batch.c @@ -87,7 +87,7 @@ brw_annotate_aub(struct brw_context *brw) drm_intel_aub_annotation annotations[annotation_count]; int a = 0; make_annotation(&annotations[a++], AUB_TRACE_TYPE_BATCH, 0, - 4*brw->batch.used); + 4 * USED_BATCH(brw->batch)); for (int i = brw->state_batch_count; i-- > 0; ) { uint32_t type = brw->state_batch_list[i].type; uint32_t start_offset = brw->state_batch_list[i].offset; @@ -136,7 +136,7 @@ __brw_state_batch(struct brw_context *brw, * space, then flush and try again. */ if (batch->state_batch_offset < size || - offset < 4*batch->used + batch->reserved_space) { + offset < 4 * USED_BATCH(*batch) + batch->reserved_space) { intel_batchbuffer_flush(brw); offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment); } diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c index 24778d25379..5effb4c8829 100644 --- a/src/mesa/drivers/dri/i965/brw_state_cache.c +++ b/src/mesa/drivers/dri/i965/brw_state_cache.c @@ -200,36 +200,23 @@ brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size) } /** - * Attempts to find an item in the cache with identical data and aux - * data to use + * Attempts to find an item in the cache with identical data. 
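The brw->batch.used to USED_BATCH(brw->batch) conversions in this and the earlier hunks suggest the DWord count is now derived from a write pointer rather than stored separately; the macro is presumably something along these lines (assumed shape only; the real definition lives in intel_batchbuffer.h and is not shown in this diff):

    /* Assumed: number of DWords written so far, measured as how far the
     * write pointer has advanced past the start of the batch map.
     */
    #define USED_BATCH(batch) ((uintptr_t)((batch).map_next - (batch).map))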
*/ -static bool -brw_try_upload_using_copy(struct brw_cache *cache, - struct brw_cache_item *result_item, - const void *data, - const void *aux) +static const struct brw_cache_item * +brw_lookup_prog(const struct brw_cache *cache, + enum brw_cache_id cache_id, + const void *data, unsigned data_size) { - struct brw_context *brw = cache->brw; + const struct brw_context *brw = cache->brw; int i; - struct brw_cache_item *item; + const struct brw_cache_item *item; for (i = 0; i < cache->size; i++) { for (item = cache->items[i]; item; item = item->next) { - const void *item_aux = item->key + item->key_size; int ret; - if (item->cache_id != result_item->cache_id || - item->size != result_item->size || - item->aux_size != result_item->aux_size) { - continue; - } - - if (cache->aux_compare[result_item->cache_id]) { - if (!cache->aux_compare[result_item->cache_id](item_aux, aux)) - continue; - } else if (memcmp(item_aux, aux, item->aux_size) != 0) { + if (item->cache_id != cache_id || item->size != data_size) continue; - } if (!brw->has_llc) drm_intel_bo_map(cache->bo, false); @@ -239,27 +226,24 @@ brw_try_upload_using_copy(struct brw_cache *cache, if (ret) continue; - result_item->offset = item->offset; - - return true; + return item; } } - return false; + return NULL; } -static void -brw_upload_item_data(struct brw_cache *cache, - struct brw_cache_item *item, - const void *data) +static uint32_t +brw_alloc_item_data(struct brw_cache *cache, uint32_t size) { + uint32_t offset; struct brw_context *brw = cache->brw; /* Allocate space in the cache BO for our new program. */ - if (cache->next_offset + item->size > cache->bo->size) { + if (cache->next_offset + size > cache->bo->size) { uint32_t new_size = cache->bo->size * 2; - while (cache->next_offset + item->size > new_size) + while (cache->next_offset + size > new_size) new_size *= 2; brw_cache_new_bo(cache, new_size); @@ -273,10 +257,12 @@ brw_upload_item_data(struct brw_cache *cache, brw_cache_new_bo(cache, cache->bo->size); } - item->offset = cache->next_offset; + offset = cache->next_offset; /* Programs are always 64-byte aligned, so set up the next one now */ - cache->next_offset = ALIGN(item->offset + item->size, 64); + cache->next_offset = ALIGN(offset + size, 64); + + return offset; } void @@ -293,6 +279,8 @@ brw_upload_cache(struct brw_cache *cache, { struct brw_context *brw = cache->brw; struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item); + const struct brw_cache_item *matching_data = + brw_lookup_prog(cache, cache_id, data, data_size); GLuint hash; void *tmp; @@ -304,15 +292,23 @@ brw_upload_cache(struct brw_cache *cache, hash = hash_key(item); item->hash = hash; - /* If we can find a matching prog/prog_data combo in the cache - * already, then reuse the existing stuff. This will mean not - * flagging CACHE_NEW_* when transitioning between the two - * equivalent hash keys. This is notably useful for programs - * generating shaders at runtime, where multiple shaders may - * compile to the thing in our backend. + /* If we can find a matching prog in the cache already, then reuse the + * existing stuff without creating new copy into the underlying buffer + * object. This is notably useful for programs generating shaders at + * runtime, where multiple shaders may compile to the same thing in our + * backend. 
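Taken together with the next hunk, the rewritten upload path now looks for byte-identical program code first and only allocates and copies on a miss; a sketch of the resulting flow, using the names introduced above:

    const struct brw_cache_item *match =
       brw_lookup_prog(cache, cache_id, data, data_size);

    if (match) {
       item->offset = match->offset;     /* reuse the existing copy */
    } else {
       item->offset = brw_alloc_item_data(cache, data_size);
       /* copy `data` into cache->bo at item->offset: memcpy() on LLC
        * systems, drm_intel_bo_subdata() otherwise */
    }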
*/ - if (!brw_try_upload_using_copy(cache, item, data, aux)) { - brw_upload_item_data(cache, item, data); + if (matching_data) { + item->offset = matching_data->offset; + } else { + item->offset = brw_alloc_item_data(cache, data_size); + + /* Copy data to the buffer */ + if (brw->has_llc) { + memcpy((char *)cache->bo->virtual + item->offset, data, data_size); + } else { + drm_intel_bo_subdata(cache->bo, item->offset, data_size, data); + } } /* Set up the memory containing the key and aux_data */ @@ -323,7 +319,7 @@ brw_upload_cache(struct brw_cache *cache, item->key = tmp; - if (cache->n_items > cache->size * 1.5) + if (cache->n_items > cache->size * 1.5f) rehash(cache); hash %= cache->size; @@ -331,13 +327,6 @@ brw_upload_cache(struct brw_cache *cache, cache->items[hash] = item; cache->n_items++; - /* Copy data to the buffer */ - if (brw->has_llc) { - memcpy((char *) cache->bo->virtual + item->offset, data, data_size); - } else { - drm_intel_bo_subdata(cache->bo, item->offset, data_size, data); - } - *out_offset = item->offset; *(void **)out_aux = (void *)((char *)item->key + item->key_size); cache->brw->ctx.NewDriverState |= 1 << cache_id; diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c index 08d1ac28885..9de42ce8503 100644 --- a/src/mesa/drivers/dri/i965/brw_state_upload.c +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c @@ -192,6 +192,12 @@ static const struct brw_tracked_state *gen7_render_atoms[] = &gen6_color_calc_state, /* must do before cc unit */ &gen6_depth_stencil_state, /* must do before cc unit */ + &gen7_hw_binding_tables, /* Enable hw-generated binding tables for Haswell */ + + &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */ + &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */ + &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */ + &gen6_vs_push_constants, /* Before vs_state */ &gen6_gs_push_constants, /* Before gs_state */ &gen6_wm_push_constants, /* Before wm_surfaces and constant_buffer */ @@ -251,6 +257,7 @@ static const struct brw_tracked_state *gen7_render_atoms[] = static const struct brw_tracked_state *gen7_compute_atoms[] = { &brw_state_base_address, + &brw_cs_image_surfaces, &brw_cs_abo_surfaces, &brw_cs_state, }; @@ -268,6 +275,12 @@ static const struct brw_tracked_state *gen8_render_atoms[] = &gen8_blend_state, &gen6_color_calc_state, + &gen7_hw_binding_tables, /* Enable hw-generated binding tables for Broadwell */ + + &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */ + &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */ + &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */ + &gen6_vs_push_constants, /* Before vs_state */ &gen6_gs_push_constants, /* Before gs_state */ &gen6_wm_push_constants, /* Before wm_surfaces and constant_buffer */ @@ -334,6 +347,7 @@ static const struct brw_tracked_state *gen8_render_atoms[] = static const struct brw_tracked_state *gen8_compute_atoms[] = { &gen8_state_base_address, + &brw_cs_image_surfaces, &brw_cs_abo_surfaces, &brw_cs_state, }; @@ -349,7 +363,7 @@ brw_upload_initial_gpu_state(struct brw_context *brw) return; if (brw->gen == 6) - intel_emit_post_sync_nonzero_flush(brw); + brw_emit_post_sync_nonzero_flush(brw); brw_upload_invariant_state(brw); @@ -468,6 +482,7 @@ void brw_init_state( struct brw_context *brw ) ctx->DriverFlags.NewUniformBuffer = BRW_NEW_UNIFORM_BUFFER; ctx->DriverFlags.NewTextureBuffer = 
BRW_NEW_TEXTURE_BUFFER; ctx->DriverFlags.NewAtomicBuffer = BRW_NEW_ATOMIC_BUFFER; + ctx->DriverFlags.NewImageUnits = BRW_NEW_IMAGE_UNITS; } @@ -581,6 +596,7 @@ static struct dirty_bit_map brw_bits[] = { DEFINE_BIT(BRW_NEW_STATS_WM), DEFINE_BIT(BRW_NEW_UNIFORM_BUFFER), DEFINE_BIT(BRW_NEW_ATOMIC_BUFFER), + DEFINE_BIT(BRW_NEW_IMAGE_UNITS), DEFINE_BIT(BRW_NEW_META_IN_PROGRESS), DEFINE_BIT(BRW_NEW_INTERPOLATION_MAP), DEFINE_BIT(BRW_NEW_PUSH_CONSTANT_ALLOCATION), @@ -710,7 +726,7 @@ brw_upload_pipeline_state(struct brw_context *brw, /* Emit Sandybridge workaround flushes on every primitive, for safety. */ if (brw->gen == 6) - intel_emit_post_sync_nonzero_flush(brw); + brw_emit_post_sync_nonzero_flush(brw); brw_upload_programs(brw, pipeline); merge_ctx_state(brw, &state); diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.c b/src/mesa/drivers/dri/i965/brw_surface_formats.c index 05016067bba..a33fd88a026 100644 --- a/src/mesa/drivers/dri/i965/brw_surface_formats.c +++ b/src/mesa/drivers/dri/i965/brw_surface_formats.c @@ -813,3 +813,112 @@ brw_depth_format(struct brw_context *brw, mesa_format format) unreachable("Unexpected depth format."); } } + +mesa_format +brw_lower_mesa_image_format(const struct brw_device_info *devinfo, + mesa_format format) +{ + switch (format) { + /* These are never lowered. Up to BDW we'll have to fall back to untyped + * surface access for 128bpp formats. + */ + case MESA_FORMAT_RGBA_UINT32: + case MESA_FORMAT_RGBA_SINT32: + case MESA_FORMAT_RGBA_FLOAT32: + case MESA_FORMAT_R_UINT32: + case MESA_FORMAT_R_SINT32: + case MESA_FORMAT_R_FLOAT32: + return format; + + /* From HSW to BDW the only 64bpp format supported for typed access is + * RGBA_UINT16. IVB falls back to untyped. + */ + case MESA_FORMAT_RGBA_UINT16: + case MESA_FORMAT_RGBA_SINT16: + case MESA_FORMAT_RGBA_FLOAT16: + case MESA_FORMAT_RG_UINT32: + case MESA_FORMAT_RG_SINT32: + case MESA_FORMAT_RG_FLOAT32: + return (devinfo->gen >= 9 ? format : + devinfo->gen >= 8 || devinfo->is_haswell ? + MESA_FORMAT_RGBA_UINT16 : MESA_FORMAT_RG_UINT32); + + /* Up to BDW no SINT or FLOAT formats of less than 32 bits per component + * are supported. IVB doesn't support formats with more than one component + * for typed access. For 8 and 16 bpp formats IVB relies on the + * undocumented behavior that typed reads from R_UINT8 and R_UINT16 + * surfaces actually do a 32-bit misaligned read. The alternative would be + * to use two surface state entries with different formats for each image, + * one for reading (using R_UINT32) and another one for writing (using + * R_UINT8 or R_UINT16), but that would complicate the shaders we generate + * even more. + */ + case MESA_FORMAT_RGBA_UINT8: + case MESA_FORMAT_RGBA_SINT8: + return (devinfo->gen >= 9 ? format : + devinfo->gen >= 8 || devinfo->is_haswell ? + MESA_FORMAT_RGBA_UINT8 : MESA_FORMAT_R_UINT32); + + case MESA_FORMAT_RG_UINT16: + case MESA_FORMAT_RG_SINT16: + case MESA_FORMAT_RG_FLOAT16: + return (devinfo->gen >= 9 ? format : + devinfo->gen >= 8 || devinfo->is_haswell ? + MESA_FORMAT_RG_UINT16 : MESA_FORMAT_R_UINT32); + + case MESA_FORMAT_RG_UINT8: + case MESA_FORMAT_RG_SINT8: + return (devinfo->gen >= 9 ? format : + devinfo->gen >= 8 || devinfo->is_haswell ? + MESA_FORMAT_RG_UINT8 : MESA_FORMAT_R_UINT16); + + case MESA_FORMAT_R_UINT16: + case MESA_FORMAT_R_FLOAT16: + case MESA_FORMAT_R_SINT16: + return (devinfo->gen >= 9 ? format : MESA_FORMAT_R_UINT16); + + case MESA_FORMAT_R_UINT8: + case MESA_FORMAT_R_SINT8: + return (devinfo->gen >= 9 ? 
format : MESA_FORMAT_R_UINT8); + + /* Neither the 2/10/10/10 nor the 11/11/10 packed formats are supported + * by the hardware. + */ + case MESA_FORMAT_R10G10B10A2_UINT: + case MESA_FORMAT_R10G10B10A2_UNORM: + case MESA_FORMAT_R11G11B10_FLOAT: + return MESA_FORMAT_R_UINT32; + + /* No normalized fixed-point formats are supported by the hardware. */ + case MESA_FORMAT_RGBA_UNORM16: + case MESA_FORMAT_RGBA_SNORM16: + return (devinfo->gen >= 8 || devinfo->is_haswell ? + MESA_FORMAT_RGBA_UINT16 : MESA_FORMAT_RG_UINT32); + + case MESA_FORMAT_R8G8B8A8_UNORM: + case MESA_FORMAT_R8G8B8A8_SNORM: + return (devinfo->gen >= 8 || devinfo->is_haswell ? + MESA_FORMAT_RGBA_UINT8 : MESA_FORMAT_R_UINT32); + + case MESA_FORMAT_R16G16_UNORM: + case MESA_FORMAT_R16G16_SNORM: + return (devinfo->gen >= 8 || devinfo->is_haswell ? + MESA_FORMAT_RG_UINT16 : MESA_FORMAT_R_UINT32); + + case MESA_FORMAT_R8G8_UNORM: + case MESA_FORMAT_R8G8_SNORM: + return (devinfo->gen >= 8 || devinfo->is_haswell ? + MESA_FORMAT_RG_UINT8 : MESA_FORMAT_R_UINT16); + + case MESA_FORMAT_R_UNORM16: + case MESA_FORMAT_R_SNORM16: + return MESA_FORMAT_R_UINT16; + + case MESA_FORMAT_R_UNORM8: + case MESA_FORMAT_R_SNORM8: + return MESA_FORMAT_R_UINT8; + + default: + unreachable("Unknown image format"); + } +} diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c index 998d8c42770..b8b03932065 100644 --- a/src/mesa/drivers/dri/i965/brw_tex_layout.c +++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c @@ -63,7 +63,7 @@ tr_mode_horizontal_texture_alignment(const struct brw_context *brw, int i = 0; /* Alignment computations below assume bpp >= 8 and a power of 2. */ - assert (bpp >= 8 && bpp <= 128 && is_power_of_two(bpp)); + assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp)); switch(mt->target) { case GL_TEXTURE_1D: @@ -95,7 +95,7 @@ tr_mode_horizontal_texture_alignment(const struct brw_context *brw, ret_align = mt->tr_mode == INTEL_MIPTREE_TRMODE_YF ? align_yf[i] : align_ys[i]; - assert(is_power_of_two(mt->num_samples)); + assert(_mesa_is_pow_two(mt->num_samples)); switch (mt->num_samples) { case 2: @@ -199,7 +199,7 @@ tr_mode_vertical_texture_alignment(const struct brw_context *brw, mt->target != GL_TEXTURE_1D_ARRAY); /* Alignment computations below assume bpp >= 8 and a power of 2. */ - assert (bpp >= 8 && bpp <= 128 && is_power_of_two(bpp)) ; + assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp)) ; switch(mt->target) { case GL_TEXTURE_2D: @@ -226,7 +226,7 @@ tr_mode_vertical_texture_alignment(const struct brw_context *brw, ret_align = mt->tr_mode == INTEL_MIPTREE_TRMODE_YF ? align_yf[i] : align_ys[i]; - assert(is_power_of_two(mt->num_samples)); + assert(_mesa_is_pow_two(mt->num_samples)); switch (mt->num_samples) { case 4: @@ -366,9 +366,8 @@ brw_miptree_layout_2d(struct intel_mipmap_tree *mt) mt->total_width = mt->physical_width0; - if (mt->compressed) { - mt->total_width = ALIGN(mt->physical_width0, mt->align_w); - } + if (mt->compressed) + mt->total_width = ALIGN(mt->total_width, bw); /* May need to adjust width to accommodate the placement of * the 2nd mipmap. 
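A few concrete data points read off the brw_lower_mesa_image_format() switch added above (illustrative call; `devinfo` stands for whatever device is current):

    mesa_format f =
       brw_lower_mesa_image_format(devinfo, MESA_FORMAT_RGBA_FLOAT16);
    /* gen9+ (SKL):     f == MESA_FORMAT_RGBA_FLOAT16 (unchanged)
     * gen8 / Haswell:  f == MESA_FORMAT_RGBA_UINT16
     * Ivybridge:       f == MESA_FORMAT_RG_UINT32 (untyped fallback)
     *
     * Packed formats are always lowered, e.g.
     * MESA_FORMAT_R11G11B10_FLOAT -> MESA_FORMAT_R_UINT32 on every gen.
     */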
This occurs when the alignment @@ -433,9 +432,7 @@ brw_miptree_get_horizontal_slice_pitch(const struct brw_context *brw, const struct intel_mipmap_tree *mt, unsigned level) { - assert(brw->gen < 9); - - if (mt->target == GL_TEXTURE_3D || + if ((brw->gen < 9 && mt->target == GL_TEXTURE_3D) || (brw->gen == 4 && mt->target == GL_TEXTURE_CUBE_MAP)) { return ALIGN(minify(mt->physical_width0, level), mt->align_w); } else { @@ -615,8 +612,8 @@ brw_miptree_layout_texture_3d(struct brw_context *brw, */ static uint32_t brw_miptree_choose_tiling(struct brw_context *brw, - enum intel_miptree_tiling_mode requested, - const struct intel_mipmap_tree *mt) + const struct intel_mipmap_tree *mt, + uint32_t layout_flags) { if (mt->format == MESA_FORMAT_S_UINT8) { /* The stencil buffer is W tiled. However, we request from the kernel a @@ -625,15 +622,18 @@ brw_miptree_choose_tiling(struct brw_context *brw, return I915_TILING_NONE; } + /* Do not support changing the tiling for miptrees with pre-allocated BOs. */ + assert((layout_flags & MIPTREE_LAYOUT_FOR_BO) == 0); + /* Some usages may want only one type of tiling, like depth miptrees (Y * tiled), or temporary BOs for uploading data once (linear). */ - switch (requested) { - case INTEL_MIPTREE_TILING_ANY: + switch (layout_flags & MIPTREE_LAYOUT_TILING_ANY) { + case MIPTREE_LAYOUT_TILING_ANY: break; - case INTEL_MIPTREE_TILING_Y: + case MIPTREE_LAYOUT_TILING_Y: return I915_TILING_Y; - case INTEL_MIPTREE_TILING_NONE: + case MIPTREE_LAYOUT_TILING_NONE: return I915_TILING_NONE; } @@ -762,16 +762,13 @@ intel_miptree_set_total_width_height(struct brw_context *brw, mt->total_width, mt->total_height, mt->cpp); } -void -brw_miptree_layout(struct brw_context *brw, - struct intel_mipmap_tree *mt, - enum intel_miptree_tiling_mode requested, - uint32_t layout_flags) +static void +intel_miptree_set_alignment(struct brw_context *brw, + struct intel_mipmap_tree *mt, + uint32_t layout_flags) { bool gen6_hiz_or_stencil = false; - mt->tr_mode = INTEL_MIPTREE_TRMODE_NONE; - if (brw->gen == 6 && mt->array_layout == ALL_SLICES_AT_EACH_LOD) { const GLenum base_format = _mesa_get_format_base_format(mt->format); gen6_hiz_or_stencil = _mesa_is_depth_or_stencil_format(base_format); @@ -806,7 +803,16 @@ brw_miptree_layout(struct brw_context *brw, intel_horizontal_texture_alignment_unit(brw, mt, layout_flags); mt->align_h = intel_vertical_texture_alignment_unit(brw, mt); } +} + +void +brw_miptree_layout(struct brw_context *brw, + struct intel_mipmap_tree *mt, + uint32_t layout_flags) +{ + mt->tr_mode = INTEL_MIPTREE_TRMODE_NONE; + intel_miptree_set_alignment(brw, mt, layout_flags); intel_miptree_set_total_width_height(brw, mt); if (!mt->total_width || !mt->total_height) { @@ -825,6 +831,6 @@ brw_miptree_layout(struct brw_context *brw, } if ((layout_flags & MIPTREE_LAYOUT_FOR_BO) == 0) - mt->tiling = brw_miptree_choose_tiling(brw, requested, mt); + mt->tiling = brw_miptree_choose_tiling(brw, mt, layout_flags); } diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c index 6fcf1b0cb1d..6078c3810d4 100644 --- a/src/mesa/drivers/dri/i965/brw_urb.c +++ b/src/mesa/drivers/dri/i965/brw_urb.c @@ -249,10 +249,10 @@ void brw_upload_urb_fence(struct brw_context *brw) uf.bits1.cs_fence = brw->urb.size; /* erratum: URB_FENCE must not cross a 64byte cacheline */ - if ((brw->batch.used & 15) > 12) { - int pad = 16 - (brw->batch.used & 15); + if ((USED_BATCH(brw->batch) & 15) > 12) { + int pad = 16 - (USED_BATCH(brw->batch) & 15); do - brw->batch.map[brw->batch.used++] = 
MI_NOOP; + *brw->batch.map_next++ = MI_NOOP; while (--pad); } diff --git a/src/mesa/drivers/dri/i965/brw_util.h b/src/mesa/drivers/dri/i965/brw_util.h index 04e4e944118..68f4318d371 100644 --- a/src/mesa/drivers/dri/i965/brw_util.h +++ b/src/mesa/drivers/dri/i965/brw_util.h @@ -53,14 +53,14 @@ brw_get_line_width(struct brw_context *brw) float line_width = CLAMP(!brw->ctx.Multisample._Enabled && !brw->ctx.Line.SmoothFlag ? roundf(brw->ctx.Line.Width) : brw->ctx.Line.Width, - 0.0, brw->ctx.Const.MaxLineWidth); + 0.0f, brw->ctx.Const.MaxLineWidth); uint32_t line_width_u3_7 = U_FIXED(line_width, 7); /* Line width of 0 is not allowed when MSAA enabled */ if (brw->ctx.Multisample._Enabled) { if (line_width_u3_7 == 0) line_width_u3_7 = 1; - } else if (brw->ctx.Line.SmoothFlag && line_width < 1.5) { + } else if (brw->ctx.Line.SmoothFlag && line_width < 1.5f) { /* For 1 pixel line thickness or less, the general * anti-aliasing algorithm gives up, and a garbage line is * generated. Setting a Line Width of 0.0 specifies the diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 05f188fe116..63f75da7e99 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -171,6 +171,17 @@ dst_reg::dst_reg(register_file file, int reg, const glsl_type *type, this->writemask = writemask; } +dst_reg::dst_reg(register_file file, int reg, brw_reg_type type, + unsigned writemask) +{ + init(); + + this->file = file; + this->reg = reg; + this->type = type; + this->writemask = writemask; +} + dst_reg::dst_reg(struct brw_reg reg) { init(); @@ -1709,6 +1720,9 @@ vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value) bool vec4_visitor::run(gl_clip_plane *clip_planes) { + bool use_vec4_nir = + compiler->glsl_compiler_options[stage].NirOptions != NULL; + sanity_param_count = prog->Parameters->NumParameters; if (shader_time_index >= 0) @@ -1718,10 +1732,15 @@ vec4_visitor::run(gl_clip_plane *clip_planes) emit_prolog(); - /* Generate VS IR for main(). (the visitor only descends into - * functions called "main"). - */ - if (shader) { + if (use_vec4_nir) { + assert(prog->nir != NULL); + emit_nir_code(); + if (failed) + return false; + } else if (shader) { + /* Generate VS IR for main(). (the visitor only descends into + * functions called "main"). + */ visit_instructions(shader->base.ir); } else { emit_program_code(); @@ -1741,7 +1760,7 @@ vec4_visitor::run(gl_clip_plane *clip_planes) * that we have reladdr computations available for CSE, since we'll * often do repeated subexpressions for those. */ - if (shader) { + if (shader || use_vec4_nir) { move_grf_array_access_to_scratch(); move_uniform_array_access_to_pull_constants(); } else { @@ -1827,15 +1846,30 @@ vec4_visitor::run(gl_clip_plane *clip_planes) } } - while (!reg_allocate()) { - if (failed) - return false; + bool allocated_without_spills = reg_allocate(); + + if (!allocated_without_spills) { + compiler->shader_perf_log(log_data, + "%s shader triggered register spilling. 
" + "Try reducing the number of live vec4 values " + "to improve performance.\n", + stage_name); + + while (!reg_allocate()) { + if (failed) + return false; + } } opt_schedule_instructions(); opt_set_dependency_control(); + if (last_scratch > 0) { + prog_data->base.total_scratch = + brw_get_scratch_size(last_scratch * REG_SIZE); + } + /* If any state parameters were appended, then ParameterValues could have * been realloced, in which case the driver uniform storage set up by * _mesa_associate_uniform_storage() would point to freed memory. Make @@ -1857,10 +1891,11 @@ extern "C" { */ const unsigned * brw_vs_emit(struct brw_context *brw, - struct gl_shader_program *prog, - struct brw_vs_compile *c, - struct brw_vs_prog_data *prog_data, void *mem_ctx, + const struct brw_vs_prog_key *key, + struct brw_vs_prog_data *prog_data, + struct gl_vertex_program *vp, + struct gl_shader_program *prog, unsigned *final_assembly_size) { bool start_busy = false; @@ -1879,29 +1914,31 @@ brw_vs_emit(struct brw_context *brw, int st_index = -1; if (INTEL_DEBUG & DEBUG_SHADER_TIME) - st_index = brw_get_shader_time_index(brw, prog, &c->vp->program.Base, - ST_VS); + st_index = brw_get_shader_time_index(brw, prog, &vp->Base, ST_VS); if (unlikely(INTEL_DEBUG & DEBUG_VS) && shader->base.ir) - brw_dump_ir("vertex", prog, &shader->base, &c->vp->program.Base); + brw_dump_ir("vertex", prog, &shader->base, &vp->Base); + + if (!vp->Base.nir && + (brw->intelScreen->compiler->scalar_vs || + brw->intelScreen->compiler->glsl_compiler_options[MESA_SHADER_VERTEX].NirOptions != NULL)) { + /* Normally we generate NIR in LinkShader() or + * ProgramStringNotify(), but Mesa's fixed-function vertex program + * handling doesn't notify the driver at all. Just do it here, at + * the last minute, even though it's lame. + */ + assert(vp->Base.Id == 0 && prog == NULL); + vp->Base.nir = + brw_create_nir(brw, NULL, &vp->Base, MESA_SHADER_VERTEX, + brw->intelScreen->compiler->scalar_vs); + } if (brw->intelScreen->compiler->scalar_vs) { - if (!c->vp->program.Base.nir) { - /* Normally we generate NIR in LinkShader() or - * ProgramStringNotify(), but Mesa's fixed-function vertex program - * handling doesn't notify the driver at all. Just do it here, at - * the last minute, even though it's lame. 
- */ - assert(c->vp->program.Base.Id == 0 && prog == NULL); - c->vp->program.Base.nir = - brw_create_nir(brw, NULL, &c->vp->program.Base, MESA_SHADER_VERTEX); - } - prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; fs_visitor v(brw->intelScreen->compiler, brw, - mem_ctx, MESA_SHADER_VERTEX, &c->key, - &prog_data->base.base, prog, &c->vp->program.Base, + mem_ctx, MESA_SHADER_VERTEX, key, + &prog_data->base.base, prog, &vp->Base, 8, st_index); if (!v.run_vs(brw_select_clip_planes(&brw->ctx))) { if (prog) { @@ -1916,8 +1953,8 @@ brw_vs_emit(struct brw_context *brw, } fs_generator g(brw->intelScreen->compiler, brw, - mem_ctx, (void *) &c->key, &prog_data->base.base, - &c->vp->program.Base, v.promoted_constants, + mem_ctx, (void *) key, &prog_data->base.base, + &vp->Base, v.promoted_constants, v.runtime_check_aads_emit, "VS"); if (INTEL_DEBUG & DEBUG_VS) { char *name; @@ -1927,21 +1964,19 @@ brw_vs_emit(struct brw_context *brw, prog->Name); } else { name = ralloc_asprintf(mem_ctx, "vertex program %d", - c->vp->program.Base.Id); + vp->Base.Id); } g.enable_debug(name); } g.generate_code(v.cfg, 8); assembly = g.get_assembly(final_assembly_size); - - c->base.last_scratch = v.last_scratch; } if (!assembly) { prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; - vec4_vs_visitor v(brw->intelScreen->compiler, - c, prog_data, prog, mem_ctx, st_index, + vec4_vs_visitor v(brw->intelScreen->compiler, brw, key, prog_data, + vp, prog, mem_ctx, st_index, !_mesa_is_gles3(&brw->ctx)); if (!v.run(brw_select_clip_planes(&brw->ctx))) { if (prog) { @@ -1956,14 +1991,14 @@ brw_vs_emit(struct brw_context *brw, } vec4_generator g(brw->intelScreen->compiler, brw, - prog, &c->vp->program.Base, &prog_data->base, + prog, &vp->Base, &prog_data->base, mem_ctx, INTEL_DEBUG & DEBUG_VS, "vertex", "VS"); assembly = g.generate_assembly(v.cfg, final_assembly_size); } if (unlikely(brw->perf_debug) && shader) { if (shader->compiled_once) { - brw_vs_debug_recompile(brw, prog, &c->key); + brw_vs_debug_recompile(brw, prog, key); } if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { perf_debug("VS compile took %.03f ms and stalled the GPU\n", diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index 2ac16932189..341c516b39a 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -45,12 +45,9 @@ extern "C" { #endif #include "glsl/ir.h" +#include "glsl/nir/nir.h" -struct brw_vec4_compile { - GLuint last_scratch; /**< measured in 32-byte (register size) units */ -}; - #ifdef __cplusplus extern "C" { #endif @@ -77,7 +74,7 @@ class vec4_visitor : public backend_shader, public ir_visitor { public: vec4_visitor(const struct brw_compiler *compiler, - struct brw_vec4_compile *c, + void *log_data, struct gl_program *prog, const struct brw_vue_prog_key *key, struct brw_vue_prog_data *prog_data, @@ -103,7 +100,6 @@ public: return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); } - struct brw_vec4_compile * const c; const struct brw_vue_prog_key * const key; struct brw_vue_prog_data * const prog_data; unsigned int sanity_param_count; @@ -181,9 +177,12 @@ public: void fail(const char *msg, ...); void setup_uniform_clipplane_values(gl_clip_plane *clip_planes); + virtual void setup_vector_uniform_values(const gl_constant_value *values, + unsigned n); void setup_uniform_values(ir_variable *ir); void setup_builtin_uniform_values(ir_variable *ir); int setup_uniforms(int payload_reg); + bool reg_allocate_trivial(); bool reg_allocate(); void 
evaluate_spill_costs(float *spill_costs, bool *no_spill); @@ -292,14 +291,17 @@ public: void emit_bool_to_cond_code(ir_rvalue *ir, enum brw_predicate *predicate); void emit_if_gen6(ir_if *ir); - void emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst, - src_reg src0, src_reg src1); + vec4_instruction *emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst, + src_reg src0, src_reg src1); - void emit_lrp(const dst_reg &dst, - const src_reg &x, const src_reg &y, const src_reg &a); + vec4_instruction *emit_lrp(const dst_reg &dst, const src_reg &x, + const src_reg &y, const src_reg &a); - /** Copy any live channel from \p src to the first channel of \p dst. */ - void emit_uniformize(const dst_reg &dst, const src_reg &src); + /** + * Copy any live channel from \p src to the first channel of the + * result. + */ + src_reg emit_uniformize(const src_reg &src); void emit_block_move(dst_reg *dst, src_reg *src, const struct glsl_type *type, brw_predicate predicate); @@ -317,11 +319,13 @@ public: void emit_scalar(ir_instruction *ir, enum prog_opcode op, dst_reg dst, src_reg src0, src_reg src1); - src_reg fix_3src_operand(src_reg src); + src_reg fix_3src_operand(const src_reg &src); + src_reg resolve_source_modifiers(const src_reg &src); + + vec4_instruction *emit_math(enum opcode opcode, const dst_reg &dst, const src_reg &src0, + const src_reg &src1 = src_reg()); - void emit_math(enum opcode opcode, const dst_reg &dst, const src_reg &src0, - const src_reg &src1 = src_reg()); - src_reg fix_math_operand(src_reg src); + src_reg fix_math_operand(const src_reg &src); void emit_pack_half_2x16(dst_reg dst, src_reg src0); void emit_unpack_half_2x16(dst_reg dst, src_reg src0); @@ -330,10 +334,27 @@ public: void emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0); void emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0); - uint32_t gather_channel(ir_texture *ir, uint32_t sampler); - src_reg emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler); + void emit_texture(ir_texture_opcode op, + dst_reg dest, + const glsl_type *dest_type, + src_reg coordinate, + int coord_components, + src_reg shadow_comparitor, + src_reg lod, src_reg lod2, + src_reg sample_index, + uint32_t constant_offset, + src_reg offset_value, + src_reg mcs, + bool is_cube_array, + uint32_t sampler, src_reg sampler_reg); + + uint32_t gather_channel(unsigned gather_component, uint32_t sampler); + src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate, + src_reg sampler); void emit_gen6_gather_wa(uint8_t wa, dst_reg dst); - void swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler); + void swizzle_result(ir_texture_opcode op, dst_reg dest, + src_reg orig_val, uint32_t sampler, + const glsl_type *dest_type); void emit_ndc_computation(); void emit_psiz_and_flags(dst_reg reg); @@ -388,13 +409,53 @@ public: void visit_atomic_counter_intrinsic(ir_call *ir); + int type_size(const struct glsl_type *type); + bool is_high_sampler(src_reg sampler); + + virtual void emit_nir_code(); + virtual void nir_setup_inputs(nir_shader *shader); + virtual void nir_setup_uniforms(nir_shader *shader); + virtual void nir_setup_uniform(nir_variable *var); + virtual void nir_setup_builtin_uniform(nir_variable *var); + virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr); + virtual void nir_setup_system_values(nir_shader *shader); + virtual void nir_emit_impl(nir_function_impl *impl); + virtual void nir_emit_cf_list(exec_list *list); + virtual void nir_emit_if(nir_if 
*if_stmt); + virtual void nir_emit_loop(nir_loop *loop); + virtual void nir_emit_block(nir_block *block); + virtual void nir_emit_instr(nir_instr *instr); + virtual void nir_emit_load_const(nir_load_const_instr *instr); + virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr); + virtual void nir_emit_alu(nir_alu_instr *instr); + virtual void nir_emit_jump(nir_jump_instr *instr); + virtual void nir_emit_texture(nir_tex_instr *instr); + + dst_reg get_nir_dest(nir_dest dest, enum brw_reg_type type); + dst_reg get_nir_dest(nir_dest dest, nir_alu_type type); + dst_reg get_nir_dest(nir_dest dest); + src_reg get_nir_src(nir_src src, enum brw_reg_type type, + unsigned num_components = 4); + src_reg get_nir_src(nir_src src, nir_alu_type type, + unsigned num_components = 4); + src_reg get_nir_src(nir_src src, + unsigned num_components = 4); + + virtual dst_reg *make_reg_for_system_value(int location, + const glsl_type *type) = 0; + + dst_reg *nir_locals; + dst_reg *nir_ssa_values; + src_reg *nir_inputs; + unsigned *nir_uniform_driver_location; + dst_reg *nir_system_values; + protected: void emit_vertex(); void lower_attributes_to_hw_regs(const int *attribute_map, bool interleaved); void setup_payload_interference(struct ra_graph *g, int first_payload_node, int reg_node_count); - virtual dst_reg *make_reg_for_system_value(ir_variable *ir) = 0; virtual void assign_binding_table_offsets(); virtual void setup_payload() = 0; virtual void emit_prolog() = 0; @@ -403,6 +464,8 @@ protected: virtual void emit_urb_write_header(int mrf) = 0; virtual vec4_instruction *emit_urb_write_opcode(bool complete) = 0; virtual int compute_array_stride(ir_dereference_array *ir); + virtual void gs_emit_vertex(int stream_id); + virtual void gs_end_primitive(); private: /** @@ -411,6 +474,8 @@ private: const bool no_spills; int shader_time_index; + + unsigned last_scratch; /**< measured in 32-byte (register size) units */ }; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp index 2d9afa8145f..5a15eb89766 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp @@ -179,6 +179,7 @@ try_constant_propagate(const struct brw_device_info *devinfo, case BRW_OPCODE_MACH: case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: case BRW_OPCODE_ADD: case BRW_OPCODE_OR: case BRW_OPCODE_AND: diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp index c9fe0cebf27..5a277f74c44 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp @@ -62,6 +62,7 @@ is_expression(const vec4_instruction *const inst) case BRW_OPCODE_CMPN: case BRW_OPCODE_ADD: case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: case BRW_OPCODE_FRC: case BRW_OPCODE_RNDU: case BRW_OPCODE_RNDD: diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index d2de2f0be25..92050b94d33 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -1465,19 +1465,15 @@ vec4_generator::generate_code(const cfg_t *cfg) break; case SHADER_OPCODE_UNTYPED_ATOMIC: - assert(src[1].file == BRW_IMMEDIATE_VALUE && - src[2].file == BRW_IMMEDIATE_VALUE); + assert(src[2].file == BRW_IMMEDIATE_VALUE); brw_untyped_atomic(p, dst, src[0], src[1], src[2].dw1.ud, inst->mlen, !inst->dst.is_null()); - brw_mark_surface_used(&prog_data->base, src[1].dw1.ud); 
break; case SHADER_OPCODE_UNTYPED_SURFACE_READ: - assert(src[1].file == BRW_IMMEDIATE_VALUE && - src[2].file == BRW_IMMEDIATE_VALUE); + assert(src[2].file == BRW_IMMEDIATE_VALUE); brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen, src[2].dw1.ud); - brw_mark_surface_used(&prog_data->base, src[1].dw1.ud); break; case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: @@ -1549,7 +1545,7 @@ vec4_generator::generate_code(const cfg_t *cfg) * * where they pack the four bytes from the low and high four DW. */ - assert(is_power_of_two(dst.dw1.bits.writemask) && + assert(_mesa_is_pow_two(dst.dw1.bits.writemask) && dst.dw1.bits.writemask != 0); unsigned offset = __builtin_ctz(dst.dw1.bits.writemask); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp new file mode 100644 index 00000000000..d85fb6f31ec --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp @@ -0,0 +1,118 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_nir.h" +#include "brw_vec4_gs_visitor.h" + +namespace brw { + +void +vec4_gs_visitor::nir_setup_inputs(nir_shader *shader) +{ + nir_inputs = ralloc_array(mem_ctx, src_reg, shader->num_inputs); + + foreach_list_typed(nir_variable, var, node, &shader->inputs) { + int offset = var->data.driver_location; + if (var->type->base_type == GLSL_TYPE_ARRAY) { + /* Geometry shader inputs are arrays, but they use an unusual array + * layout: instead of all array elements for a given geometry shader + * input being stored consecutively, all geometry shader inputs are + * interleaved into one giant array. At this stage of compilation, we + * assume that the stride of the array is BRW_VARYING_SLOT_COUNT. + * Later, setup_attributes() will remap our accesses to the actual + * input array. 
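A minimal stand-alone C++ sketch of the interleaved layout described in the comment above, under the assumption that the stride between consecutive vertices of the same input is BRW_VARYING_SLOT_COUNT; the stride value and the gs_input_attr() helper below are illustrative stand-ins, not driver code.

#include <cstdio>

/* Placeholder stride; the driver uses BRW_VARYING_SLOT_COUNT.  A small
 * value keeps the printed table readable.
 */
static const int VARYING_SLOT_STRIDE = 8;

/* ATTR register index of array element `vertex` of a geometry shader input
 * whose first element sits at `base_location` (var->data.location), for
 * register `reg` within that element.
 */
static int gs_input_attr(int base_location, int vertex, int reg)
{
   return base_location + vertex * VARYING_SLOT_STRIDE + reg;
}

int main()
{
   /* Two hypothetical inputs at locations 0 and 1, read for a triangle.
    * Their per-vertex copies interleave: A and B for vertex 0 come first,
    * then A and B for vertex 1, and so on, instead of A[0..2] followed by
    * B[0..2].
    */
   for (int vertex = 0; vertex < 3; vertex++)
      printf("vertex %d: input A -> ATTR %2d, input B -> ATTR %2d\n",
             vertex,
             gs_input_attr(0, vertex, 0),
             gs_input_attr(1, vertex, 0));
   return 0;
}

setup_attributes() later remaps these virtual indices onto the registers that actually hold the incoming vertex data, which is why the loop above only needs a consistent stride, not the final layout.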
+ */ + assert(var->type->length > 0); + int length = var->type->length; + int size = type_size(var->type) / length; + for (int i = 0; i < length; i++) { + int location = var->data.location + i * BRW_VARYING_SLOT_COUNT; + for (int j = 0; j < size; j++) { + src_reg src = src_reg(ATTR, location + j, var->type); + src = retype(src, brw_type_for_base_type(var->type)); + nir_inputs[offset] = src; + offset++; + } + } + } else { + int size = type_size(var->type); + for (int i = 0; i < size; i++) { + src_reg src = src_reg(ATTR, var->data.location + i, var->type); + src = retype(src, brw_type_for_base_type(var->type)); + nir_inputs[offset] = src; + offset++; + } + } + } +} + +void +vec4_gs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr) +{ + dst_reg *reg; + + switch (instr->intrinsic) { + case nir_intrinsic_load_invocation_id: + reg = &this->nir_system_values[SYSTEM_VALUE_INVOCATION_ID]; + if (reg->file == BAD_FILE) + *reg = *this->make_reg_for_system_value(SYSTEM_VALUE_INVOCATION_ID, + glsl_type::int_type); + break; + + default: + vec4_visitor::nir_setup_system_value_intrinsic(instr); + } + +} + +void +vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) +{ + dst_reg dest; + src_reg src; + + switch (instr->intrinsic) { + case nir_intrinsic_emit_vertex: { + int stream_id = instr->const_index[0]; + gs_emit_vertex(stream_id); + break; + } + + case nir_intrinsic_end_primitive: + gs_end_primitive(); + break; + + case nir_intrinsic_load_invocation_id: { + src_reg invocation_id = + src_reg(nir_system_values[SYSTEM_VALUE_INVOCATION_ID]); + assert(invocation_id.file != BAD_FILE); + dest = get_nir_dest(instr->dest, invocation_id.type); + emit(MOV(dest, invocation_id)); + break; + } + + default: + vec4_visitor::nir_emit_intrinsic(instr); + } +} +} diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp index 69bcf5afc51..019efecac66 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp @@ -35,12 +35,14 @@ const unsigned MAX_GS_INPUT_VERTICES = 6; namespace brw { vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler, + void *log_data, struct brw_gs_compile *c, struct gl_shader_program *prog, void *mem_ctx, bool no_spills, int shader_time_index) - : vec4_visitor(compiler, &c->base, &c->gp->program.Base, &c->key.base, + : vec4_visitor(compiler, log_data, + &c->gp->program.Base, &c->key.base, &c->prog_data.base, prog, MESA_SHADER_GEOMETRY, mem_ctx, no_spills, shader_time_index), c(c) @@ -49,11 +51,12 @@ vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler, dst_reg * -vec4_gs_visitor::make_reg_for_system_value(ir_variable *ir) +vec4_gs_visitor::make_reg_for_system_value(int location, + const glsl_type *type) { - dst_reg *reg = new(mem_ctx) dst_reg(this, ir->type); + dst_reg *reg = new(mem_ctx) dst_reg(this, type); - switch (ir->data.location) { + switch (location) { case SYSTEM_VALUE_INVOCATION_ID: this->current_annotation = "initialize gl_InvocationID"; emit(GS_OPCODE_GET_INSTANCE_ID, *reg); @@ -346,90 +349,82 @@ vec4_gs_visitor::emit_control_data_bits() if (c->control_data_header_size_bits > 128) urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET; - /* If vertex_count is 0, then no control data bits have been accumulated - * yet, so we should do nothing. 
+ /* If we are using either channel masks or a per-slot offset, then we + * need to figure out which DWORD we are trying to write to, using the + * formula: + * + * dword_index = (vertex_count - 1) * bits_per_vertex / 32 + * + * Since bits_per_vertex is a power of two, and is known at compile + * time, this can be optimized to: + * + * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) */ - emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ)); - emit(IF(BRW_PREDICATE_NORMAL)); - { - /* If we are using either channel masks or a per-slot offset, then we - * need to figure out which DWORD we are trying to write to, using the - * formula: - * - * dword_index = (vertex_count - 1) * bits_per_vertex / 32 - * - * Since bits_per_vertex is a power of two, and is known at compile - * time, this can be optimized to: - * - * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) + src_reg dword_index(this, glsl_type::uint_type); + if (urb_write_flags) { + src_reg prev_count(this, glsl_type::uint_type); + emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu)); + unsigned log2_bits_per_vertex = + _mesa_fls(c->control_data_bits_per_vertex); + emit(SHR(dst_reg(dword_index), prev_count, + (uint32_t) (6 - log2_bits_per_vertex))); + } + + /* Start building the URB write message. The first MRF gets a copy of + * R0. + */ + int base_mrf = 1; + dst_reg mrf_reg(MRF, base_mrf); + src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + vec4_instruction *inst = emit(MOV(mrf_reg, r0)); + inst->force_writemask_all = true; + + if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) { + /* Set the per-slot offset to dword_index / 4, to that we'll write to + * the appropriate OWORD within the control data header. */ - src_reg dword_index(this, glsl_type::uint_type); - if (urb_write_flags) { - src_reg prev_count(this, glsl_type::uint_type); - emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu)); - unsigned log2_bits_per_vertex = - _mesa_fls(c->control_data_bits_per_vertex); - emit(SHR(dst_reg(dword_index), prev_count, - (uint32_t) (6 - log2_bits_per_vertex))); - } + src_reg per_slot_offset(this, glsl_type::uint_type); + emit(SHR(dst_reg(per_slot_offset), dword_index, 2u)); + emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u); + } - /* Start building the URB write message. The first MRF gets a copy of - * R0. + if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) { + /* Set the channel masks to 1 << (dword_index % 4), so that we'll + * write to the appropriate DWORD within the OWORD. We need to do + * this computation with force_writemask_all, otherwise garbage data + * from invocation 0 might clobber the mask for invocation 1 when + * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks + * together. */ - int base_mrf = 1; - dst_reg mrf_reg(MRF, base_mrf); - src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); - vec4_instruction *inst = emit(MOV(mrf_reg, r0)); + src_reg channel(this, glsl_type::uint_type); + inst = emit(AND(dst_reg(channel), dword_index, 3u)); inst->force_writemask_all = true; - - if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) { - /* Set the per-slot offset to dword_index / 4, to that we'll write to - * the appropriate OWORD within the control data header. 
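The strength reduction in the comment above is easy to verify outside the driver. A minimal sketch, assuming only that control_data_bits_per_vertex is a power of two (one cut bit, or two stream-ID bits): the local fls_u32() models _mesa_fls(), and because fls is one-based, the emitted shift count 6 - fls(bpv) is the same as 5 - log2(bpv).

#include <cassert>
#include <cstdint>

/* One-based index of the most significant set bit, mirroring _mesa_fls():
 * fls_u32(1) == 1, fls_u32(2) == 2, fls_u32(4) == 3.
 */
static unsigned fls_u32(uint32_t x)
{
   return x == 0 ? 0 : 32 - __builtin_clz(x);
}

int main()
{
   for (uint32_t bpv = 1; bpv <= 4; bpv *= 2) {
      const uint32_t shift = 6 - fls_u32(bpv);   /* == 5 - log2(bpv) */
      for (uint32_t vertex_count = 1; vertex_count <= 4096; vertex_count++) {
         const uint32_t dword_index = (vertex_count - 1) * bpv / 32;
         /* The shift the visitor emits produces the same DWORD index as
          * the multiply-and-divide form of the formula.
          */
         assert(dword_index == (vertex_count - 1) >> shift);
      }
   }
   return 0;
}

The per-slot offset and channel masks set up just below then address OWORD dword_index / 4 and DWORD dword_index % 4 within the control data header.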
- */ - src_reg per_slot_offset(this, glsl_type::uint_type); - emit(SHR(dst_reg(per_slot_offset), dword_index, 2u)); - emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u); - } - - if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) { - /* Set the channel masks to 1 << (dword_index % 4), so that we'll - * write to the appropriate DWORD within the OWORD. We need to do - * this computation with force_writemask_all, otherwise garbage data - * from invocation 0 might clobber the mask for invocation 1 when - * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks - * together. - */ - src_reg channel(this, glsl_type::uint_type); - inst = emit(AND(dst_reg(channel), dword_index, 3u)); - inst->force_writemask_all = true; - src_reg one(this, glsl_type::uint_type); - inst = emit(MOV(dst_reg(one), 1u)); - inst->force_writemask_all = true; - src_reg channel_mask(this, glsl_type::uint_type); - inst = emit(SHL(dst_reg(channel_mask), one, channel)); - inst->force_writemask_all = true; - emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask), - channel_mask); - emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask); - } - - /* Store the control data bits in the message payload and send it. */ - dst_reg mrf_reg2(MRF, base_mrf + 1); - inst = emit(MOV(mrf_reg2, this->control_data_bits)); + src_reg one(this, glsl_type::uint_type); + inst = emit(MOV(dst_reg(one), 1u)); inst->force_writemask_all = true; - inst = emit(GS_OPCODE_URB_WRITE); - inst->urb_write_flags = urb_write_flags; - /* We need to increment Global Offset by 256-bits to make room for - * Broadwell's extra "Vertex Count" payload at the beginning of the - * URB entry. Since this is an OWord message, Global Offset is counted - * in 128-bit units, so we must set it to 2. - */ - if (devinfo->gen >= 8) - inst->offset = 2; - inst->base_mrf = base_mrf; - inst->mlen = 2; + src_reg channel_mask(this, glsl_type::uint_type); + inst = emit(SHL(dst_reg(channel_mask), one, channel)); + inst->force_writemask_all = true; + emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask), + channel_mask); + emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask); } - emit(BRW_OPCODE_ENDIF); + + /* Store the control data bits in the message payload and send it. */ + dst_reg mrf_reg2(MRF, base_mrf + 1); + inst = emit(MOV(mrf_reg2, this->control_data_bits)); + inst->force_writemask_all = true; + inst = emit(GS_OPCODE_URB_WRITE); + inst->urb_write_flags = urb_write_flags; + /* We need to increment Global Offset by 256-bits to make room for + * Broadwell's extra "Vertex Count" payload at the beginning of the + * URB entry. Since this is an OWord message, Global Offset is counted + * in 128-bit units, so we must set it to 2. + */ + if (devinfo->gen >= 8) + inst->offset = 2; + inst->base_mrf = base_mrf; + inst->mlen = 2; } void @@ -472,7 +467,7 @@ vec4_gs_visitor::set_stream_control_data_bits(unsigned stream_id) } void -vec4_gs_visitor::visit(ir_emit_vertex *ir) +vec4_gs_visitor::gs_emit_vertex(int stream_id) { this->current_annotation = "emit vertex: safety check"; @@ -486,7 +481,7 @@ vec4_gs_visitor::visit(ir_emit_vertex *ir) * be recorded by transform feedback, we can simply discard all geometry * bound to these streams when transform feedback is disabled. 
*/ - if (ir->stream_id() > 0 && shader_prog->TransformFeedback.NumVarying == 0) + if (stream_id > 0 && shader_prog->TransformFeedback.NumVarying == 0) return; /* To ensure that we don't output more vertices than the shader specified @@ -529,9 +524,17 @@ vec4_gs_visitor::visit(ir_emit_vertex *ir) emit(AND(dst_null_d(), this->vertex_count, (uint32_t) (32 / c->control_data_bits_per_vertex - 1))); inst->conditional_mod = BRW_CONDITIONAL_Z; + emit(IF(BRW_PREDICATE_NORMAL)); { + /* If vertex_count is 0, then no control data bits have been + * accumulated yet, so we skip emitting them. + */ + emit(CMP(dst_null_d(), this->vertex_count, 0u, + BRW_CONDITIONAL_NEQ)); + emit(IF(BRW_PREDICATE_NORMAL)); emit_control_data_bits(); + emit(BRW_OPCODE_ENDIF); /* Reset control_data_bits to 0 so we can start accumulating a new * batch. @@ -557,7 +560,7 @@ vec4_gs_visitor::visit(ir_emit_vertex *ir) c->prog_data.control_data_format == GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { this->current_annotation = "emit vertex: Stream control data bits"; - set_stream_control_data_bits(ir->stream_id()); + set_stream_control_data_bits(stream_id); } this->current_annotation = "emit vertex: increment vertex count"; @@ -570,7 +573,13 @@ vec4_gs_visitor::visit(ir_emit_vertex *ir) } void -vec4_gs_visitor::visit(ir_end_primitive *) +vec4_gs_visitor::visit(ir_emit_vertex *ir) +{ + gs_emit_vertex(ir->stream_id()); +} + +void +vec4_gs_visitor::gs_end_primitive() { /* We can only do EndPrimitive() functionality when the control data * consists of cut bits. Fortunately, the only time it isn't is when the @@ -620,6 +629,12 @@ vec4_gs_visitor::visit(ir_end_primitive *) emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask)); } +void +vec4_gs_visitor::visit(ir_end_primitive *) +{ + gs_end_primitive(); +} + static const unsigned * generate_assembly(struct brw_context *brw, struct gl_shader_program *shader_prog, @@ -662,7 +677,7 @@ brw_gs_emit(struct brw_context *brw, likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) { c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; - vec4_gs_visitor v(brw->intelScreen->compiler, + vec4_gs_visitor v(brw->intelScreen->compiler, brw, c, prog, mem_ctx, true /* no_spills */, st_index); if (v.run(NULL /* clip planes */)) { return generate_assembly(brw, prog, &c->gp->program.Base, @@ -704,11 +719,11 @@ brw_gs_emit(struct brw_context *brw, const unsigned *ret = NULL; if (brw->gen >= 7) - gs = new vec4_gs_visitor(brw->intelScreen->compiler, + gs = new vec4_gs_visitor(brw->intelScreen->compiler, brw, c, prog, mem_ctx, false /* no_spills */, st_index); else - gs = new gen6_gs_visitor(brw->intelScreen->compiler, + gs = new gen6_gs_visitor(brw->intelScreen->compiler, brw, c, prog, mem_ctx, false /* no_spills */, st_index); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h index e693c56b58f..0e8fefabecc 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h @@ -37,7 +37,6 @@ */ struct brw_gs_compile { - struct brw_vec4_compile base; struct brw_gs_prog_key key; struct brw_gs_prog_data prog_data; struct brw_vue_map input_vue_map; @@ -69,14 +68,19 @@ class vec4_gs_visitor : public vec4_visitor { public: vec4_gs_visitor(const struct brw_compiler *compiler, + void *log_data, struct brw_gs_compile *c, struct gl_shader_program *prog, void *mem_ctx, bool no_spills, int shader_time_index); + virtual void nir_setup_inputs(nir_shader *shader); + virtual void 
nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr); + protected: - virtual dst_reg *make_reg_for_system_value(ir_variable *ir); + virtual dst_reg *make_reg_for_system_value(int location, + const glsl_type *type); virtual void setup_payload(); virtual void emit_prolog(); virtual void emit_program_code(); @@ -86,6 +90,9 @@ protected: virtual int compute_array_stride(ir_dereference_array *ir); virtual void visit(ir_emit_vertex *); virtual void visit(ir_end_primitive *); + virtual void gs_emit_vertex(int stream_id); + virtual void gs_end_primitive(); + virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr); protected: int setup_varying_inputs(int payload_reg, int *attribute_map, diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp index 95b9d9017e2..cc688ef8083 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp @@ -96,7 +96,8 @@ vec4_live_variables::setup_def_use() * are the things that screen off preceding definitions of a * variable, and thus qualify for being in def[]. */ - if (inst->dst.file == GRF && !inst->predicate) { + if (inst->dst.file == GRF && + (!inst->predicate || inst->opcode == BRW_OPCODE_SEL)) { for (unsigned i = 0; i < inst->regs_written; i++) { for (int c = 0; c < 4; c++) { if (inst->dst.writemask & (1 << c)) { @@ -133,27 +134,9 @@ vec4_live_variables::compute_live_variables() while (cont) { cont = false; - foreach_block (block, cfg) { + foreach_block_reverse (block, cfg) { struct block_data *bd = &block_data[block->num]; - /* Update livein */ - for (int i = 0; i < bitset_words; i++) { - BITSET_WORD new_livein = (bd->use[i] | - (bd->liveout[i] & - ~bd->def[i])); - if (new_livein & ~bd->livein[i]) { - bd->livein[i] |= new_livein; - cont = true; - } - } - BITSET_WORD new_livein = (bd->flag_use[0] | - (bd->flag_liveout[0] & - ~bd->flag_def[0])); - if (new_livein & ~bd->flag_livein[0]) { - bd->flag_livein[0] |= new_livein; - cont = true; - } - /* Update liveout */ foreach_list_typed(bblock_link, child_link, link, &block->children) { struct block_data *child_bd = &block_data[child_link->block->num]; @@ -173,6 +156,24 @@ vec4_live_variables::compute_live_variables() cont = true; } } + + /* Update livein */ + for (int i = 0; i < bitset_words; i++) { + BITSET_WORD new_livein = (bd->use[i] | + (bd->liveout[i] & + ~bd->def[i])); + if (new_livein & ~bd->livein[i]) { + bd->livein[i] |= new_livein; + cont = true; + } + } + BITSET_WORD new_livein = (bd->flag_use[0] | + (bd->flag_liveout[0] & + ~bd->flag_def[0])); + if (new_livein & ~bd->flag_livein[0]) { + bd->flag_livein[0] |= new_livein; + cont = true; + } } } } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp new file mode 100644 index 00000000000..923e2d30a4c --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -0,0 +1,1548 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * 
paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_nir.h" +#include "brw_vec4.h" +#include "glsl/ir_uniform.h" + +namespace brw { + +void +vec4_visitor::emit_nir_code() +{ + nir_shader *nir = prog->nir; + + if (nir->num_inputs > 0) + nir_setup_inputs(nir); + + if (nir->num_uniforms > 0) + nir_setup_uniforms(nir); + + nir_setup_system_values(nir); + + /* get the main function and emit it */ + nir_foreach_overload(nir, overload) { + assert(strcmp(overload->function->name, "main") == 0); + assert(overload->impl); + nir_emit_impl(overload->impl); + } +} + +void +vec4_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr) +{ + dst_reg *reg; + + switch (instr->intrinsic) { + case nir_intrinsic_load_vertex_id: + unreachable("should be lowered by lower_vertex_id()."); + + case nir_intrinsic_load_vertex_id_zero_base: + reg = &this->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE]; + if (reg->file == BAD_FILE) + *reg = + *this->make_reg_for_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE, + glsl_type::int_type); + break; + + case nir_intrinsic_load_base_vertex: + reg = &this->nir_system_values[SYSTEM_VALUE_BASE_VERTEX]; + if (reg->file == BAD_FILE) + *reg = *this->make_reg_for_system_value(SYSTEM_VALUE_BASE_VERTEX, + glsl_type::int_type); + break; + + case nir_intrinsic_load_instance_id: + reg = &this->nir_system_values[SYSTEM_VALUE_INSTANCE_ID]; + if (reg->file == BAD_FILE) + *reg = *this->make_reg_for_system_value(SYSTEM_VALUE_INSTANCE_ID, + glsl_type::int_type); + break; + + default: + break; + } +} + +static bool +setup_system_values_block(nir_block *block, void *void_visitor) +{ + vec4_visitor *v = (vec4_visitor *)void_visitor; + + nir_foreach_instr(block, instr) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + v->nir_setup_system_value_intrinsic(intrin); + } + + return true; +} + +void +vec4_visitor::nir_setup_system_values(nir_shader *shader) +{ + nir_system_values = ralloc_array(mem_ctx, dst_reg, SYSTEM_VALUE_MAX); + + nir_foreach_overload(shader, overload) { + assert(strcmp(overload->function->name, "main") == 0); + assert(overload->impl); + nir_foreach_block(overload->impl, setup_system_values_block, this); + } +} + +void +vec4_visitor::nir_setup_inputs(nir_shader *shader) +{ + nir_inputs = ralloc_array(mem_ctx, src_reg, shader->num_inputs); + + foreach_list_typed(nir_variable, var, node, &shader->inputs) { + int offset = var->data.driver_location; + unsigned size = type_size(var->type); + for (unsigned i = 0; i < size; i++) { + src_reg src = src_reg(ATTR, var->data.location + i, var->type); + nir_inputs[offset + i] = src; + } + } +} + +void +vec4_visitor::nir_setup_uniforms(nir_shader *shader) +{ + uniforms = 0; + + nir_uniform_driver_location = + rzalloc_array(mem_ctx, unsigned, this->uniform_array_size); + + if (shader_prog) { + foreach_list_typed(nir_variable, var, node, &shader->uniforms) { + /* UBO's, atomics and samplers don't take up space in the 
+ uniform file */ + if (var->interface_type != NULL || var->type->contains_atomic() || + type_size(var->type) == 0) { + continue; + } + + assert(uniforms < uniform_array_size); + this->uniform_size[uniforms] = type_size(var->type); + + if (strncmp(var->name, "gl_", 3) == 0) + nir_setup_builtin_uniform(var); + else + nir_setup_uniform(var); + } + } else { + /* For ARB_vertex_program, only a single "parameters" variable is + * generated to support uniform data. + */ + nir_variable *var = (nir_variable *) shader->uniforms.get_head(); + assert(shader->uniforms.length() == 1 && + strcmp(var->name, "parameters") == 0); + + assert(uniforms < uniform_array_size); + this->uniform_size[uniforms] = type_size(var->type); + + struct gl_program_parameter_list *plist = prog->Parameters; + for (unsigned p = 0; p < plist->NumParameters; p++) { + uniform_vector_size[uniforms] = plist->Parameters[p].Size; + + /* Parameters should be either vec4 uniforms or single component + * constants; matrices and other larger types should have been broken + * down earlier. + */ + assert(uniform_vector_size[uniforms] <= 4); + + int i; + for (i = 0; i < uniform_vector_size[uniforms]; i++) { + stage_prog_data->param[uniforms * 4 + i] = &plist->ParameterValues[p][i]; + } + for (; i < 4; i++) { + static const gl_constant_value zero = { 0.0 }; + stage_prog_data->param[uniforms * 4 + i] = &zero; + } + + nir_uniform_driver_location[uniforms] = var->data.driver_location; + uniforms++; + } + } +} + +void +vec4_visitor::nir_setup_uniform(nir_variable *var) +{ + int namelen = strlen(var->name); + + /* The data for our (non-builtin) uniforms is stored in a series of + * gl_uniform_driver_storage structs for each subcomponent that + * glGetUniformLocation() could name. We know it's been set up in the same + * order we'd walk the type, so walk the list of storage and find anything + * with our name, or the prefix of a component that starts with our name. + */ + for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) { + struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u]; + + if (storage->builtin) + continue; + + if (strncmp(var->name, storage->name, namelen) != 0 || + (storage->name[namelen] != 0 && + storage->name[namelen] != '.' && + storage->name[namelen] != '[')) { + continue; + } + + gl_constant_value *components = storage->storage; + unsigned vector_count = (MAX2(storage->array_elements, 1) * + storage->type->matrix_columns); + + for (unsigned s = 0; s < vector_count; s++) { + assert(uniforms < uniform_array_size); + uniform_vector_size[uniforms] = storage->type->vector_elements; + + int i; + for (i = 0; i < uniform_vector_size[uniforms]; i++) { + stage_prog_data->param[uniforms * 4 + i] = components; + components++; + } + for (; i < 4; i++) { + static const gl_constant_value zero = { 0.0 }; + stage_prog_data->param[uniforms * 4 + i] = &zero; + } + + nir_uniform_driver_location[uniforms] = var->data.driver_location; + uniforms++; + } + } +} + +void +vec4_visitor::nir_setup_builtin_uniform(nir_variable *var) +{ + const nir_state_slot *const slots = var->state_slots; + assert(var->state_slots != NULL); + + for (unsigned int i = 0; i < var->num_state_slots; i++) { + /* This state reference has already been setup by ir_to_mesa, + * but we'll get the same index back here. We can reference + * ParameterValues directly, since unlike brw_fs.cpp, we never + * add new state references during compile. 
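A minimal sketch of the vec4 slot packing performed by the uniform setup above, with simplified stand-in types; param_slot and pack_uniform() are illustrative names, not driver API. Each uniform vector claims a full four-component slot in the param array, and the unused trailing components all point at one shared zero constant, mirroring the padding loops in nir_setup_uniforms()/nir_setup_uniform().

#include <cstdio>
#include <vector>

struct param_slot {
   const float *comp[4];
};

static const float zero = 0.0f;

/* Pack one uniform vector of `vector_elements` components into a vec4
 * slot, padding the remainder with the shared zero constant.
 */
static void pack_uniform(std::vector<param_slot> &params,
                         const float *values, unsigned vector_elements)
{
   param_slot slot;
   unsigned i;
   for (i = 0; i < vector_elements; i++)
      slot.comp[i] = &values[i];
   for (; i < 4; i++)
      slot.comp[i] = &zero;
   params.push_back(slot);
}

int main()
{
   std::vector<param_slot> params;
   const float color[3] = { 1.0f, 0.5f, 0.25f };

   pack_uniform(params, color, 3);   /* a vec3 still occupies one full slot */

   printf("slot 0 = (%g, %g, %g, %g)\n",
          *params[0].comp[0], *params[0].comp[1],
          *params[0].comp[2], *params[0].comp[3]);
   return 0;
}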
+ */ + int index = _mesa_add_state_reference(this->prog->Parameters, + (gl_state_index *)slots[i].tokens); + gl_constant_value *values = + &this->prog->Parameters->ParameterValues[index][0]; + + assert(uniforms < uniform_array_size); + + for (unsigned j = 0; j < 4; j++) + stage_prog_data->param[uniforms * 4 + j] = + &values[GET_SWZ(slots[i].swizzle, j)]; + + this->uniform_vector_size[uniforms] = + (var->type->is_scalar() || var->type->is_vector() || + var->type->is_matrix() ? var->type->vector_elements : 4); + + nir_uniform_driver_location[uniforms] = var->data.driver_location; + uniforms++; + } +} + +void +vec4_visitor::nir_emit_impl(nir_function_impl *impl) +{ + nir_locals = ralloc_array(mem_ctx, dst_reg, impl->reg_alloc); + + foreach_list_typed(nir_register, reg, node, &impl->registers) { + unsigned array_elems = + reg->num_array_elems == 0 ? 1 : reg->num_array_elems; + + nir_locals[reg->index] = dst_reg(GRF, alloc.allocate(array_elems)); + } + + nir_ssa_values = ralloc_array(mem_ctx, dst_reg, impl->ssa_alloc); + + nir_emit_cf_list(&impl->body); +} + +void +vec4_visitor::nir_emit_cf_list(exec_list *list) +{ + exec_list_validate(list); + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_if: + nir_emit_if(nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + nir_emit_loop(nir_cf_node_as_loop(node)); + break; + + case nir_cf_node_block: + nir_emit_block(nir_cf_node_as_block(node)); + break; + + default: + unreachable("Invalid CFG node block"); + } + } +} + +void +vec4_visitor::nir_emit_if(nir_if *if_stmt) +{ + /* First, put the condition in f0 */ + src_reg condition = get_nir_src(if_stmt->condition, BRW_REGISTER_TYPE_D, 1); + vec4_instruction *inst = emit(MOV(dst_null_d(), condition)); + inst->conditional_mod = BRW_CONDITIONAL_NZ; + + emit(IF(BRW_PREDICATE_NORMAL)); + + nir_emit_cf_list(&if_stmt->then_list); + + /* note: if the else is empty, dead CF elimination will remove it */ + emit(BRW_OPCODE_ELSE); + + nir_emit_cf_list(&if_stmt->else_list); + + emit(BRW_OPCODE_ENDIF); +} + +void +vec4_visitor::nir_emit_loop(nir_loop *loop) +{ + emit(BRW_OPCODE_DO); + + nir_emit_cf_list(&loop->body); + + emit(BRW_OPCODE_WHILE); +} + +void +vec4_visitor::nir_emit_block(nir_block *block) +{ + nir_foreach_instr(block, instr) { + nir_emit_instr(instr); + } +} + +void +vec4_visitor::nir_emit_instr(nir_instr *instr) +{ + this->base_ir = instr; + + switch (instr->type) { + case nir_instr_type_load_const: + nir_emit_load_const(nir_instr_as_load_const(instr)); + break; + + case nir_instr_type_intrinsic: + nir_emit_intrinsic(nir_instr_as_intrinsic(instr)); + break; + + case nir_instr_type_alu: + nir_emit_alu(nir_instr_as_alu(instr)); + break; + + case nir_instr_type_jump: + nir_emit_jump(nir_instr_as_jump(instr)); + break; + + case nir_instr_type_tex: + nir_emit_texture(nir_instr_as_tex(instr)); + break; + + default: + fprintf(stderr, "VS instruction not yet implemented by NIR->vec4\n"); + break; + } +} + +static dst_reg +dst_reg_for_nir_reg(vec4_visitor *v, nir_register *nir_reg, + unsigned base_offset, nir_src *indirect) +{ + dst_reg reg; + + reg = v->nir_locals[nir_reg->index]; + reg = offset(reg, base_offset); + if (indirect) { + reg.reladdr = + new(v->mem_ctx) src_reg(v->get_nir_src(*indirect, + BRW_REGISTER_TYPE_D, + 1)); + } + return reg; +} + +dst_reg +vec4_visitor::get_nir_dest(nir_dest dest) +{ + assert(!dest.is_ssa); + return dst_reg_for_nir_reg(this, dest.reg.reg, dest.reg.base_offset, + dest.reg.indirect); +} + +dst_reg 
+vec4_visitor::get_nir_dest(nir_dest dest, enum brw_reg_type type) +{ + return retype(get_nir_dest(dest), type); +} + +dst_reg +vec4_visitor::get_nir_dest(nir_dest dest, nir_alu_type type) +{ + return get_nir_dest(dest, brw_type_for_nir_type(type)); +} + +src_reg +vec4_visitor::get_nir_src(nir_src src, enum brw_reg_type type, + unsigned num_components) +{ + dst_reg reg; + + if (src.is_ssa) { + assert(src.ssa != NULL); + reg = nir_ssa_values[src.ssa->index]; + } + else { + reg = dst_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset, + src.reg.indirect); + } + + reg = retype(reg, type); + + src_reg reg_as_src = src_reg(reg); + reg_as_src.swizzle = brw_swizzle_for_size(num_components); + return reg_as_src; +} + +src_reg +vec4_visitor::get_nir_src(nir_src src, nir_alu_type type, + unsigned num_components) +{ + return get_nir_src(src, brw_type_for_nir_type(type), num_components); +} + +src_reg +vec4_visitor::get_nir_src(nir_src src, unsigned num_components) +{ + /* if type is not specified, default to signed int */ + return get_nir_src(src, nir_type_int, num_components); +} + +void +vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr) +{ + dst_reg reg = dst_reg(GRF, alloc.allocate(1)); + reg.type = BRW_REGISTER_TYPE_F; + + /* @FIXME: consider emitting vector operations to save some MOVs in + * cases where the components are representable in 8 bits. + * By now, we emit a MOV for each component. + */ + for (unsigned i = 0; i < instr->def.num_components; ++i) { + reg.writemask = 1 << i; + emit(MOV(reg, src_reg(instr->value.f[i]))); + } + + /* Set final writemask */ + reg.writemask = brw_writemask_for_size(instr->def.num_components); + + nir_ssa_values[instr->def.index] = reg; +} + +void +vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) +{ + dst_reg dest; + src_reg src; + + bool has_indirect = false; + + switch (instr->intrinsic) { + + case nir_intrinsic_load_input_indirect: + has_indirect = true; + /* fallthrough */ + case nir_intrinsic_load_input: { + int offset = instr->const_index[0]; + src = nir_inputs[offset]; + + if (has_indirect) { + dest.reladdr = new(mem_ctx) src_reg(get_nir_src(instr->src[0], + BRW_REGISTER_TYPE_D, + 1)); + } + dest = get_nir_dest(instr->dest, src.type); + dest.writemask = brw_writemask_for_size(instr->num_components); + + emit(MOV(dest, src)); + break; + } + + case nir_intrinsic_store_output_indirect: + has_indirect = true; + /* fallthrough */ + case nir_intrinsic_store_output: { + int varying = instr->const_index[0]; + + src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F, + instr->num_components); + dest = dst_reg(src); + + if (has_indirect) { + dest.reladdr = new(mem_ctx) src_reg(get_nir_src(instr->src[1], + BRW_REGISTER_TYPE_D, + 1)); + } + output_reg[varying] = dest; + break; + } + + case nir_intrinsic_load_vertex_id: + unreachable("should be lowered by lower_vertex_id()"); + + case nir_intrinsic_load_vertex_id_zero_base: { + src_reg vertex_id = + src_reg(nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE]); + assert(vertex_id.file != BAD_FILE); + dest = get_nir_dest(instr->dest, vertex_id.type); + emit(MOV(dest, vertex_id)); + break; + } + + case nir_intrinsic_load_base_vertex: { + src_reg base_vertex = + src_reg(nir_system_values[SYSTEM_VALUE_BASE_VERTEX]); + assert(base_vertex.file != BAD_FILE); + dest = get_nir_dest(instr->dest, base_vertex.type); + emit(MOV(dest, base_vertex)); + break; + } + + case nir_intrinsic_load_instance_id: { + src_reg instance_id = + src_reg(nir_system_values[SYSTEM_VALUE_INSTANCE_ID]); + 
assert(instance_id.file != BAD_FILE); + dest = get_nir_dest(instr->dest, instance_id.type); + emit(MOV(dest, instance_id)); + break; + } + + case nir_intrinsic_load_uniform_indirect: + has_indirect = true; + /* fallthrough */ + case nir_intrinsic_load_uniform: { + int uniform = instr->const_index[0]; + + dest = get_nir_dest(instr->dest); + + if (has_indirect) { + /* Split addressing into uniform and offset */ + int offset = uniform - nir_uniform_driver_location[uniform]; + assert(offset >= 0); + + uniform -= offset; + assert(uniform >= 0); + + src = src_reg(dst_reg(UNIFORM, uniform)); + src.reg_offset = offset; + src_reg tmp = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_D, 1); + src.reladdr = new(mem_ctx) src_reg(tmp); + } else { + src = src_reg(dst_reg(UNIFORM, uniform)); + } + + emit(MOV(dest, src)); + break; + } + + case nir_intrinsic_atomic_counter_read: + case nir_intrinsic_atomic_counter_inc: + case nir_intrinsic_atomic_counter_dec: { + unsigned surf_index = prog_data->base.binding_table.abo_start + + (unsigned) instr->const_index[0]; + src_reg offset = get_nir_src(instr->src[0], nir_type_int, + instr->num_components); + dest = get_nir_dest(instr->dest); + + switch (instr->intrinsic) { + case nir_intrinsic_atomic_counter_inc: + emit_untyped_atomic(BRW_AOP_INC, surf_index, dest, offset, + src_reg(), src_reg()); + break; + case nir_intrinsic_atomic_counter_dec: + emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dest, offset, + src_reg(), src_reg()); + break; + case nir_intrinsic_atomic_counter_read: + emit_untyped_surface_read(surf_index, dest, offset); + break; + default: + unreachable("Unreachable"); + } + + brw_mark_surface_used(stage_prog_data, surf_index); + break; + } + + case nir_intrinsic_load_ubo_indirect: + has_indirect = true; + /* fallthrough */ + case nir_intrinsic_load_ubo: { + nir_const_value *const_block_index = nir_src_as_const_value(instr->src[0]); + src_reg surf_index; + + dest = get_nir_dest(instr->dest); + + if (const_block_index) { + /* The block index is a constant, so just emit the binding table entry + * as an immediate. + */ + surf_index = src_reg(prog_data->base.binding_table.ubo_start + + const_block_index->u[0]); + } else { + /* The block index is not a constant. Evaluate the index expression + * per-channel and add the base UBO index; we have to select a value + * from any live channel. + */ + surf_index = src_reg(this, glsl_type::uint_type); + emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], nir_type_int, + instr->num_components), + src_reg(prog_data->base.binding_table.ubo_start))); + surf_index = emit_uniformize(surf_index); + + /* Assume this may touch any UBO. It would be nice to provide + * a tighter bound, but the array information is already lowered away. 
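The constant-offset load_ubo path that follows converts the byte offset into a vec4-granular pull-constant offset plus the component used for the replicated swizzle; the indirect path computes the same division with an SHR by 4. A minimal sketch of that arithmetic (split_ubo_offset() is an illustrative name, not a driver helper):

#include <cassert>
#include <cstdio>

/* A vec4 pull-constant load fetches 16 bytes at a time, so a UBO byte
 * offset decomposes into which vec4 to fetch and which dword of that
 * vec4 holds the first component of interest.
 */
static void split_ubo_offset(unsigned byte_offset,
                             unsigned *vec4_offset, unsigned *component)
{
   *vec4_offset = byte_offset / 16;       /* what the SHR by 4 computes */
   *component   = byte_offset % 16 / 4;   /* dword within the fetched vec4 */
}

int main()
{
   unsigned vec4_offset, component;

   split_ubo_offset(28, &vec4_offset, &component);
   assert(vec4_offset == 1 && component == 3);   /* byte 28 -> vec4 1, .w */

   split_ubo_offset(4, &vec4_offset, &component);
   assert(vec4_offset == 0 && component == 1);   /* byte 4 -> vec4 0, .y */

   printf("ok\n");
   return 0;
}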
+ */ + brw_mark_surface_used(&prog_data->base, + prog_data->base.binding_table.ubo_start + + shader_prog->NumUniformBlocks - 1); + } + + unsigned const_offset = instr->const_index[0]; + src_reg offset; + + if (!has_indirect) { + offset = src_reg(const_offset / 16); + } else { + offset = src_reg(this, glsl_type::uint_type); + emit(SHR(dst_reg(offset), get_nir_src(instr->src[1], nir_type_int, 1), + src_reg(4u))); + } + + src_reg packed_consts = src_reg(this, glsl_type::vec4_type); + packed_consts.type = dest.type; + + emit_pull_constant_load_reg(dst_reg(packed_consts), + surf_index, + offset, + NULL, NULL /* before_block/inst */); + + packed_consts.swizzle = brw_swizzle_for_size(instr->num_components); + packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4, + const_offset % 16 / 4, + const_offset % 16 / 4, + const_offset % 16 / 4); + + emit(MOV(dest, packed_consts)); + break; + } + + default: + unreachable("Unknown intrinsic"); + } +} + +static unsigned +brw_swizzle_for_nir_swizzle(uint8_t swizzle[4]) +{ + return BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]); +} + +static enum brw_conditional_mod +brw_conditional_for_nir_comparison(nir_op op) +{ + switch (op) { + case nir_op_flt: + case nir_op_ilt: + case nir_op_ult: + return BRW_CONDITIONAL_L; + + case nir_op_fge: + case nir_op_ige: + case nir_op_uge: + return BRW_CONDITIONAL_GE; + + case nir_op_feq: + case nir_op_ieq: + case nir_op_ball_fequal2: + case nir_op_ball_iequal2: + case nir_op_ball_fequal3: + case nir_op_ball_iequal3: + case nir_op_ball_fequal4: + case nir_op_ball_iequal4: + return BRW_CONDITIONAL_Z; + + case nir_op_fne: + case nir_op_ine: + case nir_op_bany_fnequal2: + case nir_op_bany_inequal2: + case nir_op_bany_fnequal3: + case nir_op_bany_inequal3: + case nir_op_bany_fnequal4: + case nir_op_bany_inequal4: + return BRW_CONDITIONAL_NZ; + + default: + unreachable("not reached: bad operation for comparison"); + } +} + +void +vec4_visitor::nir_emit_alu(nir_alu_instr *instr) +{ + vec4_instruction *inst; + + dst_reg dst = get_nir_dest(instr->dest.dest, + nir_op_infos[instr->op].output_type); + dst.writemask = instr->dest.write_mask; + + src_reg op[4]; + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + op[i] = get_nir_src(instr->src[i].src, + nir_op_infos[instr->op].input_types[i], 4); + op[i].swizzle = brw_swizzle_for_nir_swizzle(instr->src[i].swizzle); + op[i].abs = instr->src[i].abs; + op[i].negate = instr->src[i].negate; + } + + switch (instr->op) { + case nir_op_imov: + case nir_op_fmov: + inst = emit(MOV(dst, op[0])); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + unreachable("not reached: should be handled by lower_vec_to_movs()"); + + case nir_op_i2f: + case nir_op_u2f: + inst = emit(MOV(dst, op[0])); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_f2i: + case nir_op_f2u: + inst = emit(MOV(dst, op[0])); + break; + + case nir_op_fadd: + /* fall through */ + case nir_op_iadd: + inst = emit(ADD(dst, op[0], op[1])); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fmul: + inst = emit(MUL(dst, op[0], op[1])); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_imul: { + if (devinfo->gen < 8) { + nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src); + nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src); + + /* For integer multiplication, the MUL uses the low 16 bits of one of + * the operands (src0 through SNB, src1 on IVB and later). 
The MACH + * accumulates in the contribution of the upper 16 bits of that + * operand. If we can determine that one of the args is in the low + * 16 bits, though, we can just emit a single MUL. + */ + if (value0 && value0->u[0] < (1 << 16)) { + if (devinfo->gen < 7) + emit(MUL(dst, op[0], op[1])); + else + emit(MUL(dst, op[1], op[0])); + } else if (value1 && value1->u[0] < (1 << 16)) { + if (devinfo->gen < 7) + emit(MUL(dst, op[1], op[0])); + else + emit(MUL(dst, op[0], op[1])); + } else { + struct brw_reg acc = retype(brw_acc_reg(8), dst.type); + + emit(MUL(acc, op[0], op[1])); + emit(MACH(dst_null_d(), op[0], op[1])); + emit(MOV(dst, src_reg(acc))); + } + } else { + emit(MUL(dst, op[0], op[1])); + } + break; + } + + case nir_op_imul_high: + case nir_op_umul_high: { + struct brw_reg acc = retype(brw_acc_reg(8), dst.type); + + emit(MUL(acc, op[0], op[1])); + emit(MACH(dst, op[0], op[1])); + break; + } + + case nir_op_frcp: + inst = emit_math(SHADER_OPCODE_RCP, dst, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fexp2: + inst = emit_math(SHADER_OPCODE_EXP2, dst, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_flog2: + inst = emit_math(SHADER_OPCODE_LOG2, dst, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fsin: + inst = emit_math(SHADER_OPCODE_SIN, dst, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fcos: + inst = emit_math(SHADER_OPCODE_COS, dst, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_idiv: + case nir_op_udiv: + emit_math(SHADER_OPCODE_INT_QUOTIENT, dst, op[0], op[1]); + break; + + case nir_op_umod: + emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]); + break; + + case nir_op_ldexp: + unreachable("not reached: should be handled by ldexp_to_arith()"); + + case nir_op_fsqrt: + inst = emit_math(SHADER_OPCODE_SQRT, dst, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_frsq: + inst = emit_math(SHADER_OPCODE_RSQ, dst, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fpow: + inst = emit_math(SHADER_OPCODE_POW, dst, op[0], op[1]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_uadd_carry: { + struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD); + + emit(ADDC(dst_null_ud(), op[0], op[1])); + emit(MOV(dst, src_reg(acc))); + break; + } + + case nir_op_usub_borrow: { + struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD); + + emit(SUBB(dst_null_ud(), op[0], op[1])); + emit(MOV(dst, src_reg(acc))); + break; + } + + case nir_op_ftrunc: + inst = emit(RNDZ(dst, op[0])); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fceil: { + src_reg tmp = src_reg(this, glsl_type::float_type); + tmp.swizzle = + brw_swizzle_for_size(instr->src[0].src.is_ssa ? 
+ instr->src[0].src.ssa->num_components : + instr->src[0].src.reg.reg->num_components); + + op[0].negate = !op[0].negate; + emit(RNDD(dst_reg(tmp), op[0])); + tmp.negate = true; + inst = emit(MOV(dst, tmp)); + inst->saturate = instr->dest.saturate; + break; + } + + case nir_op_ffloor: + inst = emit(RNDD(dst, op[0])); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_ffract: + inst = emit(FRC(dst, op[0])); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fround_even: + inst = emit(RNDE(dst, op[0])); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fmin: + case nir_op_imin: + case nir_op_umin: + inst = emit_minmax(BRW_CONDITIONAL_L, dst, op[0], op[1]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fmax: + case nir_op_imax: + case nir_op_umax: + inst = emit_minmax(BRW_CONDITIONAL_GE, dst, op[0], op[1]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fddx: + case nir_op_fddx_coarse: + case nir_op_fddx_fine: + case nir_op_fddy: + case nir_op_fddy_coarse: + case nir_op_fddy_fine: + unreachable("derivatives are not valid in vertex shaders"); + + case nir_op_flt: + case nir_op_ilt: + case nir_op_ult: + case nir_op_fge: + case nir_op_ige: + case nir_op_uge: + case nir_op_feq: + case nir_op_ieq: + case nir_op_fne: + case nir_op_ine: + emit(CMP(dst, op[0], op[1], + brw_conditional_for_nir_comparison(instr->op))); + break; + + case nir_op_ball_fequal2: + case nir_op_ball_iequal2: + case nir_op_ball_fequal3: + case nir_op_ball_iequal3: + case nir_op_ball_fequal4: + case nir_op_ball_iequal4: { + dst_reg tmp = dst_reg(this, glsl_type::bool_type); + + switch (instr->op) { + case nir_op_ball_fequal2: + case nir_op_ball_iequal2: + tmp.writemask = WRITEMASK_XY; + break; + case nir_op_ball_fequal3: + case nir_op_ball_iequal3: + tmp.writemask = WRITEMASK_XYZ; + break; + case nir_op_ball_fequal4: + case nir_op_ball_iequal4: + tmp.writemask = WRITEMASK_XYZW; + break; + default: + unreachable("not reached"); + } + + emit(CMP(tmp, op[0], op[1], + brw_conditional_for_nir_comparison(instr->op))); + emit(MOV(dst, src_reg(0))); + inst = emit(MOV(dst, src_reg(~0))); + inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H; + break; + } + + case nir_op_bany_fnequal2: + case nir_op_bany_inequal2: + case nir_op_bany_fnequal3: + case nir_op_bany_inequal3: + case nir_op_bany_fnequal4: + case nir_op_bany_inequal4: { + dst_reg tmp = dst_reg(this, glsl_type::bool_type); + + switch (instr->op) { + case nir_op_bany_fnequal2: + case nir_op_bany_inequal2: + tmp.writemask = WRITEMASK_XY; + break; + case nir_op_bany_fnequal3: + case nir_op_bany_inequal3: + tmp.writemask = WRITEMASK_XYZ; + break; + case nir_op_bany_fnequal4: + case nir_op_bany_inequal4: + tmp.writemask = WRITEMASK_XYZW; + break; + default: + unreachable("not reached"); + } + + emit(CMP(tmp, op[0], op[1], + brw_conditional_for_nir_comparison(instr->op))); + + emit(MOV(dst, src_reg(0))); + inst = emit(MOV(dst, src_reg(~0))); + inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H; + break; + } + + case nir_op_inot: + if (devinfo->gen >= 8) { + op[0] = resolve_source_modifiers(op[0]); + } + emit(NOT(dst, op[0])); + break; + + case nir_op_ixor: + if (devinfo->gen >= 8) { + op[0] = resolve_source_modifiers(op[0]); + op[1] = resolve_source_modifiers(op[1]); + } + emit(XOR(dst, op[0], op[1])); + break; + + case nir_op_ior: + if (devinfo->gen >= 8) { + op[0] = resolve_source_modifiers(op[0]); + op[1] = resolve_source_modifiers(op[1]); + } + emit(OR(dst, op[0], op[1])); + break; + + case nir_op_iand: + if 
(devinfo->gen >= 8) { + op[0] = resolve_source_modifiers(op[0]); + op[1] = resolve_source_modifiers(op[1]); + } + emit(AND(dst, op[0], op[1])); + break; + + case nir_op_b2i: + emit(AND(dst, op[0], src_reg(1))); + break; + + case nir_op_b2f: + op[0].type = BRW_REGISTER_TYPE_D; + dst.type = BRW_REGISTER_TYPE_D; + emit(AND(dst, op[0], src_reg(0x3f800000u))); + dst.type = BRW_REGISTER_TYPE_F; + break; + + case nir_op_f2b: + emit(CMP(dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ)); + break; + + case nir_op_i2b: + emit(CMP(dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ)); + break; + + case nir_op_fnoise1_1: + case nir_op_fnoise1_2: + case nir_op_fnoise1_3: + case nir_op_fnoise1_4: + case nir_op_fnoise2_1: + case nir_op_fnoise2_2: + case nir_op_fnoise2_3: + case nir_op_fnoise2_4: + case nir_op_fnoise3_1: + case nir_op_fnoise3_2: + case nir_op_fnoise3_3: + case nir_op_fnoise3_4: + case nir_op_fnoise4_1: + case nir_op_fnoise4_2: + case nir_op_fnoise4_3: + case nir_op_fnoise4_4: + unreachable("not reached: should be handled by lower_noise"); + + case nir_op_unpack_half_2x16_split_x: + case nir_op_unpack_half_2x16_split_y: + case nir_op_pack_half_2x16_split: + unreachable("not reached: should not occur in vertex shader"); + + case nir_op_unpack_snorm_2x16: + case nir_op_unpack_unorm_2x16: + case nir_op_pack_snorm_2x16: + case nir_op_pack_unorm_2x16: + unreachable("not reached: should be handled by lower_packing_builtins"); + + case nir_op_unpack_half_2x16: + /* As NIR does not guarantee that we have a correct swizzle outside the + * boundaries of a vector, and the implementation of emit_unpack_half_2x16 + * uses the source operand in an operation with WRITEMASK_Y while our + * source operand has only size 1, it accessed incorrect data producing + * regressions in Piglit. We repeat the swizzle of the first component on the + * rest of components to avoid regressions. In the vec4_visitor IR code path + * this is not needed because the operand has already the correct swizzle. + */ + op[0].swizzle = brw_compose_swizzle(BRW_SWIZZLE_XXXX, op[0].swizzle); + emit_unpack_half_2x16(dst, op[0]); + break; + + case nir_op_pack_half_2x16: + emit_pack_half_2x16(dst, op[0]); + break; + + case nir_op_unpack_unorm_4x8: + emit_unpack_unorm_4x8(dst, op[0]); + break; + + case nir_op_pack_unorm_4x8: + emit_pack_unorm_4x8(dst, op[0]); + break; + + case nir_op_unpack_snorm_4x8: + emit_unpack_snorm_4x8(dst, op[0]); + break; + + case nir_op_pack_snorm_4x8: + emit_pack_snorm_4x8(dst, op[0]); + break; + + case nir_op_bitfield_reverse: + emit(BFREV(dst, op[0])); + break; + + case nir_op_bit_count: + emit(CBIT(dst, op[0])); + break; + + case nir_op_ufind_msb: + case nir_op_ifind_msb: { + src_reg temp = src_reg(this, glsl_type::uint_type); + + inst = emit(FBH(dst_reg(temp), op[0])); + inst->dst.writemask = WRITEMASK_XYZW; + + /* FBH counts from the MSB side, while GLSL's findMSB() wants the count + * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then + * subtract the result from 31 to convert the MSB count into an LSB count. + */ + + /* FBH only supports UD type for dst, so use a MOV to convert UD to D. 
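The 31 - result conversion described in the comment above can be sanity-checked with a small stand-alone program for unsigned inputs; __builtin_clz stands in for the FBH opcode (both count from the MSB side) and find_msb_ref() is a hypothetical reference, not a driver function. The x == 0 case is not exercised: there FBH returns 0xFFFFFFFF and the predicated ADD below leaves the -1 in place.

#include <cassert>
#include <cstdint>

/* Reference findMSB() for unsigned values: LSB-relative index of the most
 * significant set bit, or -1 for zero, as GLSL specifies.
 */
static int find_msb_ref(uint32_t x)
{
   for (int i = 31; i >= 0; i--)
      if (x & (1u << i))
         return i;
   return -1;
}

int main()
{
   for (uint64_t v = 1; v <= 0xffffffffu; v = v * 3 + 1) {
      const uint32_t x = (uint32_t) v;
      const int from_msb = __builtin_clz(x);   /* stands in for FBH */

      /* Subtracting the MSB-side count from 31 yields the LSB-relative
       * index that findMSB() must return.
       */
      assert(31 - from_msb == find_msb_ref(x));
   }
   return 0;
}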
*/ + temp.swizzle = BRW_SWIZZLE_NOOP; + emit(MOV(dst, temp)); + + src_reg src_tmp = src_reg(dst); + emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ)); + + src_tmp.negate = true; + inst = emit(ADD(dst, src_tmp, src_reg(31))); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + } + + case nir_op_find_lsb: + emit(FBL(dst, op[0])); + break; + + case nir_op_ubitfield_extract: + case nir_op_ibitfield_extract: + op[0] = fix_3src_operand(op[0]); + op[1] = fix_3src_operand(op[1]); + op[2] = fix_3src_operand(op[2]); + + emit(BFE(dst, op[2], op[1], op[0])); + break; + + case nir_op_bfm: + emit(BFI1(dst, op[0], op[1])); + break; + + case nir_op_bfi: + op[0] = fix_3src_operand(op[0]); + op[1] = fix_3src_operand(op[1]); + op[2] = fix_3src_operand(op[2]); + + emit(BFI2(dst, op[0], op[1], op[2])); + break; + + case nir_op_bitfield_insert: + unreachable("not reached: should be handled by " + "lower_instructions::bitfield_insert_to_bfm_bfi"); + + case nir_op_fsign: + /* AND(val, 0x80000000) gives the sign bit. + * + * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not + * zero. + */ + emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ)); + + op[0].type = BRW_REGISTER_TYPE_UD; + dst.type = BRW_REGISTER_TYPE_UD; + emit(AND(dst, op[0], src_reg(0x80000000u))); + + inst = emit(OR(dst, src_reg(dst), src_reg(0x3f800000u))); + inst->predicate = BRW_PREDICATE_NORMAL; + dst.type = BRW_REGISTER_TYPE_F; + + if (instr->dest.saturate) { + inst = emit(MOV(dst, src_reg(dst))); + inst->saturate = true; + } + break; + + case nir_op_isign: + /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1). + * -> non-negative val generates 0x00000000. + * Predicated OR sets 1 if val is positive. + */ + emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G)); + emit(ASR(dst, op[0], src_reg(31))); + inst = emit(OR(dst, src_reg(dst), src_reg(1))); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + + case nir_op_ishl: + emit(SHL(dst, op[0], op[1])); + break; + + case nir_op_ishr: + emit(ASR(dst, op[0], op[1])); + break; + + case nir_op_ushr: + emit(SHR(dst, op[0], op[1])); + break; + + case nir_op_ffma: + op[0] = fix_3src_operand(op[0]); + op[1] = fix_3src_operand(op[1]); + op[2] = fix_3src_operand(op[2]); + + inst = emit(MAD(dst, op[2], op[1], op[0])); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_flrp: + inst = emit_lrp(dst, op[0], op[1], op[2]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_bcsel: + emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ)); + inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + + case nir_op_fdot2: + inst = emit(BRW_OPCODE_DP2, dst, op[0], op[1]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fdot3: + inst = emit(BRW_OPCODE_DP3, dst, op[0], op[1]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fdot4: + inst = emit(BRW_OPCODE_DP4, dst, op[0], op[1]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_bany2: + case nir_op_bany3: + case nir_op_bany4: { + dst_reg tmp = dst_reg(this, glsl_type::bool_type); + tmp.writemask = brw_writemask_for_size(nir_op_infos[instr->op].input_sizes[0]); + + emit(CMP(tmp, op[0], src_reg(0), BRW_CONDITIONAL_NZ)); + + emit(MOV(dst, src_reg(0))); + inst = emit(MOV(dst, src_reg(~0))); + inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H; + break; + } + + case nir_op_fabs: + case nir_op_iabs: + case nir_op_fneg: + case nir_op_ineg: + case nir_op_fsat: + unreachable("not 
reached: should be lowered by lower_source mods"); + + case nir_op_fdiv: + unreachable("not reached: should be lowered by DIV_TO_MUL_RCP in the compiler"); + + case nir_op_fmod: + unreachable("not reached: should be lowered by MOD_TO_FLOOR in the compiler"); + + case nir_op_fsub: + case nir_op_isub: + unreachable("not reached: should be handled by ir_sub_to_add_neg"); + + default: + unreachable("Unimplemented ALU operation"); + } + + /* If we need to do a boolean resolve, replace the result with -(x & 1) + * to sign extend the low bit to 0/~0 + */ + if (devinfo->gen <= 5 && + (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == + BRW_NIR_BOOLEAN_NEEDS_RESOLVE) { + dst_reg masked = dst_reg(this, glsl_type::int_type); + masked.writemask = dst.writemask; + emit(AND(masked, src_reg(dst), src_reg(1))); + src_reg masked_neg = src_reg(masked); + masked_neg.negate = true; + emit(MOV(retype(dst, BRW_REGISTER_TYPE_D), masked_neg)); + } +} + +void +vec4_visitor::nir_emit_jump(nir_jump_instr *instr) +{ + switch (instr->type) { + case nir_jump_break: + emit(BRW_OPCODE_BREAK); + break; + + case nir_jump_continue: + emit(BRW_OPCODE_CONTINUE); + break; + + case nir_jump_return: + /* fall through */ + default: + unreachable("unknown jump"); + } +} + +enum ir_texture_opcode +ir_texture_opcode_for_nir_texop(nir_texop texop) +{ + enum ir_texture_opcode op; + + switch (texop) { + case nir_texop_lod: op = ir_lod; break; + case nir_texop_query_levels: op = ir_query_levels; break; + case nir_texop_tex: op = ir_tex; break; + case nir_texop_tg4: op = ir_tg4; break; + case nir_texop_txb: op = ir_txb; break; + case nir_texop_txd: op = ir_txd; break; + case nir_texop_txf: op = ir_txf; break; + case nir_texop_txf_ms: op = ir_txf_ms; break; + case nir_texop_txl: op = ir_txl; break; + case nir_texop_txs: op = ir_txs; break; + default: + unreachable("unknown texture opcode"); + } + + return op; +} +const glsl_type * +glsl_type_for_nir_alu_type(nir_alu_type alu_type, + unsigned components) +{ + switch (alu_type) { + case nir_type_float: + return glsl_type::vec(components); + case nir_type_int: + return glsl_type::ivec(components); + case nir_type_unsigned: + return glsl_type::uvec(components); + case nir_type_bool: + return glsl_type::bvec(components); + default: + return glsl_type::error_type; + } + + return glsl_type::error_type; +} + +void +vec4_visitor::nir_emit_texture(nir_tex_instr *instr) +{ + unsigned sampler = instr->sampler_index; + src_reg sampler_reg = src_reg(sampler); + src_reg coordinate; + const glsl_type *coord_type = NULL; + src_reg shadow_comparitor; + src_reg offset_value; + src_reg lod, lod2; + src_reg sample_index; + src_reg mcs; + + const glsl_type *dest_type = + glsl_type_for_nir_alu_type(instr->dest_type, + nir_tex_instr_dest_size(instr)); + dst_reg dest = get_nir_dest(instr->dest, instr->dest_type); + + /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother + * emitting anything other than setting up the constant result. + */ + if (instr->op == nir_texop_tg4) { + int swiz = GET_SWZ(key->tex.swizzles[sampler], instr->component); + if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) { + emit(MOV(dest, src_reg(swiz == SWIZZLE_ONE ? 
1.0f : 0.0f))); + return; + } + } + + /* Load the texture operation sources */ + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_comparitor: + shadow_comparitor = get_nir_src(instr->src[i].src, + BRW_REGISTER_TYPE_F, 1); + break; + + case nir_tex_src_coord: { + unsigned src_size = nir_tex_instr_src_size(instr, i); + + switch (instr->op) { + case nir_texop_txf: + case nir_texop_txf_ms: + coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, + src_size); + coord_type = glsl_type::ivec(src_size); + break; + + default: + coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, + src_size); + coord_type = glsl_type::vec(src_size); + break; + } + break; + } + + case nir_tex_src_ddx: + lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, + nir_tex_instr_src_size(instr, i)); + break; + + case nir_tex_src_ddy: + lod2 = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, + nir_tex_instr_src_size(instr, i)); + break; + + case nir_tex_src_lod: + switch (instr->op) { + case nir_texop_txs: + case nir_texop_txf: + lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1); + break; + + default: + lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, 1); + break; + } + break; + + case nir_tex_src_ms_index: { + sample_index = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1); + assert(coord_type != NULL); + if (devinfo->gen >= 7 && + key->tex.compressed_multisample_layout_mask & (1<<sampler)) { + mcs = emit_mcs_fetch(coord_type, coordinate, sampler_reg); + } else { + mcs = src_reg(0u); + } + mcs = retype(mcs, BRW_REGISTER_TYPE_UD); + break; + } + + case nir_tex_src_offset: + offset_value = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 2); + break; + + case nir_tex_src_sampler_offset: { + /* The highest sampler which may be used by this operation is + * the last element of the array. Mark it here, because the generator + * doesn't have enough information to determine the bound. 
+ */ + uint32_t array_size = instr->sampler_array_size; + uint32_t max_used = sampler + array_size - 1; + if (instr->op == nir_texop_tg4) { + max_used += prog_data->base.binding_table.gather_texture_start; + } else { + max_used += prog_data->base.binding_table.texture_start; + } + + brw_mark_surface_used(&prog_data->base, max_used); + + /* Emit code to evaluate the actual indexing expression */ + src_reg src = get_nir_src(instr->src[i].src, 1); + src_reg temp(this, glsl_type::uint_type); + emit(ADD(dst_reg(temp), src, src_reg(sampler))); + sampler_reg = emit_uniformize(temp); + break; + } + + case nir_tex_src_projector: + unreachable("Should be lowered by do_lower_texture_projection"); + + case nir_tex_src_bias: + unreachable("LOD bias is not valid for vertex shaders.\n"); + + default: + unreachable("unknown texture source"); + } + } + + uint32_t constant_offset = 0; + for (unsigned i = 0; i < 3; i++) { + if (instr->const_offset[i] != 0) { + constant_offset = brw_texture_offset(instr->const_offset, 3); + break; + } + } + + /* Stuff the channel select bits in the top of the texture offset */ + if (instr->op == nir_texop_tg4) + constant_offset |= gather_channel(instr->component, sampler) << 16; + + ir_texture_opcode op = ir_texture_opcode_for_nir_texop(instr->op); + + bool is_cube_array = + instr->op == nir_texop_txs && + instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && + instr->is_array; + + emit_texture(op, dest, dest_type, coordinate, instr->coord_components, + shadow_comparitor, + lod, lod2, sample_index, + constant_offset, offset_value, + mcs, is_cube_array, sampler, sampler_reg); +} + +} diff --git a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp index 555c42e2f24..617c9889cad 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp @@ -280,15 +280,15 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill) */ foreach_block_and_inst(block, vec4_instruction, inst, cfg) { for (unsigned int i = 0; i < 3; i++) { - if (inst->src[i].file == GRF) { - spill_costs[inst->src[i].reg] += loop_scale; + if (inst->src[i].file == GRF) { + spill_costs[inst->src[i].reg] += loop_scale; if (inst->src[i].reladdr) no_spill[inst->src[i].reg] = true; - } + } } if (inst->dst.file == GRF) { - spill_costs[inst->dst.reg] += loop_scale; + spill_costs[inst->dst.reg] += loop_scale; if (inst->dst.reladdr) no_spill[inst->dst.reg] = true; } @@ -296,12 +296,12 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill) switch (inst->opcode) { case BRW_OPCODE_DO: - loop_scale *= 10; - break; + loop_scale *= 10; + break; case BRW_OPCODE_WHILE: - loop_scale /= 10; - break; + loop_scale /= 10; + break; case SHADER_OPCODE_GEN4_SCRATCH_READ: case SHADER_OPCODE_GEN4_SCRATCH_WRITE: @@ -309,12 +309,12 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill) if (inst->src[i].file == GRF) no_spill[inst->src[i].reg] = true; } - if (inst->dst.file == GRF) - no_spill[inst->dst.reg] = true; - break; + if (inst->dst.file == GRF) + no_spill[inst->dst.reg] = true; + break; default: - break; + break; } } } @@ -339,7 +339,7 @@ void vec4_visitor::spill_reg(int spill_reg_nr) { assert(alloc.sizes[spill_reg_nr] == 1); - unsigned int spill_offset = c->last_scratch++; + unsigned int spill_offset = last_scratch++; /* Generate spill/unspill instructions for the objects being spilled. 
*/ foreach_block_and_inst(block, vec4_instruction, inst, cfg) { diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 236fa51f92c..20b628e9192 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -287,7 +287,7 @@ vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements } src_reg -vec4_visitor::fix_3src_operand(src_reg src) +vec4_visitor::fix_3src_operand(const src_reg &src) { /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be * able to use vertical stride of zero to replicate the vec4 uniform, like @@ -313,7 +313,20 @@ vec4_visitor::fix_3src_operand(src_reg src) } src_reg -vec4_visitor::fix_math_operand(src_reg src) +vec4_visitor::resolve_source_modifiers(const src_reg &src) +{ + if (!src.abs && !src.negate) + return src; + + dst_reg resolved = dst_reg(this, glsl_type::ivec4_type); + resolved.type = src.type; + emit(MOV(resolved, src)); + + return src_reg(resolved); +} + +src_reg +vec4_visitor::fix_math_operand(const src_reg &src) { if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE) return src; @@ -338,7 +351,7 @@ vec4_visitor::fix_math_operand(src_reg src) return src_reg(expanded); } -void +vec4_instruction * vec4_visitor::emit_math(enum opcode opcode, const dst_reg &dst, const src_reg &src0, const src_reg &src1) @@ -350,11 +363,13 @@ vec4_visitor::emit_math(enum opcode opcode, /* MATH on Gen6 must be align1, so we can't do writemasks. */ math->dst = dst_reg(this, glsl_type::vec4_type); math->dst.type = dst.type; - emit(MOV(dst, src_reg(math->dst))); + math = emit(MOV(dst, src_reg(math->dst))); } else if (devinfo->gen < 6) { math->base_mrf = 1; math->mlen = src1.file == BAD_FILE ? 1 : 2; } + + return math; } void @@ -572,9 +587,18 @@ vec4_visitor::visit_instructions(const exec_list *list) } } - -static int -type_size(const struct glsl_type *type) +/** + * Returns the minimum number of vec4 elements needed to pack a type. + * + * For simple types, it will return 1 (a single vec4); for matrices, the + * number of columns; for array and struct, the sum of the vec4_size of + * each of its elements; and for sampler and atomic, zero. + * + * This method is useful to calculate how much register space is needed to + * store a particular type. + */ +int +vec4_visitor::type_size(const struct glsl_type *type) { unsigned int i; int size; @@ -603,6 +627,9 @@ type_size(const struct glsl_type *type) size += type_size(type->fields.structure[i].type); } return size; + case GLSL_TYPE_SUBROUTINE: + return 1; + case GLSL_TYPE_SAMPLER: /* Samplers take up no register space, since they're baked in at * link time. 
@@ -611,6 +638,7 @@ type_size(const struct glsl_type *type) case GLSL_TYPE_ATOMIC_UINT: return 0; case GLSL_TYPE_IMAGE: + return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4); case GLSL_TYPE_VOID: case GLSL_TYPE_DOUBLE: case GLSL_TYPE_ERROR: @@ -627,7 +655,7 @@ src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type) init(); this->file = GRF; - this->reg = v->alloc.allocate(type_size(type)); + this->reg = v->alloc.allocate(v->type_size(type)); if (type->is_array() || type->is_record()) { this->swizzle = BRW_SWIZZLE_NOOP; @@ -645,7 +673,7 @@ src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size) init(); this->file = GRF; - this->reg = v->alloc.allocate(type_size(type) * size); + this->reg = v->alloc.allocate(v->type_size(type) * size); this->swizzle = BRW_SWIZZLE_NOOP; @@ -657,7 +685,7 @@ dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type) init(); this->file = GRF; - this->reg = v->alloc.allocate(type_size(type)); + this->reg = v->alloc.allocate(v->type_size(type)); if (type->is_array() || type->is_record()) { this->writemask = WRITEMASK_XYZW; @@ -668,6 +696,21 @@ dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type) this->type = brw_type_for_base_type(type); } +void +vec4_visitor::setup_vector_uniform_values(const gl_constant_value *values, + unsigned n) +{ + static const gl_constant_value zero = { 0 }; + + for (unsigned i = 0; i < n; ++i) + stage_prog_data->param[4 * uniforms + i] = &values[i]; + + for (unsigned i = n; i < 4; ++i) + stage_prog_data->param[4 * uniforms + i] = &zero; + + uniform_vector_size[uniforms++] = n; +} + /* Our support for uniforms is piggy-backed on the struct * gl_fragment_program, because that's where the values actually * get stored, rather than in some global gl_shader_program uniform @@ -697,26 +740,13 @@ vec4_visitor::setup_uniform_values(ir_variable *ir) continue; } - gl_constant_value *components = storage->storage; - unsigned vector_count = (MAX2(storage->array_elements, 1) * - storage->type->matrix_columns); - - for (unsigned s = 0; s < vector_count; s++) { - assert(uniforms < uniform_array_size); - uniform_vector_size[uniforms] = storage->type->vector_elements; - - int i; - for (i = 0; i < uniform_vector_size[uniforms]; i++) { - stage_prog_data->param[uniforms * 4 + i] = components; - components++; - } - for (; i < 4; i++) { - static gl_constant_value zero = { 0.0 }; - stage_prog_data->param[uniforms * 4 + i] = &zero; - } + const unsigned vector_count = (MAX2(storage->array_elements, 1) * + storage->type->matrix_columns); + const unsigned vector_size = storage->type->vector_elements; - uniforms++; - } + for (unsigned s = 0; s < vector_count; s++) + setup_vector_uniform_values(&storage->storage[s * vector_size], + vector_size); } } @@ -1043,8 +1073,6 @@ vec4_visitor::visit(ir_variable *ir) for (int i = 0; i < type_size(ir->type); i++) { output_reg[ir->data.location + i] = *reg; output_reg[ir->data.location + i].reg_offset = i; - output_reg[ir->data.location + i].type = - brw_type_for_base_type(ir->type->get_scalar_type()); output_reg_annotation[ir->data.location + i] = ir->name; } break; @@ -1064,7 +1092,7 @@ vec4_visitor::visit(ir_variable *ir) * Some uniforms, such as samplers and atomic counters, have no actual * storage, so we should ignore them. 
*/ - if (ir->is_in_uniform_block() || type_size(ir->type) == 0) + if (ir->is_in_buffer_block() || type_size(ir->type) == 0) return; /* Track how big the whole uniform variable is, in case we need to put a @@ -1081,7 +1109,7 @@ vec4_visitor::visit(ir_variable *ir) break; case ir_var_system_value: - reg = make_reg_for_system_value(ir); + reg = make_reg_for_system_value(ir->data.location, ir->type); break; default: @@ -1253,7 +1281,7 @@ vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir) return true; } -void +vec4_instruction * vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst, src_reg src0, src_reg src1) { @@ -1268,9 +1296,11 @@ vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst, inst = emit(BRW_OPCODE_SEL, dst, src0, src1); inst->predicate = BRW_PREDICATE_NORMAL; } + + return inst; } -void +vec4_instruction * vec4_visitor::emit_lrp(const dst_reg &dst, const src_reg &x, const src_reg &y, const src_reg &a) { @@ -1278,8 +1308,8 @@ vec4_visitor::emit_lrp(const dst_reg &dst, /* Note that the instruction's argument order is reversed from GLSL * and the IR. */ - emit(LRP(dst, - fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x))); + return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y), + fix_3src_operand(x))); } else { /* Earlier generations don't support three source operations, so we * need to emit x*(1-a) + y*a. @@ -1294,7 +1324,7 @@ vec4_visitor::emit_lrp(const dst_reg &dst, emit(MUL(y_times_a, y, a)); emit(ADD(one_minus_a, negate(a), src_reg(1.0f))); emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a))); - emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a))); + return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a))); } } @@ -1375,15 +1405,19 @@ vec4_visitor::emit_pull_constant_load_reg(dst_reg dst, emit(pull); } -void -vec4_visitor::emit_uniformize(const dst_reg &dst, const src_reg &src) +src_reg +vec4_visitor::emit_uniformize(const src_reg &src) { const src_reg chan_index(this, glsl_type::uint_type); + const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type), + src.type); emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index)) ->force_writemask_all = true; emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index) ->force_writemask_all = true; + + return src_reg(dst); } void @@ -1555,6 +1589,10 @@ vec4_visitor::visit(ir_expression *ir) case ir_unop_noise: unreachable("not reached: should be handled by lower_noise"); + case ir_unop_subroutine_to_int: + emit(MOV(result_dst, op[0])); + break; + case ir_binop_add: emit(ADD(result_dst, op[0], op[1])); break; @@ -1602,20 +1640,13 @@ vec4_visitor::visit(ir_expression *ir) assert(ir->type->is_integer()); emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]); break; - case ir_binop_carry: { - struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD); - emit(ADDC(dst_null_ud(), op[0], op[1])); - emit(MOV(result_dst, src_reg(acc))); - break; - } - case ir_binop_borrow: { - struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD); + case ir_binop_carry: + unreachable("Should have been lowered by carry_to_arith()."); + + case ir_binop_borrow: + unreachable("Should have been lowered by borrow_to_arith()."); - emit(SUBB(dst_null_ud(), op[0], op[1])); - emit(MOV(result_dst, src_reg(acc))); - break; - } case ir_binop_mod: /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. 
*/ assert(ir->type->is_integer()); @@ -1734,16 +1765,11 @@ vec4_visitor::visit(ir_expression *ir) emit(MOV(result_dst, op[0])); break; case ir_unop_b2i: - emit(AND(result_dst, op[0], src_reg(1))); - break; case ir_unop_b2f: if (devinfo->gen <= 5) { resolve_bool_comparison(ir->operands[0], &op[0]); } - op[0].type = BRW_REGISTER_TYPE_D; - result_dst.type = BRW_REGISTER_TYPE_D; - emit(AND(result_dst, op[0], src_reg(0x3f800000u))); - result_dst.type = BRW_REGISTER_TYPE_F; + emit(MOV(result_dst, negate(op[0]))); break; case ir_unop_f2b: emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ)); @@ -1839,7 +1865,7 @@ vec4_visitor::visit(ir_expression *ir) surf_index = src_reg(this, glsl_type::uint_type); emit(ADD(dst_reg(surf_index), op[0], src_reg(prog_data->base.binding_table.ubo_start))); - emit_uniformize(dst_reg(surf_index), surf_index); + surf_index = emit_uniformize(surf_index); /* Assume this may touch any UBO. It would be nice to provide * a tighter bound, but the array information is already lowered away. @@ -2439,6 +2465,8 @@ vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir) emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset, src_reg(), src_reg()); } + + brw_mark_surface_used(stage_prog_data, surf_index); } void @@ -2456,7 +2484,8 @@ vec4_visitor::visit(ir_call *ir) } src_reg -vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler) +vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type, + src_reg coordinate, src_reg sampler) { vec4_instruction *inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS, @@ -2483,21 +2512,21 @@ vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler } /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */ - int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1; + int coord_mask = (1 << coordinate_type->vector_elements) - 1; int zero_mask = 0xf & ~coord_mask; - emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask), + emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask), coordinate)); - emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask), + emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask), src_reg(0))); emit(inst); return src_reg(inst->dst); } -static bool -is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler) +bool +vec4_visitor::is_high_sampler(src_reg sampler) { if (devinfo->gen < 8 && !devinfo->is_haswell) return false; @@ -2506,6 +2535,183 @@ is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler) } void +vec4_visitor::emit_texture(ir_texture_opcode op, + dst_reg dest, + const glsl_type *dest_type, + src_reg coordinate, + int coord_components, + src_reg shadow_comparitor, + src_reg lod, src_reg lod2, + src_reg sample_index, + uint32_t constant_offset, + src_reg offset_value, + src_reg mcs, + bool is_cube_array, + uint32_t sampler, + src_reg sampler_reg) +{ + enum opcode opcode; + switch (op) { + case ir_tex: opcode = SHADER_OPCODE_TXL; break; + case ir_txl: opcode = SHADER_OPCODE_TXL; break; + case ir_txd: opcode = SHADER_OPCODE_TXD; break; + case ir_txf: opcode = SHADER_OPCODE_TXF; break; + case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break; + case ir_txs: opcode = SHADER_OPCODE_TXS; break; + case ir_tg4: opcode = offset_value.file != BAD_FILE + ? 
SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break; + case ir_query_levels: opcode = SHADER_OPCODE_TXS; break; + case ir_txb: + unreachable("TXB is not valid for vertex shaders."); + case ir_lod: + unreachable("LOD is not valid for vertex shaders."); + default: + unreachable("Unrecognized tex op"); + } + + vec4_instruction *inst = new(mem_ctx) vec4_instruction( + opcode, dst_reg(this, dest_type)); + + inst->offset = constant_offset; + + /* The message header is necessary for: + * - Gen4 (always) + * - Gen9+ for selecting SIMD4x2 + * - Texel offsets + * - Gather channel selection + * - Sampler indices too large to fit in a 4-bit value. + */ + inst->header_size = + (devinfo->gen < 5 || devinfo->gen >= 9 || + inst->offset != 0 || op == ir_tg4 || + is_high_sampler(sampler_reg)) ? 1 : 0; + inst->base_mrf = 2; + inst->mlen = inst->header_size + 1; /* always at least one */ + inst->dst.writemask = WRITEMASK_XYZW; + inst->shadow_compare = shadow_comparitor.file != BAD_FILE; + + inst->src[1] = sampler_reg; + + /* MRF for the first parameter */ + int param_base = inst->base_mrf + inst->header_size; + + if (op == ir_txs || op == ir_query_levels) { + int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X; + emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod)); + } else { + /* Load the coordinate */ + /* FINISHME: gl_clamp_mask and saturate */ + int coord_mask = (1 << coord_components) - 1; + int zero_mask = 0xf & ~coord_mask; + + emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask), + coordinate)); + + if (zero_mask != 0) { + emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask), + src_reg(0))); + } + /* Load the shadow comparitor */ + if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) { + emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type, + WRITEMASK_X), + shadow_comparitor)); + inst->mlen++; + } + + /* Load the LOD info */ + if (op == ir_tex || op == ir_txl) { + int mrf, writemask; + if (devinfo->gen >= 5) { + mrf = param_base + 1; + if (shadow_comparitor.file != BAD_FILE) { + writemask = WRITEMASK_Y; + /* mlen already incremented */ + } else { + writemask = WRITEMASK_X; + inst->mlen++; + } + } else /* devinfo->gen == 4 */ { + mrf = param_base; + writemask = WRITEMASK_W; + } + lod.swizzle = BRW_SWIZZLE_XXXX; + emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod)); + } else if (op == ir_txf) { + emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod)); + } else if (op == ir_txf_ms) { + emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X), + sample_index)); + if (devinfo->gen >= 7) { + /* MCS data is in the first channel of `mcs`, but we need to get it into + * the .y channel of the second vec4 of params, so replicate .x across + * the whole vec4 and then mask off everything except .y + */ + mcs.swizzle = BRW_SWIZZLE_XXXX; + emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y), + mcs)); + } + inst->mlen++; + } else if (op == ir_txd) { + const brw_reg_type type = lod.type; + + if (devinfo->gen >= 5) { + lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y); + lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y); + emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod)); + emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2)); + inst->mlen++; + + if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) { + lod.swizzle = BRW_SWIZZLE_ZZZZ; + lod2.swizzle = BRW_SWIZZLE_ZZZZ; + 
emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod)); + emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2)); + inst->mlen++; + + if (shadow_comparitor.file != BAD_FILE) { + emit(MOV(dst_reg(MRF, param_base + 2, + shadow_comparitor.type, WRITEMASK_Z), + shadow_comparitor)); + } + } + } else /* devinfo->gen == 4 */ { + emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod)); + emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2)); + inst->mlen += 2; + } + } else if (op == ir_tg4 && offset_value.file != BAD_FILE) { + if (shadow_comparitor.file != BAD_FILE) { + emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W), + shadow_comparitor)); + } + + emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY), + offset_value)); + inst->mlen++; + } + } + + emit(inst); + + /* fixup num layers (z) for cube arrays: hardware returns faces * layers; + * spec requires layers. + */ + if (op == ir_txs && is_cube_array) { + emit_math(SHADER_OPCODE_INT_QUOTIENT, + writemask(inst->dst, WRITEMASK_Z), + src_reg(inst->dst), src_reg(6)); + } + + if (devinfo->gen == 6 && op == ir_tg4) { + emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst); + } + + swizzle_result(op, dest, + src_reg(inst->dst), sampler, dest_type); +} + +void vec4_visitor::visit(ir_texture *ir) { uint32_t sampler = @@ -2535,11 +2741,9 @@ vec4_visitor::visit(ir_texture *ir) /* Emit code to evaluate the actual indexing expression */ nonconst_sampler_index->accept(this); - dst_reg temp(this, glsl_type::uint_type); - emit(ADD(temp, this->result, src_reg(sampler))); - emit_uniformize(temp, src_reg(temp)); - - sampler_reg = src_reg(temp); + src_reg temp(this, glsl_type::uint_type); + emit(ADD(dst_reg(temp), this->result, src_reg(sampler))); + sampler_reg = emit_uniformize(temp); } else { /* Single sampler, or constant array index; the indexing expression * is just an immediate. @@ -2572,7 +2776,9 @@ vec4_visitor::visit(ir_texture *ir) * generating these values may involve SEND messages that need the MRFs. 
*/ src_reg coordinate; + int coord_components = 0; if (ir->coordinate) { + coord_components = ir->coordinate->type->vector_elements; ir->coordinate->accept(this); coordinate = this->result; } @@ -2590,42 +2796,35 @@ vec4_visitor::visit(ir_texture *ir) offset_value = src_reg(this->result); } - const glsl_type *lod_type = NULL, *sample_index_type = NULL; - src_reg lod, dPdx, dPdy, sample_index, mcs; + src_reg lod, lod2, sample_index, mcs; switch (ir->op) { case ir_tex: lod = src_reg(0.0f); - lod_type = glsl_type::float_type; break; case ir_txf: case ir_txl: case ir_txs: ir->lod_info.lod->accept(this); lod = this->result; - lod_type = ir->lod_info.lod->type; break; case ir_query_levels: lod = src_reg(0); - lod_type = glsl_type::int_type; break; case ir_txf_ms: ir->lod_info.sample_index->accept(this); sample_index = this->result; - sample_index_type = ir->lod_info.sample_index->type; if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler)) - mcs = emit_mcs_fetch(ir, coordinate, sampler_reg); + mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg); else mcs = src_reg(0u); break; case ir_txd: ir->lod_info.grad.dPdx->accept(this); - dPdx = this->result; + lod = this->result; ir->lod_info.grad.dPdy->accept(this); - dPdy = this->result; - - lod_type = ir->lod_info.grad.dPdx->type; + lod2 = this->result; break; case ir_txb: case ir_lod: @@ -2633,175 +2832,31 @@ vec4_visitor::visit(ir_texture *ir) break; } - enum opcode opcode; - switch (ir->op) { - case ir_tex: opcode = SHADER_OPCODE_TXL; break; - case ir_txl: opcode = SHADER_OPCODE_TXL; break; - case ir_txd: opcode = SHADER_OPCODE_TXD; break; - case ir_txf: opcode = SHADER_OPCODE_TXF; break; - case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break; - case ir_txs: opcode = SHADER_OPCODE_TXS; break; - case ir_tg4: opcode = has_nonconstant_offset - ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break; - case ir_query_levels: opcode = SHADER_OPCODE_TXS; break; - case ir_txb: - unreachable("TXB is not valid for vertex shaders."); - case ir_lod: - unreachable("LOD is not valid for vertex shaders."); - default: - unreachable("Unrecognized tex op"); - } - - vec4_instruction *inst = new(mem_ctx) vec4_instruction( - opcode, dst_reg(this, ir->type)); - + uint32_t constant_offset = 0; if (ir->offset != NULL && !has_nonconstant_offset) { - inst->offset = + constant_offset = brw_texture_offset(ir->offset->as_constant()->value.i, ir->offset->type->vector_elements); } /* Stuff the channel select bits in the top of the texture offset */ if (ir->op == ir_tg4) - inst->offset |= gather_channel(ir, sampler) << 16; - - /* The message header is necessary for: - * - Gen4 (always) - * - Gen9+ for selecting SIMD4x2 - * - Texel offsets - * - Gather channel selection - * - Sampler indices too large to fit in a 4-bit value. - */ - inst->header_size = - (devinfo->gen < 5 || devinfo->gen >= 9 || - inst->offset != 0 || ir->op == ir_tg4 || - is_high_sampler(devinfo, sampler_reg)) ? 1 : 0; - inst->base_mrf = 2; - inst->mlen = inst->header_size + 1; /* always at least one */ - inst->dst.writemask = WRITEMASK_XYZW; - inst->shadow_compare = ir->shadow_comparitor != NULL; - - inst->src[1] = sampler_reg; - - /* MRF for the first parameter */ - int param_base = inst->base_mrf + inst->header_size; - - if (ir->op == ir_txs || ir->op == ir_query_levels) { - int writemask = devinfo->gen == 4 ? 
WRITEMASK_W : WRITEMASK_X; - emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod)); - } else { - /* Load the coordinate */ - /* FINISHME: gl_clamp_mask and saturate */ - int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1; - int zero_mask = 0xf & ~coord_mask; - - emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask), - coordinate)); - - if (zero_mask != 0) { - emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask), - src_reg(0))); - } - /* Load the shadow comparitor */ - if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) { - emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type, - WRITEMASK_X), - shadow_comparitor)); - inst->mlen++; - } + constant_offset |= + gather_channel( ir->lod_info.component->as_constant()->value.i[0], + sampler) << 16; - /* Load the LOD info */ - if (ir->op == ir_tex || ir->op == ir_txl) { - int mrf, writemask; - if (devinfo->gen >= 5) { - mrf = param_base + 1; - if (ir->shadow_comparitor) { - writemask = WRITEMASK_Y; - /* mlen already incremented */ - } else { - writemask = WRITEMASK_X; - inst->mlen++; - } - } else /* devinfo->gen == 4 */ { - mrf = param_base; - writemask = WRITEMASK_W; - } - emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod)); - } else if (ir->op == ir_txf) { - emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod)); - } else if (ir->op == ir_txf_ms) { - emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X), - sample_index)); - if (devinfo->gen >= 7) { - /* MCS data is in the first channel of `mcs`, but we need to get it into - * the .y channel of the second vec4 of params, so replicate .x across - * the whole vec4 and then mask off everything except .y - */ - mcs.swizzle = BRW_SWIZZLE_XXXX; - emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y), - mcs)); - } - inst->mlen++; - } else if (ir->op == ir_txd) { - const glsl_type *type = lod_type; + glsl_type const *type = ir->sampler->type; + bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE && + type->sampler_array; - if (devinfo->gen >= 5) { - dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y); - dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y); - emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx)); - emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy)); - inst->mlen++; - - if (ir->type->vector_elements == 3 || ir->shadow_comparitor) { - dPdx.swizzle = BRW_SWIZZLE_ZZZZ; - dPdy.swizzle = BRW_SWIZZLE_ZZZZ; - emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx)); - emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy)); - inst->mlen++; - - if (ir->shadow_comparitor) { - emit(MOV(dst_reg(MRF, param_base + 2, - ir->shadow_comparitor->type, WRITEMASK_Z), - shadow_comparitor)); - } - } - } else /* devinfo->gen == 4 */ { - emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx)); - emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy)); - inst->mlen += 2; - } - } else if (ir->op == ir_tg4 && has_nonconstant_offset) { - if (ir->shadow_comparitor) { - emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W), - shadow_comparitor)); - } - - emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY), - offset_value)); - inst->mlen++; - } - } - - emit(inst); - - /* fixup num layers (z) for cube arrays: hardware returns faces * layers; - * spec requires layers. 
- */ - if (ir->op == ir_txs) { - glsl_type const *type = ir->sampler->type; - if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE && - type->sampler_array) { - emit_math(SHADER_OPCODE_INT_QUOTIENT, - writemask(inst->dst, WRITEMASK_Z), - src_reg(inst->dst), src_reg(6)); - } - } - - if (devinfo->gen == 6 && ir->op == ir_tg4) { - emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst); - } + this->result = src_reg(this, ir->type); + dst_reg dest = dst_reg(this->result); - swizzle_result(ir, src_reg(inst->dst), sampler); + emit_texture(ir->op, dest, ir->type, coordinate, coord_components, + shadow_comparitor, + lod, lod2, sample_index, + constant_offset, offset_value, + mcs, is_cube_array, sampler, sampler_reg); } /** @@ -2835,10 +2890,9 @@ vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst) * Set up the gather channel based on the swizzle, for gather4. */ uint32_t -vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler) +vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler) { - ir_constant *chan = ir->lod_info.component->as_constant(); - int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]); + int swiz = GET_SWZ(key->tex.swizzles[sampler], gather_component); switch (swiz) { case SWIZZLE_X: return 0; case SWIZZLE_Y: @@ -2856,22 +2910,23 @@ vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler) } void -vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler) +vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest, + src_reg orig_val, uint32_t sampler, + const glsl_type *dest_type) { int s = key->tex.swizzles[sampler]; - this->result = src_reg(this, ir->type); - dst_reg swizzled_result(this->result); + dst_reg swizzled_result = dest; - if (ir->op == ir_query_levels) { + if (op == ir_query_levels) { /* # levels is in .w */ orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W); emit(MOV(swizzled_result, orig_val)); return; } - if (ir->op == ir_txs || ir->type == glsl_type::float_type - || s == SWIZZLE_NOOP || ir->op == ir_tg4) { + if (op == ir_txs || dest_type == glsl_type::float_type + || s == SWIZZLE_NOOP || op == ir_tg4) { emit(MOV(swizzled_result, orig_val)); return; } @@ -2954,12 +3009,25 @@ vec4_visitor::visit(ir_if *ir) } void +vec4_visitor::gs_emit_vertex(int stream_id) +{ + unreachable("not reached"); +} + +void vec4_visitor::visit(ir_emit_vertex *) { unreachable("not reached"); } void +vec4_visitor::gs_end_primitive() +{ + unreachable("not reached"); +} + + +void vec4_visitor::visit(ir_end_primitive *) { unreachable("not reached"); @@ -3094,6 +3162,7 @@ vec4_visitor::emit_psiz_and_flags(dst_reg reg) vec4_instruction *inst; inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6))); inst->predicate = BRW_PREDICATE_NORMAL; + output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F; inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f))); inst->predicate = BRW_PREDICATE_NORMAL; } @@ -3106,18 +3175,23 @@ vec4_visitor::emit_psiz_and_flags(dst_reg reg) if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) { dst_reg reg_w = reg; reg_w.writemask = WRITEMASK_W; - emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ]))); + src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]); + reg_as_src.type = reg_w.type; + reg_as_src.swizzle = brw_swizzle_for_size(1); + emit(MOV(reg_w, reg_as_src)); } if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) { dst_reg reg_y = reg; reg_y.writemask = WRITEMASK_Y; reg_y.type = BRW_REGISTER_TYPE_D; + 
output_reg[VARYING_SLOT_LAYER].type = reg_y.type; emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER]))); } if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) { dst_reg reg_z = reg; reg_z.writemask = WRITEMASK_Z; reg_z.type = BRW_REGISTER_TYPE_D; + output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type; emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT]))); } } @@ -3155,8 +3229,8 @@ vec4_visitor::emit_clip_distances(dst_reg reg, int offset) vec4_instruction * vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying) { - assert (varying < VARYING_SLOT_MAX); - reg.type = output_reg[varying].type; + assert(varying < VARYING_SLOT_MAX); + assert(output_reg[varying].type == reg.type); current_annotation = output_reg_annotation[varying]; /* Copy the register, saturating if necessary */ return emit(MOV(reg, src_reg(output_reg[varying]))); @@ -3166,6 +3240,7 @@ void vec4_visitor::emit_urb_slot(dst_reg reg, int varying) { reg.type = BRW_REGISTER_TYPE_F; + output_reg[varying].type = reg.type; switch (varying) { case VARYING_SLOT_PSIZ: @@ -3422,7 +3497,8 @@ vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst, dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), inst->dst.writemask)); vec4_instruction *write = SCRATCH_WRITE(dst, temp, index); - write->predicate = inst->predicate; + if (inst->opcode != BRW_OPCODE_SEL) + write->predicate = inst->predicate; write->ir = inst->ir; write->annotation = inst->annotation; inst->insert_after(block, write); @@ -3485,16 +3561,16 @@ vec4_visitor::move_grf_array_access_to_scratch() foreach_block_and_inst(block, vec4_instruction, inst, cfg) { if (inst->dst.file == GRF && inst->dst.reladdr) { if (scratch_loc[inst->dst.reg] == -1) { - scratch_loc[inst->dst.reg] = c->last_scratch; - c->last_scratch += this->alloc.sizes[inst->dst.reg]; + scratch_loc[inst->dst.reg] = last_scratch; + last_scratch += this->alloc.sizes[inst->dst.reg]; } for (src_reg *iter = inst->dst.reladdr; iter->reladdr; iter = iter->reladdr) { if (iter->file == GRF && scratch_loc[iter->reg] == -1) { - scratch_loc[iter->reg] = c->last_scratch; - c->last_scratch += this->alloc.sizes[iter->reg]; + scratch_loc[iter->reg] = last_scratch; + last_scratch += this->alloc.sizes[iter->reg]; } } } @@ -3504,8 +3580,8 @@ vec4_visitor::move_grf_array_access_to_scratch() iter->reladdr; iter = iter->reladdr) { if (iter->file == GRF && scratch_loc[iter->reg] == -1) { - scratch_loc[iter->reg] = c->last_scratch; - c->last_scratch += this->alloc.sizes[iter->reg]; + scratch_loc[iter->reg] = last_scratch; + last_scratch += this->alloc.sizes[iter->reg]; } } } @@ -3679,7 +3755,7 @@ vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg) } vec4_visitor::vec4_visitor(const struct brw_compiler *compiler, - struct brw_vec4_compile *c, + void *log_data, struct gl_program *prog, const struct brw_vue_prog_key *key, struct brw_vue_prog_data *prog_data, @@ -3688,9 +3764,8 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler, void *mem_ctx, bool no_spills, int shader_time_index) - : backend_shader(compiler, NULL, mem_ctx, + : backend_shader(compiler, log_data, mem_ctx, shader_prog, prog, &prog_data->base, stage), - c(c), key(key), prog_data(prog_data), sanity_param_count(0), @@ -3698,7 +3773,8 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler, first_non_payload_grf(0), need_all_constants_in_pull_buffer(false), no_spills(no_spills), - shader_time_index(shader_time_index) + shader_time_index(shader_time_index), + last_scratch(0) { this->failed = false; 
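The lazy scratch-slot assignment in move_grf_array_access_to_scratch() above (scratch_loc[] entries start at -1 and last_scratch is bumped by the register's allocation size the first time a register needs scratch space) is a plain bump allocator. A minimal sketch of that pattern, with assign_scratch(), NUM_REGS and the sizes[] table as invented names and example values rather than actual driver state:

#include <stdio.h>

#define NUM_REGS 8

/* Mirror of the scratch_loc[] / last_scratch bookkeeping above: a register
 * receives a scratch offset the first time it is seen, sized by its
 * allocation size in vec4 slots; later lookups reuse the same offset. */
static int scratch_loc[NUM_REGS];
static int last_scratch;

static int assign_scratch(int reg, const int *sizes)
{
   if (scratch_loc[reg] == -1) {
      scratch_loc[reg] = last_scratch;
      last_scratch += sizes[reg];
   }
   return scratch_loc[reg];
}

int main(void)
{
   const int sizes[NUM_REGS] = { 1, 4, 2, 1, 1, 1, 1, 1 };

   for (int i = 0; i < NUM_REGS; i++)
      scratch_loc[i] = -1;

   /* Registers 1 and 2 are accessed indirectly and get spilled to scratch. */
   printf("reg 1 -> offset %d\n", assign_scratch(1, sizes)); /* 0 */
   printf("reg 2 -> offset %d\n", assign_scratch(2, sizes)); /* 4 */
   printf("reg 1 -> offset %d\n", assign_scratch(1, sizes)); /* still 0 */
   printf("total scratch: %d vec4 slots\n", last_scratch);   /* 6 */
   return 0;
}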
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp index dcbd2405078..d1a72d787e7 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp @@ -394,8 +394,7 @@ vec4_vs_visitor::emit_program_code() * pull constants. Do that now. */ if (this->need_all_constants_in_pull_buffer) { - const struct gl_program_parameter_list *params = - vs_compile->vp->program.Base.Parameters; + const struct gl_program_parameter_list *params = vp->Base.Parameters; unsigned i; for (i = 0; i < params->NumParameters * 4; i++) { stage_prog_data->pull_param[i] = @@ -415,8 +414,7 @@ vec4_vs_visitor::setup_vp_regs() vp_temp_regs[i] = src_reg(this, glsl_type::vec4_type); /* PROGRAM_STATE_VAR etc. */ - struct gl_program_parameter_list *plist = - vs_compile->vp->program.Base.Parameters; + struct gl_program_parameter_list *plist = vp->Base.Parameters; for (unsigned p = 0; p < plist->NumParameters; p++) { unsigned components = plist->Parameters[p].Size; @@ -486,8 +484,7 @@ vec4_vs_visitor::get_vp_dst_reg(const prog_dst_register &dst) src_reg vec4_vs_visitor::get_vp_src_reg(const prog_src_register &src) { - struct gl_program_parameter_list *plist = - vs_compile->vp->program.Base.Parameters; + struct gl_program_parameter_list *plist = vp->Base.Parameters; src_reg result; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp index f93062b46d0..620f652d6dc 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp @@ -36,7 +36,7 @@ vec4_vs_visitor::emit_prolog() for (int i = 0; i < VERT_ATTRIB_MAX; i++) { if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) { - uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i]; + uint8_t wa_flags = key->gl_attrib_wa_flags[i]; dst_reg reg(ATTR, i); dst_reg reg_d = reg; reg_d.type = BRW_REGISTER_TYPE_D; @@ -143,7 +143,8 @@ vec4_vs_visitor::emit_prolog() dst_reg * -vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir) +vec4_vs_visitor::make_reg_for_system_value(int location, + const glsl_type *type) { /* VertexID is stored by the VF as the last vertex element, but * we don't represent it with a flag in inputs_read, so we call @@ -151,7 +152,7 @@ vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir) */ dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX); - switch (ir->data.location) { + switch (location) { case SYSTEM_VALUE_BASE_VERTEX: reg->writemask = WRITEMASK_X; vs_prog_data->uses_vertexid = true; @@ -212,19 +213,22 @@ vec4_vs_visitor::emit_thread_end() vec4_vs_visitor::vec4_vs_visitor(const struct brw_compiler *compiler, - struct brw_vs_compile *vs_compile, + void *log_data, + const struct brw_vs_prog_key *key, struct brw_vs_prog_data *vs_prog_data, + struct gl_vertex_program *vp, struct gl_shader_program *prog, void *mem_ctx, int shader_time_index, bool use_legacy_snorm_formula) - : vec4_visitor(compiler, &vs_compile->base, &vs_compile->vp->program.Base, - &vs_compile->key.base, &vs_prog_data->base, prog, + : vec4_visitor(compiler, log_data, + &vp->Base, &key->base, &vs_prog_data->base, prog, MESA_SHADER_VERTEX, mem_ctx, false /* no_spills */, shader_time_index), - vs_compile(vs_compile), + key(key), vs_prog_data(vs_prog_data), + vp(vp), use_legacy_snorm_formula(use_legacy_snorm_formula) { } diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index 6e9848fb1e9..c53cb49b612 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ 
b/src/mesa/drivers/dri/i965/brw_vs.c @@ -94,7 +94,6 @@ brw_codegen_vs_prog(struct brw_context *brw, { GLuint program_size; const GLuint *program; - struct brw_vs_compile c; struct brw_vs_prog_data prog_data; struct brw_stage_prog_data *stage_prog_data = &prog_data.base.base; void *mem_ctx; @@ -104,8 +103,6 @@ brw_codegen_vs_prog(struct brw_context *brw, if (prog) vs = prog->_LinkedShaders[MESA_SHADER_VERTEX]; - memset(&c, 0, sizeof(c)); - memcpy(&c.key, key, sizeof(*key)); memset(&prog_data, 0, sizeof(prog_data)); /* Use ALT floating point mode for ARB programs so that 0^0 == 1. */ @@ -114,8 +111,6 @@ brw_codegen_vs_prog(struct brw_context *brw, mem_ctx = ralloc_context(NULL); - c.vp = vp; - /* Allocate the references to the uniforms that will end up in the * prog_data associated with the compiled program, and which will be freed * by the state cache. @@ -126,26 +121,30 @@ brw_codegen_vs_prog(struct brw_context *brw, * case being a float value that gets blown up to a vec4, so be * conservative here. */ - param_count = vs->num_uniform_components * 4; - + param_count = vs->num_uniform_components * 4 + + vs->NumImages * BRW_IMAGE_PARAM_SIZE; + stage_prog_data->nr_image_params = vs->NumImages; } else { param_count = vp->program.Base.Parameters->NumParameters * 4; } /* vec4_visitor::setup_uniform_clipplane_values() also uploads user clip * planes as uniforms. */ - param_count += c.key.base.nr_userclip_plane_consts * 4; + param_count += key->base.nr_userclip_plane_consts * 4; stage_prog_data->param = rzalloc_array(NULL, const gl_constant_value *, param_count); stage_prog_data->pull_param = rzalloc_array(NULL, const gl_constant_value *, param_count); + stage_prog_data->image_param = + rzalloc_array(NULL, struct brw_image_param, + stage_prog_data->nr_image_params); stage_prog_data->nr_params = param_count; GLbitfield64 outputs_written = vp->program.Base.OutputsWritten; prog_data.inputs_read = vp->program.Base.InputsRead; - if (c.key.copy_edgeflag) { + if (key->copy_edgeflag) { outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE); prog_data.inputs_read |= VERT_BIT_EDGEFLAG; } @@ -158,7 +157,7 @@ brw_codegen_vs_prog(struct brw_context *brw, * coords, which would be a pain to handle. */ for (i = 0; i < 8; i++) { - if (c.key.point_coord_replace & (1 << i)) + if (key->point_coord_replace & (1 << i)) outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i); } @@ -173,7 +172,7 @@ brw_codegen_vs_prog(struct brw_context *brw, * distance varying slots whenever clipping is enabled, even if the vertex * shader doesn't write to gl_ClipDistance. */ - if (c.key.base.userclip_active) { + if (key->base.userclip_active) { outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0); outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1); } @@ -182,34 +181,28 @@ brw_codegen_vs_prog(struct brw_context *brw, &prog_data.base.vue_map, outputs_written); if (0) { - _mesa_fprint_program_opt(stderr, &c.vp->program.Base, PROG_PRINT_DEBUG, + _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG, true); } /* Emit GEN4 code. */ - program = brw_vs_emit(brw, prog, &c, &prog_data, mem_ctx, &program_size); + program = brw_vs_emit(brw, mem_ctx, key, &prog_data, + &vp->program, prog, &program_size); if (program == NULL) { ralloc_free(mem_ctx); return false; } /* Scratch space is used for register spilling */ - if (c.base.last_scratch) { - perf_debug("Vertex shader triggered register spilling. 
" - "Try reducing the number of live vec4 values to " - "improve performance.\n"); - - prog_data.base.base.total_scratch - = brw_get_scratch_size(c.base.last_scratch*REG_SIZE); - + if (prog_data.base.base.total_scratch) { brw_get_scratch_bo(brw, &brw->vs.base.scratch_bo, prog_data.base.base.total_scratch * brw->max_vs_threads); } brw_upload_cache(&brw->cache, BRW_CACHE_VS_PROG, - &c.key, sizeof(c.key), + key, sizeof(struct brw_vs_prog_key), program, program_size, &prog_data, sizeof(prog_data), &brw->vs.base.prog_offset, &brw->vs.prog_data); diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h index 61f9b006a58..1d9bee11c56 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.h +++ b/src/mesa/drivers/dri/i965/brw_vs.h @@ -50,22 +50,16 @@ #define BRW_ATTRIB_WA_SIGN 32 /* interpret as signed in shader */ #define BRW_ATTRIB_WA_SCALE 64 /* interpret as scaled in shader */ -struct brw_vs_compile { - struct brw_vec4_compile base; - struct brw_vs_prog_key key; - - struct brw_vertex_program *vp; -}; - #ifdef __cplusplus extern "C" { #endif const unsigned *brw_vs_emit(struct brw_context *brw, - struct gl_shader_program *prog, - struct brw_vs_compile *c, - struct brw_vs_prog_data *prog_data, void *mem_ctx, + const struct brw_vs_prog_key *key, + struct brw_vs_prog_data *prog_data, + struct gl_vertex_program *vp, + struct gl_shader_program *shader_prog, unsigned *program_size); void brw_vs_debug_recompile(struct brw_context *brw, struct gl_shader_program *prog, @@ -91,15 +85,18 @@ class vec4_vs_visitor : public vec4_visitor { public: vec4_vs_visitor(const struct brw_compiler *compiler, - struct brw_vs_compile *vs_compile, + void *log_data, + const struct brw_vs_prog_key *key, struct brw_vs_prog_data *vs_prog_data, + struct gl_vertex_program *vp, struct gl_shader_program *prog, void *mem_ctx, int shader_time_index, bool use_legacy_snorm_formula); protected: - virtual dst_reg *make_reg_for_system_value(ir_variable *ir); + virtual dst_reg *make_reg_for_system_value(int location, + const glsl_type *type); virtual void setup_payload(); virtual void emit_prolog(); virtual void emit_program_code(); @@ -113,8 +110,9 @@ private: dst_reg get_vp_dst_reg(const prog_dst_register &dst); src_reg get_vp_src_reg(const prog_src_register &src); - struct brw_vs_compile * const vs_compile; + const struct brw_vs_prog_key *const key; struct brw_vs_prog_data * const vs_prog_data; + struct gl_vertex_program *const vp; src_reg *vp_temp_regs; src_reg vp_addr_reg; diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c index b2f91bd412b..72e37d4b467 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c @@ -191,3 +191,28 @@ const struct brw_tracked_state brw_vs_abo_surfaces = { }, .emit = brw_upload_vs_abo_surfaces, }; + +static void +brw_upload_vs_image_surfaces(struct brw_context *brw) +{ + struct gl_context *ctx = &brw->ctx; + /* BRW_NEW_VERTEX_PROGRAM */ + struct gl_shader_program *prog = + ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX]; + + if (prog) { + /* BRW_NEW_VS_PROG_DATA, BRW_NEW_IMAGE_UNITS */ + brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_VERTEX], + &brw->vs.base, &brw->vs.prog_data->base.base); + } +} + +const struct brw_tracked_state brw_vs_image_surfaces = { + .dirty = { + .brw = BRW_NEW_BATCH | + BRW_NEW_IMAGE_UNITS | + BRW_NEW_VERTEX_PROGRAM | + BRW_NEW_VS_PROG_DATA, + }, + .emit = brw_upload_vs_image_surfaces, +}; diff --git 
a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index 4619ce1080d..41266f57560 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -1,34 +1,28 @@ /* - Copyright (C) Intel Corp. 2006. All Rights Reserved. - Intel funded Tungsten Graphics to - develop this 3D driver. - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice (including the - next paragraph) shall be included in all copies or substantial - portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE - LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - **********************************************************************/ - /* - * Authors: - * Keith Whitwell <[email protected]> - */ - + * Copyright (C) Intel Corp. 2006. All Rights Reserved. + * Intel funded Tungsten Graphics to + * develop this 3D driver. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ #include "brw_context.h" #include "brw_wm.h" #include "brw_state.h" @@ -181,9 +175,12 @@ brw_codegen_wm_prog(struct brw_context *brw, * so the shader definitely kills pixels. */ prog_data.uses_kill = fp->program.UsesKill || key->alpha_test_func; - + prog_data.uses_omask = + fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK); prog_data.computed_depth_mode = computed_depth_mode(&fp->program); + prog_data.early_fragment_tests = fs && fs->EarlyFragmentTests; + /* Use ALT floating point mode for ARB programs so that 0^0 == 1. 
*/ if (!prog) prog_data.base.use_alt_mode = true; @@ -194,7 +191,9 @@ brw_codegen_wm_prog(struct brw_context *brw, */ int param_count; if (fs) { - param_count = fs->num_uniform_components; + param_count = fs->num_uniform_components + + fs->NumImages * BRW_IMAGE_PARAM_SIZE; + prog_data.base.nr_image_params = fs->NumImages; } else { param_count = fp->program.Base.Parameters->NumParameters * 4; } @@ -204,6 +203,9 @@ brw_codegen_wm_prog(struct brw_context *brw, rzalloc_array(NULL, const gl_constant_value *, param_count); prog_data.base.pull_param = rzalloc_array(NULL, const gl_constant_value *, param_count); + prog_data.base.image_param = + rzalloc_array(NULL, struct brw_image_param, + prog_data.base.nr_image_params); prog_data.base.nr_params = param_count; prog_data.barycentric_interp_modes = @@ -349,13 +351,15 @@ static uint8_t gen6_gather_workaround(GLenum internalformat) { switch (internalformat) { - case GL_R8I: return WA_SIGN | WA_8BIT; - case GL_R8UI: return WA_8BIT; - case GL_R16I: return WA_SIGN | WA_16BIT; - case GL_R16UI: return WA_16BIT; - /* note that even though GL_R32I and GL_R32UI have format overrides - * in the surface state, there is no shader w/a required */ - default: return 0; + case GL_R8I: return WA_SIGN | WA_8BIT; + case GL_R8UI: return WA_8BIT; + case GL_R16I: return WA_SIGN | WA_16BIT; + case GL_R16UI: return WA_16BIT; + default: + /* Note that even though GL_R32I and GL_R32UI have format overrides in + * the surface state, there is no shader w/a required. + */ + return 0; } } @@ -402,8 +406,9 @@ brw_populate_sampler_prog_key_data(struct gl_context *ctx, key->gl_clamp_mask[2] |= 1 << s; } - /* gather4's channel select for green from RG32F is broken; - * requires a shader w/a on IVB; fixable with just SCS on HSW. */ + /* gather4's channel select for green from RG32F is broken; requires + * a shader w/a on IVB; fixable with just SCS on HSW. + */ if (brw->gen == 7 && !brw->is_haswell && prog->UsesGather) { if (img->InternalFormat == GL_RG32F) key->gather_channel_quirk_mask |= 1 << s; @@ -452,13 +457,13 @@ brw_wm_state_dirty (struct brw_context *brw) BRW_NEW_VUE_MAP_GEOM_OUT); } -static void brw_wm_populate_key( struct brw_context *brw, - struct brw_wm_prog_key *key ) +static void +brw_wm_populate_key(struct brw_context *brw, struct brw_wm_prog_key *key) { struct gl_context *ctx = &brw->ctx; /* BRW_NEW_FRAGMENT_PROGRAM */ const struct brw_fragment_program *fp = - (struct brw_fragment_program *)brw->fragment_program; + (struct brw_fragment_program *) brw->fragment_program; const struct gl_program *prog = (struct gl_program *) brw->fragment_program; GLuint lookup = 0; GLuint line_aa; @@ -604,7 +609,8 @@ static void brw_wm_populate_key( struct brw_context *brw, * like GL requires. Fix that by building the alpha test into the * shader, and we'll skip enabling the fixed function alpha test. 
*/ - if (brw->gen < 6 && ctx->DrawBuffer->_NumColorDrawBuffers > 1 && ctx->Color.AlphaEnabled) { + if (brw->gen < 6 && ctx->DrawBuffer->_NumColorDrawBuffers > 1 && + ctx->Color.AlphaEnabled) { key->alpha_test_func = ctx->Color.AlphaFunc; key->alpha_test_ref = ctx->Color.AlphaRef; } diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index 72aad96bb6a..f13a97ce2b0 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -1024,6 +1024,257 @@ const struct brw_tracked_state brw_cs_abo_surfaces = { .emit = brw_upload_cs_abo_surfaces, }; +static void +brw_upload_cs_image_surfaces(struct brw_context *brw) +{ + struct gl_context *ctx = &brw->ctx; + /* _NEW_PROGRAM */ + struct gl_shader_program *prog = + ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE]; + + if (prog) { + /* BRW_NEW_CS_PROG_DATA, BRW_NEW_IMAGE_UNITS */ + brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_COMPUTE], + &brw->cs.base, &brw->cs.prog_data->base); + } +} + +const struct brw_tracked_state brw_cs_image_surfaces = { + .dirty = { + .mesa = _NEW_PROGRAM, + .brw = BRW_NEW_BATCH | + BRW_NEW_CS_PROG_DATA | + BRW_NEW_IMAGE_UNITS + }, + .emit = brw_upload_cs_image_surfaces, +}; + +static uint32_t +get_image_format(struct brw_context *brw, mesa_format format, GLenum access) +{ + if (access == GL_WRITE_ONLY) { + return brw_format_for_mesa_format(format); + } else { + /* Typed surface reads support a very limited subset of the shader + * image formats. Translate it into the closest format the + * hardware supports. + */ + if ((_mesa_get_format_bytes(format) >= 16 && brw->gen <= 8) || + (_mesa_get_format_bytes(format) >= 8 && + (brw->gen == 7 && !brw->is_haswell))) + return BRW_SURFACEFORMAT_RAW; + else + return brw_format_for_mesa_format( + brw_lower_mesa_image_format(brw->intelScreen->devinfo, format)); + } +} + +static void +update_default_image_param(struct brw_context *brw, + struct gl_image_unit *u, + unsigned surface_idx, + struct brw_image_param *param) +{ + memset(param, 0, sizeof(*param)); + param->surface_idx = surface_idx; + /* Set the swizzling shifts to all-ones to effectively disable swizzling -- + * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more + * detailed explanation of these parameters. + */ + param->swizzling[0] = 0xff; + param->swizzling[1] = 0xff; +} + +static void +update_buffer_image_param(struct brw_context *brw, + struct gl_image_unit *u, + unsigned surface_idx, + struct brw_image_param *param) +{ + struct gl_buffer_object *obj = u->TexObj->BufferObject; + + update_default_image_param(brw, u, surface_idx, param); + + param->size[0] = obj->Size / _mesa_get_format_bytes(u->_ActualFormat); + param->stride[0] = _mesa_get_format_bytes(u->_ActualFormat); +} + +static void +update_texture_image_param(struct brw_context *brw, + struct gl_image_unit *u, + unsigned surface_idx, + struct brw_image_param *param) +{ + struct intel_mipmap_tree *mt = intel_texture_object(u->TexObj)->mt; + + update_default_image_param(brw, u, surface_idx, param); + + param->size[0] = minify(mt->logical_width0, u->Level); + param->size[1] = minify(mt->logical_height0, u->Level); + param->size[2] = (!u->Layered ? 1 : + u->TexObj->Target == GL_TEXTURE_CUBE_MAP ? 6 : + u->TexObj->Target == GL_TEXTURE_3D ? 
+ minify(mt->logical_depth0, u->Level) : + mt->logical_depth0); + + intel_miptree_get_image_offset(mt, u->Level, u->Layer, + ¶m->offset[0], + ¶m->offset[1]); + + param->stride[0] = mt->cpp; + param->stride[1] = mt->pitch / mt->cpp; + param->stride[2] = + brw_miptree_get_horizontal_slice_pitch(brw, mt, u->Level); + param->stride[3] = + brw_miptree_get_vertical_slice_pitch(brw, mt, u->Level); + + if (mt->tiling == I915_TILING_X) { + /* An X tile is a rectangular block of 512x8 bytes. */ + param->tiling[0] = _mesa_logbase2(512 / mt->cpp); + param->tiling[1] = _mesa_logbase2(8); + + if (brw->has_swizzling) { + /* Right shifts required to swizzle bits 9 and 10 of the memory + * address with bit 6. + */ + param->swizzling[0] = 3; + param->swizzling[1] = 4; + } + } else if (mt->tiling == I915_TILING_Y) { + /* The layout of a Y-tiled surface in memory isn't really fundamentally + * different to the layout of an X-tiled surface, we simply pretend that + * the surface is broken up in a number of smaller 16Bx32 tiles, each + * one arranged in X-major order just like is the case for X-tiling. + */ + param->tiling[0] = _mesa_logbase2(16 / mt->cpp); + param->tiling[1] = _mesa_logbase2(32); + + if (brw->has_swizzling) { + /* Right shift required to swizzle bit 9 of the memory address with + * bit 6. + */ + param->swizzling[0] = 3; + } + } + + /* 3D textures are arranged in 2D in memory with 2^lod slices per row. The + * address calculation algorithm (emit_address_calculation() in + * brw_fs_surface_builder.cpp) handles this as a sort of tiling with + * modulus equal to the LOD. + */ + param->tiling[2] = (u->TexObj->Target == GL_TEXTURE_3D ? u->Level : + 0); +} + +static void +update_image_surface(struct brw_context *brw, + struct gl_image_unit *u, + GLenum access, + unsigned surface_idx, + uint32_t *surf_offset, + struct brw_image_param *param) +{ + if (u->_Valid) { + struct gl_texture_object *obj = u->TexObj; + const unsigned format = get_image_format(brw, u->_ActualFormat, access); + + if (obj->Target == GL_TEXTURE_BUFFER) { + struct intel_buffer_object *intel_obj = + intel_buffer_object(obj->BufferObject); + const unsigned texel_size = (format == BRW_SURFACEFORMAT_RAW ? 1 : + _mesa_get_format_bytes(u->_ActualFormat)); + + brw->vtbl.emit_buffer_surface_state( + brw, surf_offset, intel_obj->buffer, obj->BufferOffset, + format, intel_obj->Base.Size / texel_size, texel_size, + access != GL_READ_ONLY); + + update_buffer_image_param(brw, u, surface_idx, param); + + } else { + struct intel_texture_object *intel_obj = intel_texture_object(obj); + struct intel_mipmap_tree *mt = intel_obj->mt; + + if (format == BRW_SURFACEFORMAT_RAW) { + brw->vtbl.emit_buffer_surface_state( + brw, surf_offset, mt->bo, mt->offset, + format, mt->bo->size - mt->offset, 1 /* pitch */, + access != GL_READ_ONLY); + + } else { + const unsigned min_layer = obj->MinLayer + u->Layer; + const unsigned min_level = obj->MinLevel + u->Level; + const unsigned num_layers = (!u->Layered ? 1 : + obj->Target == GL_TEXTURE_CUBE_MAP ? 6 : + mt->logical_depth0); + const GLenum target = (obj->Target == GL_TEXTURE_CUBE_MAP || + obj->Target == GL_TEXTURE_CUBE_MAP_ARRAY ? 
+ GL_TEXTURE_2D_ARRAY : obj->Target); + + brw->vtbl.emit_texture_surface_state( + brw, mt, target, + min_layer, min_layer + num_layers, + min_level, min_level + 1, + format, SWIZZLE_XYZW, + surf_offset, access != GL_READ_ONLY, false); + } + + update_texture_image_param(brw, u, surface_idx, param); + } + + } else { + brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, surf_offset); + update_default_image_param(brw, u, surface_idx, param); + } +} + +void +brw_upload_image_surfaces(struct brw_context *brw, + struct gl_shader *shader, + struct brw_stage_state *stage_state, + struct brw_stage_prog_data *prog_data) +{ + struct gl_context *ctx = &brw->ctx; + + if (shader && shader->NumImages) { + for (unsigned i = 0; i < shader->NumImages; i++) { + struct gl_image_unit *u = &ctx->ImageUnits[shader->ImageUnits[i]]; + const unsigned surf_idx = prog_data->binding_table.image_start + i; + + update_image_surface(brw, u, shader->ImageAccess[i], + surf_idx, + &stage_state->surf_offset[surf_idx], + &prog_data->image_param[i]); + } + + brw->ctx.NewDriverState |= BRW_NEW_SURFACES; + } +} + +static void +brw_upload_wm_image_surfaces(struct brw_context *brw) +{ + struct gl_context *ctx = &brw->ctx; + /* BRW_NEW_FRAGMENT_PROGRAM */ + struct gl_shader_program *prog = ctx->Shader._CurrentFragmentProgram; + + if (prog) { + /* BRW_NEW_FS_PROG_DATA, BRW_NEW_IMAGE_UNITS */ + brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_FRAGMENT], + &brw->wm.base, &brw->wm.prog_data->base); + } +} + +const struct brw_tracked_state brw_wm_image_surfaces = { + .dirty = { + .brw = BRW_NEW_BATCH | + BRW_NEW_FRAGMENT_PROGRAM | + BRW_NEW_FS_PROG_DATA | + BRW_NEW_IMAGE_UNITS + }, + .emit = brw_upload_wm_image_surfaces, +}; + void gen4_init_vtable_surface_functions(struct brw_context *brw) { diff --git a/src/mesa/drivers/dri/i965/gen6_blorp.cpp b/src/mesa/drivers/dri/i965/gen6_blorp.cpp index b6a3d78d849..54c4a6dfdd8 100644 --- a/src/mesa/drivers/dri/i965/gen6_blorp.cpp +++ b/src/mesa/drivers/dri/i965/gen6_blorp.cpp @@ -821,7 +821,7 @@ gen6_blorp_emit_depth_stencil_config(struct brw_context *brw, /* 3DSTATE_DEPTH_BUFFER */ { - intel_emit_depth_stall_flushes(brw); + brw_emit_depth_stall_flushes(brw); BEGIN_BATCH(7); /* 3DSTATE_DEPTH_BUFFER dw0 */ @@ -896,7 +896,7 @@ static void gen6_blorp_emit_depth_disable(struct brw_context *brw, const brw_blorp_params *params) { - intel_emit_depth_stall_flushes(brw); + brw_emit_depth_stall_flushes(brw); BEGIN_BATCH(7); OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2)); @@ -1021,7 +1021,7 @@ gen6_blorp_exec(struct brw_context *brw, uint32_t prog_offset = params->get_wm_prog(brw, &prog_data); /* Emit workaround flushes when we switch from drawing to blorping. */ - intel_emit_post_sync_nonzero_flush(brw); + brw_emit_post_sync_nonzero_flush(brw); gen6_emit_3dstate_multisample(brw, params->dst.num_samples); gen6_emit_3dstate_sample_mask(brw, diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c index 2bfa271b527..3bab8f46ae8 100644 --- a/src/mesa/drivers/dri/i965/gen6_cc.c +++ b/src/mesa/drivers/dri/i965/gen6_cc.c @@ -51,7 +51,7 @@ gen6_upload_blend_state(struct brw_context *brw) * with render target 0, which will reference BLEND_STATE[0] for * alpha test enable. 
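update_texture_image_param() in the hunk above encodes tile geometry as log2 exponents (an X tile is a 512-byte by 8-row block; a Y tile is treated as 16-byte by 32-row sub-tiles in X-major order) plus the right-shift amounts that align address bits 9 and 10 with bit 6 when swizzling is enabled. The following is a small standalone sketch of just that arithmetic; the struct, enum, and logbase2() helper are local stand-ins for brw_image_param, the mipmap-tree tiling field, and _mesa_logbase2().

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Trimmed-down stand-in for the tiling/swizzling fields of brw_image_param. */
struct image_param {
   uint32_t tiling[3];
   uint32_t swizzling[2];
};

static unsigned
logbase2(unsigned n)              /* stand-in for _mesa_logbase2() */
{
   unsigned log = 0;
   while (n >>= 1)
      log++;
   return log;
}

enum tiling { TILING_NONE, TILING_X, TILING_Y };

static void
set_tile_info(struct image_param *p, enum tiling t, unsigned cpp,
              bool has_swizzling)
{
   p->swizzling[0] = p->swizzling[1] = 0xff;   /* all-ones disables swizzling */

   if (t == TILING_X) {
      /* X tile: 512 bytes wide, 8 rows tall, expressed in texels/rows. */
      p->tiling[0] = logbase2(512 / cpp);
      p->tiling[1] = logbase2(8);
      if (has_swizzling) {
         p->swizzling[0] = 3;   /* right shift aligning address bit 9 with bit 6 */
         p->swizzling[1] = 4;   /* right shift aligning address bit 10 with bit 6 */
      }
   } else if (t == TILING_Y) {
      /* Y tiling handled as 16-byte x 32-row sub-tiles in X-major order. */
      p->tiling[0] = logbase2(16 / cpp);
      p->tiling[1] = logbase2(32);
      if (has_swizzling)
         p->swizzling[0] = 3;
   }
}

int main(void)
{
   struct image_param p = {{0}};
   set_tile_info(&p, TILING_X, 4, true);          /* e.g. a 4-byte RGBA8 surface */
   assert(p.tiling[0] == 7 && p.tiling[1] == 3);  /* 128 texels x 8 rows */
   printf("X-tile exponents: %u x %u\n",
          (unsigned)p.tiling[0], (unsigned)p.tiling[1]);
   return 0;
}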
*/ - if (nr_draw_buffers == 0 && ctx->Color.AlphaEnabled) + if (nr_draw_buffers == 0) nr_draw_buffers = 1; size = sizeof(*blend) * nr_draw_buffers; @@ -97,8 +97,8 @@ gen6_upload_blend_state(struct brw_context *brw) rb_type != GL_UNSIGNED_NORMALIZED && rb_type != GL_FLOAT, "Ignoring %s logic op on %s " "renderbuffer\n", - _mesa_lookup_enum_by_nr(ctx->Color.LogicOp), - _mesa_lookup_enum_by_nr(rb_type)); + _mesa_enum_to_string(ctx->Color.LogicOp), + _mesa_enum_to_string(rb_type)); if (rb_type == GL_UNSIGNED_NORMALIZED) { blend[b].blend1.logic_op_enable = 1; blend[b].blend1.logic_op_func = diff --git a/src/mesa/drivers/dri/i965/gen6_depth_state.c b/src/mesa/drivers/dri/i965/gen6_depth_state.c index 1df0bd47571..febd4781100 100644 --- a/src/mesa/drivers/dri/i965/gen6_depth_state.c +++ b/src/mesa/drivers/dri/i965/gen6_depth_state.c @@ -65,7 +65,7 @@ gen6_emit_depth_stencil_hiz(struct brw_context *brw, */ bool enable_hiz_ss = hiz || separate_stencil; - intel_emit_depth_stall_flushes(brw); + brw_emit_depth_stall_flushes(brw); irb = intel_get_renderbuffer(fb, BUFFER_DEPTH); if (!irb) @@ -73,7 +73,7 @@ gen6_emit_depth_stencil_hiz(struct brw_context *brw, rb = (struct gl_renderbuffer*) irb; if (rb) { - depth = MAX2(rb->Depth, 1); + depth = MAX2(irb->layer_count, 1); if (rb->TexImage) gl_target = rb->TexImage->TexObject->Target; } @@ -89,6 +89,10 @@ gen6_emit_depth_stencil_hiz(struct brw_context *brw, surftype = BRW_SURFACE_2D; depth *= 6; break; + case GL_TEXTURE_3D: + assert(mt); + depth = MAX2(mt->logical_depth0, 1); + /* fallthrough */ default: surftype = translate_tex_target(gl_target); break; diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp index 782687aac57..68e443d38a5 100644 --- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp @@ -147,7 +147,12 @@ gen6_gs_visitor::emit_prolog() } void -gen6_gs_visitor::visit(ir_emit_vertex *) +gen6_gs_visitor::visit(ir_emit_vertex *ir) +{ + gs_emit_vertex(ir->stream_id()); +} +void +gen6_gs_visitor::gs_emit_vertex(int stream_id) { this->current_annotation = "gen6 emit vertex"; /* Honor max_vertex layout indication in geometry shader by ignoring any @@ -224,6 +229,12 @@ gen6_gs_visitor::visit(ir_emit_vertex *) void gen6_gs_visitor::visit(ir_end_primitive *) { + gs_end_primitive(); +} + +void +gen6_gs_visitor::gs_end_primitive() +{ this->current_annotation = "gen6 end primitive"; /* Calling EndPrimitive() is optional for point output. In this case we set * the PrimEnd flag when we process EmitVertex(). 
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h index 27254ebb727..4cf94893261 100644 --- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h +++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h @@ -36,12 +36,14 @@ class gen6_gs_visitor : public vec4_gs_visitor { public: gen6_gs_visitor(const struct brw_compiler *comp, + void *log_data, struct brw_gs_compile *c, struct gl_shader_program *prog, void *mem_ctx, bool no_spills, int shader_time_index) : - vec4_gs_visitor(comp, c, prog, mem_ctx, no_spills, shader_time_index) {} + vec4_gs_visitor(comp, log_data, c, prog, mem_ctx, no_spills, + shader_time_index) {} protected: virtual void assign_binding_table_offsets(); @@ -49,6 +51,8 @@ protected: virtual void emit_thread_end(); virtual void visit(ir_emit_vertex *); virtual void visit(ir_end_primitive *); + virtual void gs_emit_vertex(int stream_id); + virtual void gs_end_primitive(); virtual void emit_urb_write_header(int mrf); virtual void emit_urb_write_opcode(bool complete, int base_mrf, diff --git a/src/mesa/drivers/dri/i965/gen6_multisample_state.c b/src/mesa/drivers/dri/i965/gen6_multisample_state.c index 36734f598fe..8444c0c9bae 100644 --- a/src/mesa/drivers/dri/i965/gen6_multisample_state.c +++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c @@ -143,12 +143,11 @@ gen6_emit_3dstate_multisample(struct brw_context *brw, ADVANCE_BATCH(); } - unsigned gen6_determine_sample_mask(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; - float coverage = 1.0; + float coverage = 1.0f; float coverage_invert = false; unsigned sample_mask = ~0u; @@ -166,7 +165,7 @@ gen6_determine_sample_mask(struct brw_context *brw) } if (num_samples > 1) { - int coverage_int = (int) (num_samples * coverage + 0.5); + int coverage_int = (int) (num_samples * coverage + 0.5f); uint32_t coverage_bits = (1 << coverage_int) - 1; if (coverage_invert) coverage_bits ^= (1 << num_samples) - 1; @@ -176,7 +175,6 @@ gen6_determine_sample_mask(struct brw_context *brw) } } - /** * 3DSTATE_SAMPLE_MASK */ @@ -189,15 +187,14 @@ gen6_emit_3dstate_sample_mask(struct brw_context *brw, unsigned mask) ADVANCE_BATCH(); } - -static void upload_multisample_state(struct brw_context *brw) +static void +upload_multisample_state(struct brw_context *brw) { /* BRW_NEW_NUM_SAMPLES */ gen6_emit_3dstate_multisample(brw, brw->num_samples); gen6_emit_3dstate_sample_mask(brw, gen6_determine_sample_mask(brw)); } - const struct brw_tracked_state gen6_multisample_state = { .dirty = { .mesa = _NEW_MULTISAMPLE, diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c index ba5c944fb3d..9f4a5db3592 100644 --- a/src/mesa/drivers/dri/i965/gen6_queryobj.c +++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c @@ -86,7 +86,7 @@ static void write_primitives_generated(struct brw_context *brw, drm_intel_bo *query_bo, int stream, int idx) { - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); if (brw->gen >= 7 && stream > 0) { brw_store_register_mem64(brw, query_bo, @@ -100,7 +100,7 @@ static void write_xfb_primitives_written(struct brw_context *brw, drm_intel_bo *bo, int stream, int idx) { - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); if (brw->gen >= 7) { brw_store_register_mem64(brw, bo, GEN7_SO_NUM_PRIMS_WRITTEN(stream), idx); @@ -157,7 +157,7 @@ emit_pipeline_stat(struct brw_context *brw, drm_intel_bo *bo, /* Emit a flush to make sure various parts of the pipeline are complete and * we get an accurate value */ - 
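gen6_determine_sample_mask() in the gen6_multisample_state.c hunk above turns GL_SAMPLE_COVERAGE into a hardware sample mask by converting the coverage fraction into a count of enabled samples and optionally inverting the bits. A standalone sketch of that conversion follows; the GL context state is flattened into plain parameters and the application mask stands in for glSampleMaski()/SampleCoverage state.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mirror of the coverage -> sample-mask math in gen6_determine_sample_mask(),
 * with the ctx fields passed in as plain values.
 */
static uint32_t
coverage_to_sample_mask(unsigned num_samples, float coverage,
                        bool coverage_invert, uint32_t app_sample_mask)
{
   uint32_t sample_mask = app_sample_mask;

   if (num_samples > 1) {
      int coverage_int = (int)(num_samples * coverage + 0.5f);
      uint32_t coverage_bits = (1u << coverage_int) - 1;
      if (coverage_invert)
         coverage_bits ^= (1u << num_samples) - 1;
      sample_mask &= coverage_bits;
   }

   return sample_mask;
}

int main(void)
{
   /* 4x MSAA at 50% coverage enables the low two samples. */
   assert(coverage_to_sample_mask(4, 0.5f, false, ~0u) == 0x3);
   /* Inverting the coverage flips to the high two samples. */
   assert(coverage_to_sample_mask(4, 0.5f, true, ~0u) == 0xc);
   printf("sample masks computed\n");
   return 0;
}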
intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); brw_store_register_mem64(brw, bo, reg, idx); } diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c index b00517ed81e..4068f2844a2 100644 --- a/src/mesa/drivers/dri/i965/gen6_sf_state.c +++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c @@ -383,7 +383,7 @@ upload_sf_state(struct brw_context *brw) point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize); /* Clamp to the hardware limits and convert to fixed point */ - dw4 |= U_FIXED(CLAMP(point_size, 0.125, 255.875), 3); + dw4 |= U_FIXED(CLAMP(point_size, 0.125f, 255.875f), 3); /* * Window coordinates in an FBO are inverted, which means point diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c index be80d7bdfc5..3899ce9451f 100644 --- a/src/mesa/drivers/dri/i965/gen6_sol.c +++ b/src/mesa/drivers/dri/i965/gen6_sol.c @@ -292,5 +292,5 @@ brw_end_transform_feedback(struct gl_context *ctx, * simplicity, just do a full flush. */ struct brw_context *brw = brw_context(ctx); - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); } diff --git a/src/mesa/drivers/dri/i965/gen6_surface_state.c b/src/mesa/drivers/dri/i965/gen6_surface_state.c index 03e913a0a76..39de62f2304 100644 --- a/src/mesa/drivers/dri/i965/gen6_surface_state.c +++ b/src/mesa/drivers/dri/i965/gen6_surface_state.c @@ -88,7 +88,8 @@ gen6_update_renderbuffer_surface(struct brw_context *brw, break; } - const int min_array_element = layered ? 0 : irb->mt_layer; + const int min_array_element = irb->mt_layer; + assert(!layered || irb->mt_layer == 0); surf[0] = SET_FIELD(surftype, BRW_SURFACE_TYPE) | SET_FIELD(format, BRW_SURFACE_FORMAT); diff --git a/src/mesa/drivers/dri/i965/gen6_urb.c b/src/mesa/drivers/dri/i965/gen6_urb.c index 107a4f24fa6..c7311fd0b03 100644 --- a/src/mesa/drivers/dri/i965/gen6_urb.c +++ b/src/mesa/drivers/dri/i965/gen6_urb.c @@ -120,7 +120,7 @@ gen6_upload_urb( struct brw_context *brw ) * a workaround. 
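The gen6_sf_state.c change above clamps the point size to the hardware range and converts it to unsigned fixed point with three fractional bits: U_FIXED(CLAMP(point_size, 0.125f, 255.875f), 3). U_FIXED itself is not shown in this diff, so the sketch below uses a plausible definition (value * 2^n, truncated) purely to illustrate the 8.3 encoding; both macros here are local stand-ins.

#include <assert.h>
#include <stdio.h>

#define CLAMP(x, lo, hi)   ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))

/* Assumed shape of U_FIXED: unsigned fixed point with n fractional bits. */
#define U_FIXED(value, n)  ((unsigned)((value) * (1 << (n))))

int main(void)
{
   /* Hardware range is 0.125 .. 255.875 in steps of 1/8 (3 fractional bits). */
   assert(U_FIXED(CLAMP(0.05f,  0.125f, 255.875f), 3) == 1);    /* clamped up   */
   assert(U_FIXED(CLAMP(64.5f,  0.125f, 255.875f), 3) == 516);  /* 64.5 * 8     */
   assert(U_FIXED(CLAMP(1000.f, 0.125f, 255.875f), 3) == 2047); /* clamped down */
   printf("point-size fixed-point encoding checked\n");
   return 0;
}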
*/ if (brw->urb.gs_present && !gs_present) - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); brw->urb.gs_present = gs_present; } diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c index 7c8d8849f4e..11b9a360ced 100644 --- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c +++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c @@ -101,7 +101,7 @@ gen6_upload_sf_vp(struct brw_context *brw) } for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { - double scale[3], translate[3]; + float scale[3], translate[3]; /* _NEW_VIEWPORT */ _mesa_get_viewport_xform(ctx, i, scale, translate); diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp index 2bdc82bc895..9822dc1fe79 100644 --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp @@ -645,7 +645,7 @@ gen7_blorp_emit_depth_stencil_config(struct brw_context *brw, /* 3DSTATE_DEPTH_BUFFER */ { - intel_emit_depth_stall_flushes(brw); + brw_emit_depth_stall_flushes(brw); BEGIN_BATCH(7); OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2)); @@ -696,7 +696,7 @@ gen7_blorp_emit_depth_stencil_config(struct brw_context *brw, static void gen7_blorp_emit_depth_disable(struct brw_context *brw) { - intel_emit_depth_stall_flushes(brw); + brw_emit_depth_stall_flushes(brw); BEGIN_BATCH(7); OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2)); @@ -794,6 +794,8 @@ gen7_blorp_exec(struct brw_context *brw, } depthstencil_offset = gen6_blorp_emit_depth_stencil_state(brw, params); gen7_blorp_emit_depth_stencil_state_pointers(brw, depthstencil_offset); + if (brw->use_resource_streamer) + gen7_disable_hw_binding_tables(brw); if (params->use_wm_prog) { uint32_t wm_surf_offset_renderbuffer; uint32_t wm_surf_offset_texture = 0; diff --git a/src/mesa/drivers/dri/i965/gen7_disable.c b/src/mesa/drivers/dri/i965/gen7_disable.c index 2c43cd77f07..bb509696d72 100644 --- a/src/mesa/drivers/dri/i965/gen7_disable.c +++ b/src/mesa/drivers/dri/i965/gen7_disable.c @@ -52,7 +52,7 @@ disable_stages(struct brw_context *brw) BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_HS << 16 | (2 - 2)); - OUT_BATCH(0); + OUT_BATCH(brw->hw_bt_pool.next_offset); ADVANCE_BATCH(); /* Disable the TE */ @@ -85,7 +85,7 @@ disable_stages(struct brw_context *brw) BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_DS << 16 | (2 - 2)); - OUT_BATCH(0); + OUT_BATCH(brw->hw_bt_pool.next_offset); ADVANCE_BATCH(); } diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c b/src/mesa/drivers/dri/i965/gen7_gs_state.c index 8d6d3fe1d34..497ecec8e45 100644 --- a/src/mesa/drivers/dri/i965/gen7_gs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c @@ -59,7 +59,9 @@ upload_gs_state(struct brw_context *brw) OUT_BATCH(((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_GS_SAMPLER_COUNT_SHIFT) | ((brw->gs.prog_data->base.base.binding_table.size_bytes / 4) << - GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); + GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | + (brw->is_haswell && prog_data->base.nr_image_params ? 
+ HSW_GS_UAV_ACCESS_ENABLE : 0)); if (brw->gs.prog_data->base.base.total_scratch) { OUT_RELOC(stage_state->scratch_bo, diff --git a/src/mesa/drivers/dri/i965/gen7_misc_state.c b/src/mesa/drivers/dri/i965/gen7_misc_state.c index f4f665219d6..a14d4a0c50d 100644 --- a/src/mesa/drivers/dri/i965/gen7_misc_state.c +++ b/src/mesa/drivers/dri/i965/gen7_misc_state.c @@ -57,7 +57,7 @@ gen7_emit_depth_stencil_hiz(struct brw_context *brw, return; } - intel_emit_depth_stall_flushes(brw); + brw_emit_depth_stall_flushes(brw); irb = intel_get_renderbuffer(fb, BUFFER_DEPTH); if (!irb) diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c index 4fa46a8eb97..698b3d491bc 100644 --- a/src/mesa/drivers/dri/i965/gen7_sf_state.c +++ b/src/mesa/drivers/dri/i965/gen7_sf_state.c @@ -220,7 +220,7 @@ upload_sf_state(struct brw_context *brw) point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize); /* Clamp to the hardware limits and convert to fixed point */ - dw3 |= U_FIXED(CLAMP(point_size, 0.125, 255.875), 3); + dw3 |= U_FIXED(CLAMP(point_size, 0.125f, 255.875f), 3); /* _NEW_LIGHT */ if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) { diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c index aec4f44bb73..41573a80a52 100644 --- a/src/mesa/drivers/dri/i965/gen7_sol_state.c +++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c @@ -365,7 +365,7 @@ gen7_save_primitives_written_counters(struct brw_context *brw, } /* Flush any drawing so that the counters have the right values. */ - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); /* Emit MI_STORE_REGISTER_MEM commands to write the values. */ for (int i = 0; i < streams; i++) { @@ -502,7 +502,7 @@ gen7_pause_transform_feedback(struct gl_context *ctx, (struct brw_transform_feedback_object *) obj; /* Flush any drawing so that the counters have the right values. */ - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); /* Save the SOL buffer offset register values. 
*/ if (brw->gen < 8) { diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c index d371c193577..69162171c4e 100644 --- a/src/mesa/drivers/dri/i965/gen7_urb.c +++ b/src/mesa/drivers/dri/i965/gen7_urb.c @@ -228,7 +228,7 @@ gen7_upload_urb(struct brw_context *brw) remaining_space = total_wants; if (remaining_space > 0) { unsigned vs_additional = (unsigned) - round(vs_wants * (((double) remaining_space) / total_wants)); + roundf(vs_wants * (((float) remaining_space) / total_wants)); vs_chunks += vs_additional; remaining_space -= vs_additional; gs_chunks += remaining_space; diff --git a/src/mesa/drivers/dri/i965/gen7_viewport_state.c b/src/mesa/drivers/dri/i965/gen7_viewport_state.c index b655205ec35..c75dc9964bf 100644 --- a/src/mesa/drivers/dri/i965/gen7_viewport_state.c +++ b/src/mesa/drivers/dri/i965/gen7_viewport_state.c @@ -53,7 +53,7 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw) } for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { - double scale[3], translate[3]; + float scale[3], translate[3]; _mesa_get_viewport_xform(ctx, i, scale, translate); /* According to the "Vertex X,Y Clamping and Quantization" section of diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c index 4b17d06fa83..b7e48585482 100644 --- a/src/mesa/drivers/dri/i965/gen7_vs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c @@ -62,6 +62,7 @@ gen7_upload_constant_state(struct brw_context *brw, OUT_BATCH(active ? stage_state->push_const_size : 0); OUT_BATCH(0); } + /* Pointer to the constant buffer. Covered by the set of state flags * from gen6_prepare_wm_contants */ @@ -95,15 +96,14 @@ gen7_upload_constant_state(struct brw_context *brw, ADVANCE_BATCH(); - /* On SKL+ the new constants don't take effect until the next corresponding - * 3DSTATE_BINDING_TABLE_POINTER_* command is parsed so we need to ensure - * that is sent - */ + /* On SKL+ the new constants don't take effect until the next corresponding + * 3DSTATE_BINDING_TABLE_POINTER_* command is parsed so we need to ensure + * that is sent + */ if (brw->gen >= 9) brw->ctx.NewDriverState |= BRW_NEW_SURFACES; } - static void upload_vs_state(struct brw_context *brw) { @@ -111,6 +111,7 @@ upload_vs_state(struct brw_context *brw) uint32_t floating_point_mode = 0; const int max_threads_shift = brw->is_haswell ? HSW_VS_MAX_THREADS_SHIFT : GEN6_VS_MAX_THREADS_SHIFT; + const struct brw_vue_prog_data *prog_data = &brw->vs.prog_data->base; if (!brw->is_haswell && !brw->is_baytrail) gen7_emit_vs_workaround_flush(brw); @@ -125,19 +126,21 @@ upload_vs_state(struct brw_context *brw) ((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_VS_SAMPLER_COUNT_SHIFT) | ((brw->vs.prog_data->base.base.binding_table.size_bytes / 4) << - GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); + GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | + (brw->is_haswell && prog_data->base.nr_image_params ? 
+ HSW_VS_UAV_ACCESS_ENABLE : 0)); - if (brw->vs.prog_data->base.base.total_scratch) { + if (prog_data->base.total_scratch) { OUT_RELOC(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(brw->vs.prog_data->base.base.total_scratch) - 11); + ffs(prog_data->base.total_scratch) - 11); } else { OUT_BATCH(0); } - OUT_BATCH((brw->vs.prog_data->base.base.dispatch_grf_start_reg << + OUT_BATCH((prog_data->base.dispatch_grf_start_reg << GEN6_VS_DISPATCH_START_GRF_SHIFT) | - (brw->vs.prog_data->base.urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) | + (prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) | (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT)); OUT_BATCH(((brw->max_vs_threads - 1) << max_threads_shift) | diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c index ea11ae845e3..fd6dab5be8b 100644 --- a/src/mesa/drivers/dri/i965/gen7_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c @@ -83,6 +83,7 @@ upload_wm_state(struct brw_context *brw) /* _NEW_BUFFERS | _NEW_COLOR */ if (brw_color_buffer_write_enabled(brw) || writes_depth || + prog_data->base.nr_image_params || dw1 & GEN7_WM_KILL_ENABLE) { dw1 |= GEN7_WM_DISPATCH_ENABLE; } @@ -106,6 +107,18 @@ upload_wm_state(struct brw_context *brw) dw1 |= GEN7_WM_USES_INPUT_COVERAGE_MASK; } + /* BRW_NEW_FS_PROG_DATA */ + if (prog_data->early_fragment_tests) + dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS; + else if (prog_data->base.nr_image_params) + dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC; + + /* _NEW_BUFFERS | _NEW_COLOR */ + if (brw->is_haswell && + !(brw_color_buffer_write_enabled(brw) || writes_depth) && + prog_data->base.nr_image_params) + dw2 |= HSW_WM_UAV_ONLY; + BEGIN_BATCH(3); OUT_BATCH(_3DSTATE_WM << 16 | (3 - 2)); OUT_BATCH(dw1); @@ -127,7 +140,7 @@ const struct brw_tracked_state gen7_wm_state = { .emit = upload_wm_state, }; -void +static void gen7_upload_ps_state(struct brw_context *brw, const struct gl_fragment_program *fp, const struct brw_stage_state *stage_state, @@ -208,6 +221,9 @@ gen7_upload_ps_state(struct brw_context *brw, _mesa_get_min_invocations_per_fragment(ctx, fp, false); assert(min_inv_per_frag >= 1); + if (brw->is_haswell && prog_data->base.nr_image_params) + dw4 |= HSW_PS_UAV_ACCESS_ENABLE; + if (prog_data->prog_offset_16 || prog_data->no_8) { dw4 |= GEN7_PS_16_DISPATCH_ENABLE; if (!prog_data->no_8 && min_inv_per_frag == 1) { diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c index 12ac97a5d14..93100a0708f 100644 --- a/src/mesa/drivers/dri/i965/gen8_depth_state.c +++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c @@ -41,7 +41,6 @@ emit_depth_packets(struct brw_context *brw, bool depth_writable, struct intel_mipmap_tree *stencil_mt, bool stencil_writable, - uint32_t stencil_offset, bool hiz, uint32_t width, uint32_t height, @@ -57,7 +56,7 @@ emit_depth_packets(struct brw_context *brw, return; } - intel_emit_depth_stall_flushes(brw); + brw_emit_depth_stall_flushes(brw); /* _NEW_BUFFERS, _NEW_DEPTH, _NEW_STENCIL */ BEGIN_BATCH(8); @@ -100,7 +99,7 @@ emit_depth_packets(struct brw_context *brw, } if (stencil_mt == NULL) { - BEGIN_BATCH(5); + BEGIN_BATCH(5); OUT_BATCH(GEN7_3DSTATE_STENCIL_BUFFER << 16 | (5 - 2)); OUT_BATCH(0); OUT_BATCH(0); @@ -127,8 +126,7 @@ emit_depth_packets(struct brw_context *brw, OUT_BATCH(HSW_STENCIL_ENABLED | mocs_wb << 22 | (2 * stencil_mt->pitch - 1)); OUT_RELOC64(stencil_mt->bo, - I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - stencil_offset); + I915_GEM_DOMAIN_RENDER, 
I915_GEM_DOMAIN_RENDER, 0); OUT_BATCH(stencil_mt ? stencil_mt->qpitch >> 2 : 0); ADVANCE_BATCH(); } @@ -220,7 +218,6 @@ gen8_emit_depth_stencil_hiz(struct brw_context *brw, emit_depth_packets(brw, depth_mt, brw_depthbuffer_format(brw), surftype, ctx->Depth.Mask != 0, stencil_mt, ctx->Stencil._WriteEnabled, - brw->depthstencil.stencil_offset, hiz, width, height, depth, lod, min_array_element); } @@ -253,10 +250,10 @@ pma_fix_enable(const struct brw_context *brw) */ const bool hiz_enabled = depth_irb && intel_renderbuffer_has_hiz(depth_irb); - /* 3DSTATE_WM::Early Depth/Stencil Control != EDSC_PREPS (2). - * We always leave this set to EDSC_NORMAL (0). + /* BRW_NEW_FS_PROG_DATA: + * 3DSTATE_WM::Early Depth/Stencil Control != EDSC_PREPS (2). */ - const bool edsc_not_preps = true; + const bool edsc_not_preps = !brw->wm.prog_data->early_fragment_tests; /* 3DSTATE_PS_EXTRA::PixelShaderValid is always true. */ const bool pixel_shader_valid = true; @@ -439,7 +436,7 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt, brw_depth_format(brw, mt->format), BRW_SURFACE_2D, true, /* depth writes */ - NULL, false, 0, /* no stencil for now */ + NULL, false, /* no stencil for now */ true, /* hiz */ surface_width, surface_height, @@ -499,7 +496,7 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt, */ brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE, - brw->batch.workaround_bo, 0, 0, 0); + brw->workaround_bo, 0, 0, 0); /* Emit 3DSTATE_WM_HZ_OP again to disable the state overrides. */ BEGIN_BATCH(5); diff --git a/src/mesa/drivers/dri/i965/gen8_disable.c b/src/mesa/drivers/dri/i965/gen8_disable.c index da0d4a5fe7a..32508e377c9 100644 --- a/src/mesa/drivers/dri/i965/gen8_disable.c +++ b/src/mesa/drivers/dri/i965/gen8_disable.c @@ -66,7 +66,7 @@ disable_stages(struct brw_context *brw) BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_HS << 16 | (2 - 2)); - OUT_BATCH(0); + OUT_BATCH(brw->hw_bt_pool.next_offset); ADVANCE_BATCH(); /* Disable the TE */ @@ -101,7 +101,7 @@ disable_stages(struct brw_context *brw) BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_DS << 16 | (2 - 2)); - OUT_BATCH(0); + OUT_BATCH(brw->hw_bt_pool.next_offset); ADVANCE_BATCH(); BEGIN_BATCH(2); diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c b/src/mesa/drivers/dri/i965/gen8_gs_state.c index 26a02d3b045..81bd3b21778 100644 --- a/src/mesa/drivers/dri/i965/gen8_gs_state.c +++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c @@ -52,7 +52,9 @@ gen8_upload_gs_state(struct brw_context *brw) ((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_GS_SAMPLER_COUNT_SHIFT) | ((prog_data->base.binding_table.size_bytes / 4) << - GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); + GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | + (prog_data->base.nr_image_params ? 
+ HSW_GS_UAV_ACCESS_ENABLE : 0)); if (brw->gs.prog_data->base.base.total_scratch) { OUT_RELOC64(stage_state->scratch_bo, diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index a88f109c691..ae18f0f162c 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -58,7 +58,11 @@ gen8_upload_ps_extra(struct brw_context *brw, if (prog_data->uses_omask) dw1 |= GEN8_PSX_OMASK_TO_RENDER_TARGET; - if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx)) + if (brw->gen >= 9 && prog_data->pulls_bary) + dw1 |= GEN9_PSX_SHADER_PULLS_BARY; + + if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx) || + prog_data->base.nr_image_params) dw1 |= GEN8_PSX_SHADER_HAS_UAV; BEGIN_BATCH(2); @@ -115,6 +119,12 @@ upload_wm_state(struct brw_context *brw) dw1 |= brw->wm.prog_data->barycentric_interp_modes << GEN7_WM_BARYCENTRIC_INTERPOLATION_MODE_SHIFT; + /* BRW_NEW_FS_PROG_DATA */ + if (brw->wm.prog_data->early_fragment_tests) + dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS; + else if (brw->wm.prog_data->base.nr_image_params) + dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC; + BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_WM << 16 | (2 - 2)); OUT_BATCH(dw1); diff --git a/src/mesa/drivers/dri/i965/gen8_sf_state.c b/src/mesa/drivers/dri/i965/gen8_sf_state.c index c2b585d0001..6b655ee493e 100644 --- a/src/mesa/drivers/dri/i965/gen8_sf_state.c +++ b/src/mesa/drivers/dri/i965/gen8_sf_state.c @@ -169,7 +169,7 @@ upload_sf(struct brw_context *brw) point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize); /* Clamp to the hardware limits and convert to fixed point */ - dw3 |= U_FIXED(CLAMP(point_size, 0.125, 255.875), 3); + dw3 |= U_FIXED(CLAMP(point_size, 0.125f, 255.875f), 3); /* _NEW_PROGRAM | _NEW_POINT */ if (!(ctx->VertexProgram.PointSizeEnabled || ctx->Point._Attenuated)) diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c index b2d1a579815..6c4d3e197a5 100644 --- a/src/mesa/drivers/dri/i965/gen8_surface_state.c +++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c @@ -88,12 +88,12 @@ vertical_alignment(const struct brw_context *brw, uint32_t surf_type) { /* On Gen9+ vertical alignment is ignored for 1D surfaces and when - * tr_mode is not TRMODE_NONE. + * tr_mode is not TRMODE_NONE. Set to an arbitrary non-reserved value. */ if (brw->gen > 8 && (mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE || surf_type == BRW_SURFACE_1D)) - return 0; + return GEN8_SURFACE_VALIGN_4; switch (mt->align_h) { case 4: @@ -113,12 +113,12 @@ horizontal_alignment(const struct brw_context *brw, uint32_t surf_type) { /* On Gen9+ horizontal alignment is ignored when tr_mode is not - * TRMODE_NONE. + * TRMODE_NONE. Set to an arbitrary non-reserved value. */ if (brw->gen > 8 && (mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE || gen9_use_linear_1d_layout(brw, mt))) - return 0; + return GEN8_SURFACE_HALIGN_4; switch (mt->align_w) { case 4: @@ -401,8 +401,7 @@ gen8_update_renderbuffer_surface(struct brw_context *brw, irb->mt_layer : (irb->mt_layer / MAX2(mt->num_samples, 1)); GLenum gl_target = rb->TexImage ? rb->TexImage->TexObject->Target : GL_TEXTURE_2D; - /* FINISHME: Use PTE MOCS on Skylake. */ - uint32_t mocs = brw->gen >= 9 ? SKL_MOCS_WT : BDW_MOCS_PTE; + const uint32_t mocs = brw->gen >= 9 ? 
SKL_MOCS_PTE : BDW_MOCS_PTE; intel_miptree_used_for_rendering(mt); diff --git a/src/mesa/drivers/dri/i965/gen8_viewport_state.c b/src/mesa/drivers/dri/i965/gen8_viewport_state.c index 2d8eeb1f10f..2692ad55999 100644 --- a/src/mesa/drivers/dri/i965/gen8_viewport_state.c +++ b/src/mesa/drivers/dri/i965/gen8_viewport_state.c @@ -53,7 +53,7 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw) } for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { - double scale[3], translate[3]; + float scale[3], translate[3]; _mesa_get_viewport_xform(ctx, i, scale, translate); /* _NEW_VIEWPORT: Viewport Matrix Elements */ diff --git a/src/mesa/drivers/dri/i965/gen8_vs_state.c b/src/mesa/drivers/dri/i965/gen8_vs_state.c index 28f5adddf14..8b5048bee7e 100644 --- a/src/mesa/drivers/dri/i965/gen8_vs_state.c +++ b/src/mesa/drivers/dri/i965/gen8_vs_state.c @@ -53,7 +53,9 @@ upload_vs_state(struct brw_context *brw) ((ALIGN(stage_state->sampler_count, 4) / 4) << GEN6_VS_SAMPLER_COUNT_SHIFT) | ((prog_data->base.binding_table.size_bytes / 4) << - GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); + GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | + (prog_data->base.nr_image_params ? + HSW_VS_UAV_ACCESS_ENABLE : 0)); if (prog_data->base.total_scratch) { OUT_RELOC64(stage_state->scratch_bo, diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c index ed659ed625e..85f20a05729 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c @@ -32,6 +32,8 @@ #include "intel_buffers.h" #include "intel_fbo.h" #include "brw_context.h" +#include "brw_defines.h" +#include "brw_state.h" #include <xf86drm.h> #include <i915_drm.h> @@ -44,19 +46,10 @@ intel_batchbuffer_init(struct brw_context *brw) { intel_batchbuffer_reset(brw); - if (brw->gen >= 6) { - /* We can't just use brw_state_batch to get a chunk of space for - * the gen6 workaround because it involves actually writing to - * the buffer, and the kernel doesn't let us write to the batch. - */ - brw->batch.workaround_bo = drm_intel_bo_alloc(brw->bufmgr, - "pipe_control workaround", - 4096, 4096); - } - if (!brw->has_llc) { brw->batch.cpu_map = malloc(BATCH_SZ); brw->batch.map = brw->batch.cpu_map; + brw->batch.map_next = brw->batch.cpu_map; } } @@ -77,12 +70,11 @@ intel_batchbuffer_reset(struct brw_context *brw) drm_intel_bo_map(brw->batch.bo, true); brw->batch.map = brw->batch.bo->virtual; } + brw->batch.map_next = brw->batch.map; brw->batch.reserved_space = BATCH_RESERVED; brw->batch.state_batch_offset = brw->batch.bo->size; - brw->batch.used = 0; brw->batch.needs_sol_reset = false; - brw->batch.pipe_controls_since_last_cs_stall = 0; /* We don't know what ring the new batch will be sent to until we see the * first BEGIN_BATCH or BEGIN_BATCH_BLT. Mark it as unknown. 
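The intel_batchbuffer.c rework that begins here replaces the batch.used dword counter with a map_next write pointer; the dword count is then recovered on demand as USED_BATCH(batch), i.e. map_next - map (the macro is added to intel_batchbuffer.h later in this diff). A stripped-down standalone model of that bookkeeping, with local stand-ins for the batch struct and sizes:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define BATCH_DWORDS 8192     /* stand-in for BATCH_SZ / 4 */

struct batch {
   uint32_t *map;             /* start of the CPU-visible batch */
   uint32_t *map_next;        /* next dword to be written */
};

/* Same idea as USED_BATCH(): the used-dword count falls out of pointer math. */
#define USED_BATCH(b) ((uintptr_t)((b).map_next - (b).map))

static void
emit_dword(struct batch *b, uint32_t dword)
{
   assert(USED_BATCH(*b) < BATCH_DWORDS);
   *b->map_next++ = dword;
}

int main(void)
{
   struct batch b;
   b.map = calloc(BATCH_DWORDS, sizeof(uint32_t));
   b.map_next = b.map;          /* the reset path points map_next at map */

   emit_dword(&b, 0x7a000002);  /* arbitrary payload */
   emit_dword(&b, 0);
   emit_dword(&b, 0);

   /* A flush would hand 4 * USED_BATCH(b) bytes to the kernel. */
   printf("batch uses %zu dwords (%zu bytes)\n",
          (size_t)USED_BATCH(b), (size_t)(4 * USED_BATCH(b)));
   assert(USED_BATCH(b) == 3);

   free(b.map);
   return 0;
}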
@@ -93,7 +85,7 @@ intel_batchbuffer_reset(struct brw_context *brw) void intel_batchbuffer_save_state(struct brw_context *brw) { - brw->batch.saved.used = brw->batch.used; + brw->batch.saved.map_next = brw->batch.map_next; brw->batch.saved.reloc_count = drm_intel_gem_bo_get_reloc_count(brw->batch.bo); } @@ -103,8 +95,8 @@ intel_batchbuffer_reset_to_saved(struct brw_context *brw) { drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count); - brw->batch.used = brw->batch.saved.used; - if (brw->batch.used == 0) + brw->batch.map_next = brw->batch.saved.map_next; + if (USED_BATCH(brw->batch) == 0) brw->batch.ring = UNKNOWN_RING; } @@ -114,7 +106,6 @@ intel_batchbuffer_free(struct brw_context *brw) free(brw->batch.cpu_map); drm_intel_bo_unreference(brw->batch.last_bo); drm_intel_bo_unreference(brw->batch.bo); - drm_intel_bo_unreference(brw->batch.workaround_bo); } static void @@ -133,7 +124,7 @@ do_batch_dump(struct brw_context *brw) drm_intel_decode_set_batch_pointer(decode, batch->bo->virtual, batch->bo->offset64, - batch->used); + USED_BATCH(*batch)); } else { fprintf(stderr, "WARNING: failed to map batchbuffer (%s), " @@ -142,7 +133,7 @@ do_batch_dump(struct brw_context *brw) drm_intel_decode_set_batch_pointer(decode, batch->map, batch->bo->offset64, - batch->used); + USED_BATCH(*batch)); } drm_intel_decode_set_output_file(decode, stderr); @@ -218,10 +209,32 @@ brw_finish_batch(struct brw_context *brw) */ brw_emit_query_end(brw); - /* We may also need to snapshot and disable OA counters. */ - if (brw->batch.ring == RENDER_RING) + if (brw->batch.ring == RENDER_RING) { + /* We may also need to snapshot and disable OA counters. */ brw_perf_monitor_finish_batch(brw); + if (brw->is_haswell) { + /* From the Haswell PRM, Volume 2b, Command Reference: Instructions, + * 3DSTATE_CC_STATE_POINTERS > "Note": + * + * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every + * 3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall." + * + * From the example in the docs, it seems to expect a regular pipe control + * flush here as well. We may have done it already, but meh. + * + * See also WaAvoidRCZCounterRollover. + */ + brw_emit_mi_flush(brw); + BEGIN_BATCH(2); + OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2)); + OUT_BATCH(brw->cc.state_offset | 1); + ADVANCE_BATCH(); + brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_CS_STALL); + } + } + /* Mark that the current program cache BO has been used by the GPU. * It will be reallocated if we need to put new programs in for the * next batch. @@ -267,6 +280,11 @@ throttle(struct brw_context *brw) } } +/* Drop when RS headers get pulled to libdrm */ +#ifndef I915_EXEC_RESOURCE_STREAMER +#define I915_EXEC_RESOURCE_STREAMER (1<<15) +#endif + /* TODO: Push this whole function into bufmgr. */ static int @@ -278,7 +296,7 @@ do_flush_locked(struct brw_context *brw) if (brw->has_llc) { drm_intel_bo_unmap(batch->bo); } else { - ret = drm_intel_bo_subdata(batch->bo, 0, 4*batch->used, batch->map); + ret = drm_intel_bo_subdata(batch->bo, 0, 4 * USED_BATCH(*batch), batch->map); if (ret == 0 && batch->state_batch_offset != batch->bo->size) { ret = drm_intel_bo_subdata(batch->bo, batch->state_batch_offset, @@ -293,7 +311,8 @@ do_flush_locked(struct brw_context *brw) if (brw->gen >= 6 && batch->ring == BLT_RING) { flags = I915_EXEC_BLT; } else { - flags = I915_EXEC_RENDER; + flags = I915_EXEC_RENDER | + (brw->use_resource_streamer ? 
I915_EXEC_RESOURCE_STREAMER : 0); } if (batch->needs_sol_reset) flags |= I915_EXEC_GEN7_SOL_RESET; @@ -303,11 +322,11 @@ do_flush_locked(struct brw_context *brw) brw_annotate_aub(brw); if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) { - ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0, - flags); + ret = drm_intel_bo_mrb_exec(batch->bo, 4 * USED_BATCH(*batch), + NULL, 0, 0, flags); } else { ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx, - 4 * batch->used, flags); + 4 * USED_BATCH(*batch), flags); } } @@ -331,7 +350,7 @@ _intel_batchbuffer_flush(struct brw_context *brw, { int ret; - if (brw->batch.used == 0) + if (USED_BATCH(brw->batch) == 0) return 0; if (brw->throttle_batch[0] == NULL) { @@ -340,7 +359,7 @@ _intel_batchbuffer_flush(struct brw_context *brw, } if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) { - int bytes_for_commands = 4 * brw->batch.used; + int bytes_for_commands = 4 * USED_BATCH(brw->batch); int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset; int total_bytes = bytes_for_commands + bytes_for_state; fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + " @@ -356,7 +375,7 @@ _intel_batchbuffer_flush(struct brw_context *brw, /* Mark the end of the buffer. */ intel_batchbuffer_emit_dword(brw, MI_BATCH_BUFFER_END); - if (brw->batch.used & 1) { + if (USED_BATCH(brw->batch) & 1) { /* Round batchbuffer usage to 2 DWORDs. */ intel_batchbuffer_emit_dword(brw, MI_NOOP); } @@ -373,6 +392,9 @@ _intel_batchbuffer_flush(struct brw_context *brw, drm_intel_bo_wait_rendering(brw->batch.bo); } + if (brw->use_resource_streamer) + gen7_reset_hw_bt_pool_offsets(brw); + /* Start a new batch buffer. */ brw_new_batch(brw); @@ -382,15 +404,15 @@ _intel_batchbuffer_flush(struct brw_context *brw, /* This is the only way buffers get added to the validate list. 
*/ -bool -intel_batchbuffer_emit_reloc(struct brw_context *brw, - drm_intel_bo *buffer, - uint32_t read_domains, uint32_t write_domain, - uint32_t delta) +uint32_t +intel_batchbuffer_reloc(struct brw_context *brw, + drm_intel_bo *buffer, uint32_t offset, + uint32_t read_domains, uint32_t write_domain, + uint32_t delta) { int ret; - ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used, + ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset, buffer, delta, read_domains, write_domain); assert(ret == 0); @@ -400,18 +422,16 @@ intel_batchbuffer_emit_reloc(struct brw_context *brw, * case the buffer doesn't move and we can short-circuit the relocation * processing in the kernel */ - intel_batchbuffer_emit_dword(brw, buffer->offset64 + delta); - - return true; + return buffer->offset64 + delta; } -bool -intel_batchbuffer_emit_reloc64(struct brw_context *brw, - drm_intel_bo *buffer, - uint32_t read_domains, uint32_t write_domain, - uint32_t delta) +uint64_t +intel_batchbuffer_reloc64(struct brw_context *brw, + drm_intel_bo *buffer, uint32_t offset, + uint32_t read_domains, uint32_t write_domain, + uint32_t delta) { - int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used, + int ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset, buffer, delta, read_domains, write_domain); assert(ret == 0); @@ -421,11 +441,7 @@ intel_batchbuffer_emit_reloc64(struct brw_context *brw, * case the buffer doesn't move and we can short-circuit the relocation * processing in the kernel */ - uint64_t offset = buffer->offset64 + delta; - intel_batchbuffer_emit_dword(brw, offset); - intel_batchbuffer_emit_dword(brw, offset >> 32); - - return true; + return buffer->offset64 + delta; } @@ -435,312 +451,8 @@ intel_batchbuffer_data(struct brw_context *brw, { assert((bytes & 3) == 0); intel_batchbuffer_require_space(brw, bytes, ring); - memcpy(brw->batch.map + brw->batch.used, data, bytes); - brw->batch.used += bytes >> 2; -} - -/** - * According to the latest documentation, any PIPE_CONTROL with the - * "Command Streamer Stall" bit set must also have another bit set, - * with five different options: - * - * - Render Target Cache Flush - * - Depth Cache Flush - * - Stall at Pixel Scoreboard - * - Post-Sync Operation - * - Depth Stall - * - * I chose "Stall at Pixel Scoreboard" since we've used it effectively - * in the past, but the choice is fairly arbitrary. - */ -static void -gen8_add_cs_stall_workaround_bits(uint32_t *flags) -{ - uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH | - PIPE_CONTROL_DEPTH_CACHE_FLUSH | - PIPE_CONTROL_WRITE_IMMEDIATE | - PIPE_CONTROL_WRITE_DEPTH_COUNT | - PIPE_CONTROL_WRITE_TIMESTAMP | - PIPE_CONTROL_STALL_AT_SCOREBOARD | - PIPE_CONTROL_DEPTH_STALL; - - /* If we're doing a CS stall, and don't already have one of the - * workaround bits set, add "Stall at Pixel Scoreboard." - */ - if ((*flags & PIPE_CONTROL_CS_STALL) != 0 && (*flags & wa_bits) == 0) - *flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD; -} - -/* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT: - * - * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with - * only read-cache-invalidate bit(s) set, must have a CS_STALL bit set." - * - * Note that the kernel does CS stalls between batches, so we only need - * to count them within a batch. 
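intel_batchbuffer_reloc()/reloc64() above no longer write the presumed address into the batch themselves; they only record the relocation at a caller-supplied byte offset and return the presumed value, and the reworked OUT_RELOC/OUT_RELOC64 macros (later in this diff) store it through the local __map cursor. The schematic sketch below shows that split with the libdrm call reduced to a recording stub; all names here are local stand-ins, not the driver's structs.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal stand-ins: a "bo" with a presumed GPU address and a batch cursor. */
struct fake_bo    { uint64_t offset64; };
struct fake_batch { uint32_t map[64]; uint32_t *map_next; };

/* Stub for drm_intel_bo_emit_reloc(): only notes where the reloc lives;
 * the real call also records read/write domains for the kernel.
 */
static void
record_reloc(struct fake_batch *batch, uint32_t batch_offset,
             struct fake_bo *target, uint32_t delta)
{
   (void)batch; (void)batch_offset; (void)target; (void)delta;
}

/* Counterpart of intel_batchbuffer_reloc(): record the reloc, then *return*
 * the presumed address so the caller can emit it wherever its cursor is.
 */
static uint32_t
batch_reloc(struct fake_batch *batch, uint32_t batch_offset,
            struct fake_bo *target, uint32_t delta)
{
   record_reloc(batch, batch_offset, target, delta);
   return (uint32_t)(target->offset64 + delta);
}

int main(void)
{
   struct fake_batch batch = { .map = {0} };
   struct fake_bo bo = { .offset64 = 0x12340000 };
   batch.map_next = batch.map;

   /* What OUT_RELOC expands to, in spirit: compute the byte offset of the
    * current cursor, record the reloc there, then emit the presumed value.
    */
   uint32_t byte_offset = (uint32_t)(batch.map_next - batch.map) * 4;
   *batch.map_next++ = batch_reloc(&batch, byte_offset, &bo, 0x100);

   assert(batch.map[0] == 0x12340100);
   printf("emitted presumed address 0x%08x\n", batch.map[0]);
   return 0;
}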
- */ -static uint32_t -gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags) -{ - if (brw->gen == 7 && !brw->is_haswell) { - if (flags & PIPE_CONTROL_CS_STALL) { - /* If we're doing a CS stall, reset the counter and carry on. */ - brw->batch.pipe_controls_since_last_cs_stall = 0; - return 0; - } - - /* If this is the fourth pipe control without a CS stall, do one now. */ - if (++brw->batch.pipe_controls_since_last_cs_stall == 4) { - brw->batch.pipe_controls_since_last_cs_stall = 0; - return PIPE_CONTROL_CS_STALL; - } - } - return 0; -} - -/** - * Emit a PIPE_CONTROL with various flushing flags. - * - * The caller is responsible for deciding what flags are appropriate for the - * given generation. - */ -void -brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags) -{ - if (brw->gen >= 8) { - gen8_add_cs_stall_workaround_bits(&flags); - - BEGIN_BATCH(6); - OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2)); - OUT_BATCH(flags); - OUT_BATCH(0); - OUT_BATCH(0); - OUT_BATCH(0); - OUT_BATCH(0); - ADVANCE_BATCH(); - } else if (brw->gen >= 6) { - flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags); - - BEGIN_BATCH(5); - OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2)); - OUT_BATCH(flags); - OUT_BATCH(0); - OUT_BATCH(0); - OUT_BATCH(0); - ADVANCE_BATCH(); - } else { - BEGIN_BATCH(4); - OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2)); - OUT_BATCH(0); - OUT_BATCH(0); - OUT_BATCH(0); - ADVANCE_BATCH(); - } -} - -/** - * Emit a PIPE_CONTROL that writes to a buffer object. - * - * \p flags should contain one of the following items: - * - PIPE_CONTROL_WRITE_IMMEDIATE - * - PIPE_CONTROL_WRITE_TIMESTAMP - * - PIPE_CONTROL_WRITE_DEPTH_COUNT - */ -void -brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags, - drm_intel_bo *bo, uint32_t offset, - uint32_t imm_lower, uint32_t imm_upper) -{ - if (brw->gen >= 8) { - gen8_add_cs_stall_workaround_bits(&flags); - - BEGIN_BATCH(6); - OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2)); - OUT_BATCH(flags); - OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - offset); - OUT_BATCH(imm_lower); - OUT_BATCH(imm_upper); - ADVANCE_BATCH(); - } else if (brw->gen >= 6) { - flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags); - - /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24 - * on later platforms. We always use PPGTT on Gen7+. - */ - unsigned gen6_gtt = brw->gen == 6 ? PIPE_CONTROL_GLOBAL_GTT_WRITE : 0; - - BEGIN_BATCH(5); - OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2)); - OUT_BATCH(flags); - OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - gen6_gtt | offset); - OUT_BATCH(imm_lower); - OUT_BATCH(imm_upper); - ADVANCE_BATCH(); - } else { - BEGIN_BATCH(4); - OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2)); - OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - PIPE_CONTROL_GLOBAL_GTT_WRITE | offset); - OUT_BATCH(imm_lower); - OUT_BATCH(imm_upper); - ADVANCE_BATCH(); - } -} - -/** - * Restriction [DevSNB, DevIVB]: - * - * Prior to changing Depth/Stencil Buffer state (i.e. 
any combination of - * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER, - * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall - * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth - * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by - * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set), - * unless SW can otherwise guarantee that the pipeline from WM onwards is - * already flushed (e.g., via a preceding MI_FLUSH). - */ -void -intel_emit_depth_stall_flushes(struct brw_context *brw) -{ - assert(brw->gen >= 6 && brw->gen <= 9); - - brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL); - brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH); - brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL); -} - -/** - * From the Ivybridge PRM, Volume 2 Part 1, Section 3.2 (VS Stage Input): - * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth - * stall needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS, - * 3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS, - * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one PIPE_CONTROL needs - * to be sent before any combination of VS associated 3DSTATE." - */ -void -gen7_emit_vs_workaround_flush(struct brw_context *brw) -{ - assert(brw->gen == 7); - brw_emit_pipe_control_write(brw, - PIPE_CONTROL_WRITE_IMMEDIATE - | PIPE_CONTROL_DEPTH_STALL, - brw->batch.workaround_bo, 0, - 0, 0); -} - - -/** - * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set. - */ -void -gen7_emit_cs_stall_flush(struct brw_context *brw) -{ - brw_emit_pipe_control_write(brw, - PIPE_CONTROL_CS_STALL - | PIPE_CONTROL_WRITE_IMMEDIATE, - brw->batch.workaround_bo, 0, - 0, 0); -} - - -/** - * Emits a PIPE_CONTROL with a non-zero post-sync operation, for - * implementing two workarounds on gen6. From section 1.4.7.1 - * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1: - * - * [DevSNB-C+{W/A}] Before any depth stall flush (including those - * produced by non-pipelined state commands), software needs to first - * send a PIPE_CONTROL with no bits set except Post-Sync Operation != - * 0. - * - * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable - * =1, a PIPE_CONTROL with any non-zero post-sync-op is required. - * - * And the workaround for these two requires this workaround first: - * - * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent - * BEFORE the pipe-control with a post-sync op and no write-cache - * flushes. - * - * And this last workaround is tricky because of the requirements on - * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM - * volume 2 part 1: - * - * "1 of the following must also be set: - * - Render Target Cache Flush Enable ([12] of DW1) - * - Depth Cache Flush Enable ([0] of DW1) - * - Stall at Pixel Scoreboard ([1] of DW1) - * - Depth Stall ([13] of DW1) - * - Post-Sync Operation ([13] of DW1) - * - Notify Enable ([8] of DW1)" - * - * The cache flushes require the workaround flush that triggered this - * one, so we can't use it. Depth stall would trigger the same. - * Post-sync nonzero is what triggered this second workaround, so we - * can't use that one either. Notify enable is IRQs, which aren't - * really our business. That leaves only stall at scoreboard. 
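Among the PIPE_CONTROL helpers removed from intel_batchbuffer.c here (presumably relocated elsewhere in this series, since brw_emit_pipe_control_flush() is still called throughout the diff) is gen8_add_cs_stall_workaround_bits(): a PIPE_CONTROL with the CS-stall bit must also set one of a small set of companion bits, and "Stall at Pixel Scoreboard" is the harmless default. The check is plain bit arithmetic, sketched standalone below with local flag values; the driver's real PIPE_CONTROL_* defines differ.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Local stand-in flag values, not the real brw_defines.h bits. */
#define PC_CS_STALL             (1u << 0)
#define PC_RT_FLUSH             (1u << 1)
#define PC_DEPTH_FLUSH          (1u << 2)
#define PC_WRITE_IMMEDIATE      (1u << 3)
#define PC_WRITE_DEPTH_COUNT    (1u << 4)
#define PC_WRITE_TIMESTAMP      (1u << 5)
#define PC_STALL_AT_SCOREBOARD  (1u << 6)
#define PC_DEPTH_STALL          (1u << 7)

/* Same shape as gen8_add_cs_stall_workaround_bits(): a CS stall needs at
 * least one companion bit; add Stall-at-Scoreboard if none is present.
 */
static void
add_cs_stall_workaround_bits(uint32_t *flags)
{
   const uint32_t wa_bits = PC_RT_FLUSH | PC_DEPTH_FLUSH |
                            PC_WRITE_IMMEDIATE | PC_WRITE_DEPTH_COUNT |
                            PC_WRITE_TIMESTAMP | PC_STALL_AT_SCOREBOARD |
                            PC_DEPTH_STALL;

   if ((*flags & PC_CS_STALL) && !(*flags & wa_bits))
      *flags |= PC_STALL_AT_SCOREBOARD;
}

int main(void)
{
   uint32_t bare = PC_CS_STALL;
   add_cs_stall_workaround_bits(&bare);
   assert(bare & PC_STALL_AT_SCOREBOARD);    /* companion bit added */

   uint32_t ok = PC_CS_STALL | PC_RT_FLUSH;
   add_cs_stall_workaround_bits(&ok);
   assert(!(ok & PC_STALL_AT_SCOREBOARD));   /* requirement already satisfied */

   printf("CS-stall workaround bits behave as documented\n");
   return 0;
}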
- */ -void -intel_emit_post_sync_nonzero_flush(struct brw_context *brw) -{ - brw_emit_pipe_control_flush(brw, - PIPE_CONTROL_CS_STALL | - PIPE_CONTROL_STALL_AT_SCOREBOARD); - - brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE, - brw->batch.workaround_bo, 0, 0, 0); -} - -/* Emit a pipelined flush to either flush render and texture cache for - * reading from a FBO-drawn texture, or flush so that frontbuffer - * render appears on the screen in DRI1. - * - * This is also used for the always_flush_cache driconf debug option. - */ -void -intel_batchbuffer_emit_mi_flush(struct brw_context *brw) -{ - if (brw->batch.ring == BLT_RING && brw->gen >= 6) { - BEGIN_BATCH_BLT(4); - OUT_BATCH(MI_FLUSH_DW); - OUT_BATCH(0); - OUT_BATCH(0); - OUT_BATCH(0); - ADVANCE_BATCH(); - } else { - int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_RENDER_TARGET_FLUSH; - if (brw->gen >= 6) { - if (brw->gen == 9) { - /* Hardware workaround: SKL - * - * Emit Pipe Control with all bits set to zero before emitting - * a Pipe Control with VF Cache Invalidate set. - */ - brw_emit_pipe_control_flush(brw, 0); - } - - flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE | - PIPE_CONTROL_DEPTH_CACHE_FLUSH | - PIPE_CONTROL_VF_CACHE_INVALIDATE | - PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | - PIPE_CONTROL_CS_STALL; - - if (brw->gen == 6) { - /* Hardware workaround: SNB B-Spec says: - * - * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache - * Flush Enable =1, a PIPE_CONTROL with any non-zero - * post-sync-op is required. - */ - intel_emit_post_sync_nonzero_flush(brw); - } - } - brw_emit_pipe_control_flush(brw, flags); - } - - brw_render_cache_set_clear(brw); + memcpy(brw->batch.map_next, data, bytes); + brw->batch.map_next += bytes >> 2; } static void diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h index 7bdd8364346..84add927c9a 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h @@ -22,12 +22,16 @@ extern "C" { * - Disabling OA counters on Gen6+ (3 DWords = 12 bytes) * - Ending MI_REPORT_PERF_COUNT on Gen5+, plus associated PIPE_CONTROLs: * - Two sets of PIPE_CONTROLs, which become 3 PIPE_CONTROLs each on SNB, - * which are 4 DWords each ==> 2 * 3 * 4 * 4 = 96 bytes + * which are 5 DWords each ==> 2 * 3 * 5 * 4 = 120 bytes * - 3 DWords for MI_REPORT_PERF_COUNT itself on Gen6+. ==> 12 bytes. * On Ironlake, it's 6 DWords, but we have some slack due to the lack of * Sandybridge PIPE_CONTROL madness. 
+ * - CC_STATE workaround on HSW (12 * 4 = 48 bytes) + * - 5 dwords for initial mi_flush + * - 2 dwords for CC state setup + * - 5 dwords for the required pipe control at the end */ -#define BATCH_RESERVED 146 +#define BATCH_RESERVED 152 struct intel_batchbuffer; @@ -53,25 +57,20 @@ void intel_batchbuffer_data(struct brw_context *brw, const void *data, GLuint bytes, enum brw_gpu_ring ring); -bool intel_batchbuffer_emit_reloc(struct brw_context *brw, - drm_intel_bo *buffer, - uint32_t read_domains, - uint32_t write_domain, - uint32_t offset); -bool intel_batchbuffer_emit_reloc64(struct brw_context *brw, - drm_intel_bo *buffer, - uint32_t read_domains, - uint32_t write_domain, - uint32_t offset); -void brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags); -void brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags, - drm_intel_bo *bo, uint32_t offset, - uint32_t imm_lower, uint32_t imm_upper); -void intel_batchbuffer_emit_mi_flush(struct brw_context *brw); -void intel_emit_post_sync_nonzero_flush(struct brw_context *brw); -void intel_emit_depth_stall_flushes(struct brw_context *brw); -void gen7_emit_vs_workaround_flush(struct brw_context *brw); -void gen7_emit_cs_stall_flush(struct brw_context *brw); +uint32_t intel_batchbuffer_reloc(struct brw_context *brw, + drm_intel_bo *buffer, + uint32_t offset, + uint32_t read_domains, + uint32_t write_domain, + uint32_t delta); +uint64_t intel_batchbuffer_reloc64(struct brw_context *brw, + drm_intel_bo *buffer, + uint32_t offset, + uint32_t read_domains, + uint32_t write_domain, + uint32_t delta); + +#define USED_BATCH(batch) ((uintptr_t)((batch).map_next - (batch).map)) static inline uint32_t float_as_int(float f) { @@ -93,7 +92,7 @@ static inline unsigned intel_batchbuffer_space(struct brw_context *brw) { return (brw->batch.state_batch_offset - brw->batch.reserved_space) - - brw->batch.used*4; + - USED_BATCH(brw->batch) * 4; } @@ -103,7 +102,7 @@ intel_batchbuffer_emit_dword(struct brw_context *brw, GLuint dword) #ifdef DEBUG assert(intel_batchbuffer_space(brw) >= 4); #endif - brw->batch.map[brw->batch.used++] = dword; + *brw->batch.map_next++ = dword; assert(brw->batch.ring != UNKNOWN_RING); } @@ -144,8 +143,8 @@ intel_batchbuffer_begin(struct brw_context *brw, int n, enum brw_gpu_ring ring) { intel_batchbuffer_require_space(brw, n * 4, ring); - brw->batch.emit = brw->batch.used; #ifdef DEBUG + brw->batch.emit = USED_BATCH(brw->batch); brw->batch.total = n; #endif } @@ -155,7 +154,7 @@ intel_batchbuffer_advance(struct brw_context *brw) { #ifdef DEBUG struct intel_batchbuffer *batch = &brw->batch; - unsigned int _n = batch->used - batch->emit; + unsigned int _n = USED_BATCH(*batch) - batch->emit; assert(batch->total != 0); if (_n != batch->total) { fprintf(stderr, "ADVANCE_BATCH: %d of %d dwords emitted\n", @@ -166,21 +165,42 @@ intel_batchbuffer_advance(struct brw_context *brw) #endif } -#define BEGIN_BATCH(n) intel_batchbuffer_begin(brw, n, RENDER_RING) -#define BEGIN_BATCH_BLT(n) intel_batchbuffer_begin(brw, n, BLT_RING) -#define OUT_BATCH(d) intel_batchbuffer_emit_dword(brw, d) -#define OUT_BATCH_F(f) intel_batchbuffer_emit_float(brw, f) -#define OUT_RELOC(buf, read_domains, write_domain, delta) do { \ - intel_batchbuffer_emit_reloc(brw, buf, \ - read_domains, write_domain, delta); \ +#define BEGIN_BATCH(n) do { \ + intel_batchbuffer_begin(brw, (n), RENDER_RING); \ + uint32_t *__map = brw->batch.map_next; \ + brw->batch.map_next += (n) + +#define BEGIN_BATCH_BLT(n) do { \ + intel_batchbuffer_begin(brw, (n), 
BLT_RING); \ + uint32_t *__map = brw->batch.map_next; \ + brw->batch.map_next += (n) + +#define OUT_BATCH(d) *__map++ = (d) +#define OUT_BATCH_F(f) OUT_BATCH(float_as_int((f))) + +#define OUT_RELOC(buf, read_domains, write_domain, delta) do { \ + uint32_t __offset = (__map - brw->batch.map) * 4; \ + OUT_BATCH(intel_batchbuffer_reloc(brw, (buf), __offset, \ + (read_domains), \ + (write_domain), \ + (delta))); \ } while (0) /* Handle 48-bit address relocations for Gen8+ */ -#define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \ - intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta); \ +#define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \ + uint32_t __offset = (__map - brw->batch.map) * 4; \ + uint64_t reloc64 = intel_batchbuffer_reloc64(brw, (buf), __offset, \ + (read_domains), \ + (write_domain), \ + (delta)); \ + OUT_BATCH(reloc64); \ + OUT_BATCH(reloc64 >> 32); \ } while (0) -#define ADVANCE_BATCH() intel_batchbuffer_advance(brw); +#define ADVANCE_BATCH() \ + assert(__map == brw->batch.map_next); \ + intel_batchbuffer_advance(brw); \ +} while (0) #ifdef __cplusplus } diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c index d3ab769356c..6d92580e725 100644 --- a/src/mesa/drivers/dri/i965/intel_blit.c +++ b/src/mesa/drivers/dri/i965/intel_blit.c @@ -27,6 +27,7 @@ #include "main/mtypes.h" +#include "main/blit.h" #include "main/context.h" #include "main/enums.h" #include "main/colormac.h" @@ -43,6 +44,23 @@ #define FILE_DEBUG_FLAG DEBUG_BLIT +#define SET_TILING_XY_FAST_COPY_BLT(tiling, tr_mode, type) \ +({ \ + switch (tiling) { \ + case I915_TILING_X: \ + CMD |= type ## _TILED_X; \ + break; \ + case I915_TILING_Y: \ + if (tr_mode == INTEL_MIPTREE_TRMODE_YS) \ + CMD |= type ## _TILED_64K; \ + else \ + CMD |= type ## _TILED_Y; \ + break; \ + default: \ + unreachable("not reached"); \ + } \ +}) + static void intel_miptree_set_alpha_to_one(struct brw_context *brw, struct intel_mipmap_tree *mt, @@ -75,6 +93,10 @@ static uint32_t br13_for_cpp(int cpp) { switch (cpp) { + case 16: + return BR13_32323232; + case 8: + return BR13_16161616; case 4: return BR13_8888; case 2: @@ -86,6 +108,64 @@ br13_for_cpp(int cpp) } } +static uint32_t +get_tr_horizontal_align(uint32_t tr_mode, uint32_t cpp, bool is_src) { + /* Alignment tables for YF/YS tiled surfaces. */ + const uint32_t align_2d_yf[] = {64, 64, 32, 32, 16}; + const uint32_t bpp = cpp * 8; + const uint32_t shift = is_src ? 17 : 10; + uint32_t align; + int i = 0; + + if (tr_mode == INTEL_MIPTREE_TRMODE_NONE) + return 0; + + /* Compute array index. */ + assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp)); + i = ffs(bpp / 8) - 1; + + align = tr_mode == INTEL_MIPTREE_TRMODE_YF ? + align_2d_yf[i] : + 4 * align_2d_yf[i]; + + assert(_mesa_is_pow_two(align)); + + /* XY_FAST_COPY_BLT doesn't support horizontal alignment of 16. */ + if (align == 16) + align = 32; + + return (ffs(align) - 6) << shift; +} + +static uint32_t +get_tr_vertical_align(uint32_t tr_mode, uint32_t cpp, bool is_src) { + /* Vertical alignment tables for YF/YS tiled surfaces. */ + const unsigned align_2d_yf[] = {64, 32, 32, 16, 16}; + const uint32_t bpp = cpp * 8; + const uint32_t shift = is_src ? 15 : 8; + uint32_t align; + int i = 0; + + if (tr_mode == INTEL_MIPTREE_TRMODE_NONE) + return 0; + + /* Compute array index. */ + assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp)); + i = ffs(bpp / 8) - 1; + + align = tr_mode == INTEL_MIPTREE_TRMODE_YF ? 
+ align_2d_yf[i] : + 4 * align_2d_yf[i]; + + assert(_mesa_is_pow_two(align)); + + /* XY_FAST_COPY_BLT doesn't support vertical alignments of 16 and 32. */ + if (align == 16 || align == 32) + align = 64; + + return (ffs(align) - 7) << shift; +} + /** * Emits the packet for switching the blitter from X to Y tiled or back. * @@ -96,9 +176,10 @@ br13_for_cpp(int cpp) * tiling state would leak into other unsuspecting applications (like the X * server). */ -static void +static uint32_t * set_blitter_tiling(struct brw_context *brw, - bool dst_y_tiled, bool src_y_tiled) + bool dst_y_tiled, bool src_y_tiled, + uint32_t *__map) { assert(brw->gen >= 6); @@ -113,19 +194,19 @@ set_blitter_tiling(struct brw_context *brw, OUT_BATCH((BCS_SWCTRL_DST_Y | BCS_SWCTRL_SRC_Y) << 16 | (dst_y_tiled ? BCS_SWCTRL_DST_Y : 0) | (src_y_tiled ? BCS_SWCTRL_SRC_Y : 0)); + return __map; } +#define SET_BLITTER_TILING(...) __map = set_blitter_tiling(__VA_ARGS__, __map) -#define BEGIN_BATCH_BLT_TILED(n, dst_y_tiled, src_y_tiled) do { \ +#define BEGIN_BATCH_BLT_TILED(n, dst_y_tiled, src_y_tiled) \ BEGIN_BATCH_BLT(n + ((dst_y_tiled || src_y_tiled) ? 14 : 0)); \ if (dst_y_tiled || src_y_tiled) \ - set_blitter_tiling(brw, dst_y_tiled, src_y_tiled); \ - } while (0) + SET_BLITTER_TILING(brw, dst_y_tiled, src_y_tiled) -#define ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled) do { \ +#define ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled) \ if (dst_y_tiled || src_y_tiled) \ - set_blitter_tiling(brw, false, false); \ - ADVANCE_BATCH(); \ - } while (0) + SET_BLITTER_TILING(brw, false, false); \ + ADVANCE_BATCH() static int blt_pitch(struct intel_mipmap_tree *mt) @@ -278,9 +359,11 @@ intel_miptree_blit(struct brw_context *brw, src_pitch, src_mt->bo, src_mt->offset, src_mt->tiling, + src_mt->tr_mode, dst_mt->pitch, dst_mt->bo, dst_mt->offset, dst_mt->tiling, + dst_mt->tr_mode, src_x, src_y, dst_x, dst_y, width, height, @@ -313,6 +396,112 @@ alignment_valid(struct brw_context *brw, unsigned offset, uint32_t tiling) return true; } +static bool +can_fast_copy_blit(struct brw_context *brw, + drm_intel_bo *src_buffer, + int16_t src_x, int16_t src_y, + uintptr_t src_offset, uint32_t src_pitch, + uint32_t src_tiling, uint32_t src_tr_mode, + drm_intel_bo *dst_buffer, + int16_t dst_x, int16_t dst_y, + uintptr_t dst_offset, uint32_t dst_pitch, + uint32_t dst_tiling, uint32_t dst_tr_mode, + int16_t w, int16_t h, uint32_t cpp) +{ + const bool dst_tiling_none = dst_tiling == I915_TILING_NONE; + const bool src_tiling_none = src_tiling == I915_TILING_NONE; + + if (brw->gen < 9) + return false; + + if (src_buffer->handle == dst_buffer->handle && + _mesa_regions_overlap(src_x, src_y, src_x + w, src_y + h, + dst_x, dst_y, dst_x + w, dst_y + h)) + return false; + + /* Enable fast copy blit only if the surfaces are Yf/Ys tiled. + * FIXME: Based on performance data, remove this condition later to + * enable for all types of surfaces. + */ + if (src_tr_mode == INTEL_MIPTREE_TRMODE_NONE && + dst_tr_mode == INTEL_MIPTREE_TRMODE_NONE) + return false; + + /* For all surface types buffers must be cacheline-aligned. */ + if ((dst_offset | src_offset) & 63) + return false; + + /* Color depth greater than 128 bits not supported. */ + if (cpp > 16) + return false; + + /* For Fast Copy Blits the pitch cannot be a negative number. So, bit 15 + * of the destination pitch must be zero. + */ + if ((src_pitch >> 15 & 1) != 0 || (dst_pitch >> 15 & 1) != 0) + return false; + + /* For Linear surfaces, the pitch has to be an OWord (16byte) multiple. 
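+ *
+ * Illustration (editorial, not from the patch itself): a linear surface
+ * with cpp=4 and width 100 has a 400-byte pitch; 400 / 16 = 25, so this
+ * check passes. A 70-pixel-wide one has a 280-byte pitch, which is not
+ * an OWord multiple, so can_fast_copy_blit() returns false here.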
*/ + if ((src_tiling_none && src_pitch % 16 != 0) || + (dst_tiling_none && dst_pitch % 16 != 0)) + return false; + + /* For Tiled surfaces, the pitch has to be a multiple of the Tile width + * (X direction width of the Tile). This means the pitch value will + * always be Cache Line aligned (64byte multiple). + */ + if ((!dst_tiling_none && dst_pitch % 64 != 0) || + (!src_tiling_none && src_pitch % 64 != 0)) + return false; + + return true; +} + +static uint32_t +xy_blit_cmd(uint32_t src_tiling, uint32_t src_tr_mode, + uint32_t dst_tiling, uint32_t dst_tr_mode, + uint32_t cpp, bool use_fast_copy_blit) +{ + uint32_t CMD = 0; + + if (use_fast_copy_blit) { + CMD = XY_FAST_COPY_BLT_CMD; + + if (dst_tiling != I915_TILING_NONE) + SET_TILING_XY_FAST_COPY_BLT(dst_tiling, dst_tr_mode, XY_FAST_DST); + + if (src_tiling != I915_TILING_NONE) + SET_TILING_XY_FAST_COPY_BLT(src_tiling, src_tr_mode, XY_FAST_SRC); + + CMD |= get_tr_horizontal_align(src_tr_mode, cpp, true /* is_src */); + CMD |= get_tr_vertical_align(src_tr_mode, cpp, true /* is_src */); + + CMD |= get_tr_horizontal_align(dst_tr_mode, cpp, false /* is_src */); + CMD |= get_tr_vertical_align(dst_tr_mode, cpp, false /* is_src */); + + } else { + assert(cpp <= 4); + switch (cpp) { + case 1: + case 2: + CMD = XY_SRC_COPY_BLT_CMD; + break; + case 4: + CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB; + break; + default: + unreachable("not reached"); + } + + if (dst_tiling != I915_TILING_NONE) + CMD |= XY_DST_TILED; + + if (src_tiling != I915_TILING_NONE) + CMD |= XY_SRC_TILED; + } + return CMD; +} + /* Copy BitBlt */ bool @@ -322,10 +511,12 @@ intelEmitCopyBlit(struct brw_context *brw, drm_intel_bo *src_buffer, GLuint src_offset, uint32_t src_tiling, + uint32_t src_tr_mode, GLshort dst_pitch, drm_intel_bo *dst_buffer, GLuint dst_offset, uint32_t dst_tiling, + uint32_t dst_tr_mode, GLshort src_x, GLshort src_y, GLshort dst_x, GLshort dst_y, GLshort w, GLshort h, @@ -337,18 +528,11 @@ intelEmitCopyBlit(struct brw_context *brw, drm_intel_bo *aper_array[3]; bool dst_y_tiled = dst_tiling == I915_TILING_Y; bool src_y_tiled = src_tiling == I915_TILING_Y; - - if (!alignment_valid(brw, dst_offset, dst_tiling)) - return false; - if (!alignment_valid(brw, src_offset, src_tiling)) - return false; + bool use_fast_copy_blit = false; if ((dst_y_tiled || src_y_tiled) && brw->gen < 6) return false; - assert(!dst_y_tiled || (dst_pitch % 128) == 0); - assert(!src_y_tiled || (src_pitch % 128) == 0); - /* do space check before going any further */ do { aper_array[0] = brw->batch.bo; @@ -373,52 +557,98 @@ intelEmitCopyBlit(struct brw_context *brw, src_buffer, src_pitch, src_offset, src_x, src_y, dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h); - /* Blit pitch must be dword-aligned. Otherwise, the hardware appears to drop - * the low bits. Offsets must be naturally aligned. 
- */ - if (src_pitch % 4 != 0 || src_offset % cpp != 0 || - dst_pitch % 4 != 0 || dst_offset % cpp != 0) - return false; + use_fast_copy_blit = can_fast_copy_blit(brw, + src_buffer, + src_x, src_y, + src_offset, src_pitch, + src_tiling, src_tr_mode, + dst_buffer, + dst_x, dst_y, + dst_offset, dst_pitch, + dst_tiling, dst_tr_mode, + w, h, cpp); + assert(use_fast_copy_blit || + (src_tr_mode == INTEL_MIPTREE_TRMODE_NONE && + dst_tr_mode == INTEL_MIPTREE_TRMODE_NONE)); + + if (use_fast_copy_blit) { + /* When two sequential fast copy blits have different source surfaces, + * but their destinations refer to the same destination surfaces and + * therefore destinations overlap it is imperative that a flush be + * inserted between the two blits. + * + * FIXME: Figure out a way to avoid flushing when not required. + */ + brw_emit_mi_flush(brw); + + assert(cpp <= 16); + BR13 = br13_for_cpp(cpp); + + if (src_tr_mode == INTEL_MIPTREE_TRMODE_YF) + BR13 |= XY_FAST_SRC_TRMODE_YF; + + if (dst_tr_mode == INTEL_MIPTREE_TRMODE_YF) + BR13 |= XY_FAST_DST_TRMODE_YF; + + CMD = xy_blit_cmd(src_tiling, src_tr_mode, + dst_tiling, dst_tr_mode, + cpp, use_fast_copy_blit); + + /* For tiled source and destination, pitch value should be specified + * as a number of Dwords. + */ + if (dst_tiling != I915_TILING_NONE) + dst_pitch /= 4; + + if (src_tiling != I915_TILING_NONE) + src_pitch /= 4; - /* For big formats (such as floating point), do the copy using 16 or 32bpp - * and multiply the coordinates. - */ - if (cpp > 4) { - if (cpp % 4 == 2) { - dst_x *= cpp / 2; - dst_x2 *= cpp / 2; - src_x *= cpp / 2; - cpp = 2; - } else { - assert(cpp % 4 == 0); - dst_x *= cpp / 4; - dst_x2 *= cpp / 4; - src_x *= cpp / 4; - cpp = 4; + } else { + assert(!dst_y_tiled || (dst_pitch % 128) == 0); + assert(!src_y_tiled || (src_pitch % 128) == 0); + + /* For big formats (such as floating point), do the copy using 16 or + * 32bpp and multiply the coordinates. + */ + if (cpp > 4) { + if (cpp % 4 == 2) { + dst_x *= cpp / 2; + dst_x2 *= cpp / 2; + src_x *= cpp / 2; + cpp = 2; + } else { + assert(cpp % 4 == 0); + dst_x *= cpp / 4; + dst_x2 *= cpp / 4; + src_x *= cpp / 4; + cpp = 4; + } } - } - BR13 = br13_for_cpp(cpp) | translate_raster_op(logic_op) << 16; + if (!alignment_valid(brw, dst_offset, dst_tiling)) + return false; + if (!alignment_valid(brw, src_offset, src_tiling)) + return false; - switch (cpp) { - case 1: - case 2: - CMD = XY_SRC_COPY_BLT_CMD; - break; - case 4: - CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB; - break; - default: - return false; - } + /* Blit pitch must be dword-aligned. Otherwise, the hardware appears to drop + * the low bits. Offsets must be naturally aligned. 
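+ *
+ * Illustration (editorial, not from the patch): a 16bpp surface that is
+ * 31 pixels wide has a 62-byte pitch; 62 % 4 != 0, so the hardware would
+ * drop the low bits and we return false instead of emitting a corrupted
+ * blit.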
+ */ + if (src_pitch % 4 != 0 || src_offset % cpp != 0 || + dst_pitch % 4 != 0 || dst_offset % cpp != 0) + return false; - if (dst_tiling != I915_TILING_NONE) { - CMD |= XY_DST_TILED; - dst_pitch /= 4; - } - if (src_tiling != I915_TILING_NONE) { - CMD |= XY_SRC_TILED; - src_pitch /= 4; + assert(cpp <= 4); + BR13 = br13_for_cpp(cpp) | translate_raster_op(logic_op) << 16; + + CMD = xy_blit_cmd(src_tiling, src_tr_mode, + dst_tiling, dst_tr_mode, + cpp, use_fast_copy_blit); + + if (dst_tiling != I915_TILING_NONE) + dst_pitch /= 4; + + if (src_tiling != I915_TILING_NONE) + src_pitch /= 4; } if (dst_y2 <= dst_y || dst_x2 <= dst_x) { @@ -460,7 +690,7 @@ intelEmitCopyBlit(struct brw_context *brw, ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled); - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); return true; } @@ -544,7 +774,7 @@ intelEmitImmediateColorExpandBlit(struct brw_context *brw, intel_batchbuffer_data(brw, src_bits, dwords * 4, BLT_RING); - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); return true; } @@ -576,7 +806,9 @@ intel_emit_linear_blit(struct brw_context *brw, dst_x = dst_offset % 64; ok = intelEmitCopyBlit(brw, 1, pitch, src_bo, src_offset - src_x, I915_TILING_NONE, + INTEL_MIPTREE_TRMODE_NONE, pitch, dst_bo, dst_offset - dst_x, I915_TILING_NONE, + INTEL_MIPTREE_TRMODE_NONE, src_x, 0, /* src x/y */ dst_x, 0, /* dst x/y */ pitch, height, /* w, h */ @@ -595,7 +827,9 @@ intel_emit_linear_blit(struct brw_context *brw, if (size != 0) { ok = intelEmitCopyBlit(brw, 1, pitch, src_bo, src_offset - src_x, I915_TILING_NONE, + INTEL_MIPTREE_TRMODE_NONE, pitch, dst_bo, dst_offset - dst_x, I915_TILING_NONE, + INTEL_MIPTREE_TRMODE_NONE, src_x, 0, /* src x/y */ dst_x, 0, /* dst x/y */ size, 1, /* w, h */ @@ -667,5 +901,5 @@ intel_miptree_set_alpha_to_one(struct brw_context *brw, OUT_BATCH(0xffffffff); /* white, but only alpha gets written */ ADVANCE_BATCH_TILED(dst_y_tiled, false); - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); } diff --git a/src/mesa/drivers/dri/i965/intel_blit.h b/src/mesa/drivers/dri/i965/intel_blit.h index 2287c379c4e..c3d19a5a20e 100644 --- a/src/mesa/drivers/dri/i965/intel_blit.h +++ b/src/mesa/drivers/dri/i965/intel_blit.h @@ -32,19 +32,21 @@ bool intelEmitCopyBlit(struct brw_context *brw, - GLuint cpp, - GLshort src_pitch, - drm_intel_bo *src_buffer, - GLuint src_offset, - uint32_t src_tiling, - GLshort dst_pitch, - drm_intel_bo *dst_buffer, - GLuint dst_offset, - uint32_t dst_tiling, - GLshort srcx, GLshort srcy, - GLshort dstx, GLshort dsty, - GLshort w, GLshort h, - GLenum logicop ); + GLuint cpp, + GLshort src_pitch, + drm_intel_bo *src_buffer, + GLuint src_offset, + uint32_t src_tiling, + uint32_t src_tr_mode, + GLshort dst_pitch, + drm_intel_bo *dst_buffer, + GLuint dst_offset, + uint32_t dst_tiling, + uint32_t dst_tr_mode, + GLshort srcx, GLshort srcy, + GLshort dstx, GLshort dsty, + GLshort w, GLshort h, + GLenum logicop); bool intel_miptree_blit_compatible_formats(mesa_format src, mesa_format dst); diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.c b/src/mesa/drivers/dri/i965/intel_buffer_objects.c index 627c487f0e7..ff05b5cd0e7 100644 --- a/src/mesa/drivers/dri/i965/intel_buffer_objects.c +++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.c @@ -560,7 +560,7 @@ brw_unmap_buffer(struct gl_context *ctx, * flush. Once again, we wish for a domain tracker in libdrm to cover * usage inside of a batchbuffer. 
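 *
 * Concretely (editorial note, not in the patch): the blit that copied the
 * temporary range_map_bo back into the buffer leaves its writes in the
 * blitter/render caches, while a later draw in the same batch may read the
 * buffer through the sampler or vertex caches, so we flush here instead of
 * tracking domains per batch.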
*/ - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); drm_intel_bo_unreference(intel_obj->range_map_bo[index]); intel_obj->range_map_bo[index] = NULL; @@ -632,7 +632,7 @@ brw_copy_buffer_subdata(struct gl_context *ctx, * flush. Once again, we wish for a domain tracker in libdrm to cover * usage inside of a batchbuffer. */ - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); } void diff --git a/src/mesa/drivers/dri/i965/intel_copy_image.c b/src/mesa/drivers/dri/i965/intel_copy_image.c index f4c7eff2904..3706704bf1a 100644 --- a/src/mesa/drivers/dri/i965/intel_copy_image.c +++ b/src/mesa/drivers/dri/i965/intel_copy_image.c @@ -126,9 +126,11 @@ copy_image_with_blitter(struct brw_context *brw, src_mt->pitch, src_mt->bo, src_mt->offset, src_mt->tiling, + src_mt->tr_mode, dst_mt->pitch, dst_mt->bo, dst_mt->offset, dst_mt->tiling, + dst_mt->tr_mode, src_x, src_y, dst_x, dst_y, src_width, src_height, diff --git a/src/mesa/drivers/dri/i965/intel_debug.c b/src/mesa/drivers/dri/i965/intel_debug.c index 75cf7854eff..58f41bfd55d 100644 --- a/src/mesa/drivers/dri/i965/intel_debug.c +++ b/src/mesa/drivers/dri/i965/intel_debug.c @@ -79,11 +79,13 @@ intel_debug_flag_for_shader_stage(gl_shader_stage stage) { uint64_t flags[] = { [MESA_SHADER_VERTEX] = DEBUG_VS, + [MESA_SHADER_TESS_CTRL] = 0, + [MESA_SHADER_TESS_EVAL] = 0, [MESA_SHADER_GEOMETRY] = DEBUG_GS, [MESA_SHADER_FRAGMENT] = DEBUG_WM, [MESA_SHADER_COMPUTE] = DEBUG_CS, }; - STATIC_ASSERT(MESA_SHADER_STAGES == 4); + STATIC_ASSERT(MESA_SHADER_STAGES == 6); return flags[stage]; } diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index c99677c7197..3bc28a12026 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -64,10 +64,10 @@ can_do_pipelined_register_writes(struct brw_context *brw) /* Set a value in a BO to a known quantity. The workaround BO already * exists and doesn't contain anything important, so we may as well use it. */ - drm_intel_bo_map(brw->batch.workaround_bo, true); - data = brw->batch.workaround_bo->virtual; + drm_intel_bo_map(brw->workaround_bo, true); + data = brw->workaround_bo->virtual; data[offset] = 0xffffffff; - drm_intel_bo_unmap(brw->batch.workaround_bo); + drm_intel_bo_unmap(brw->workaround_bo); /* Write the register. */ BEGIN_BATCH(3); @@ -76,13 +76,13 @@ can_do_pipelined_register_writes(struct brw_context *brw) OUT_BATCH(expected_value); ADVANCE_BATCH(); - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); /* Save the register's value back to the buffer. */ BEGIN_BATCH(3); OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2)); OUT_BATCH(reg); - OUT_RELOC(brw->batch.workaround_bo, + OUT_RELOC(brw->workaround_bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, offset * sizeof(uint32_t)); ADVANCE_BATCH(); @@ -90,10 +90,10 @@ can_do_pipelined_register_writes(struct brw_context *brw) intel_batchbuffer_flush(brw); /* Check whether the value got written. */ - drm_intel_bo_map(brw->batch.workaround_bo, false); - data = brw->batch.workaround_bo->virtual; + drm_intel_bo_map(brw->workaround_bo, false); + data = brw->workaround_bo->virtual; bool success = data[offset] == expected_value; - drm_intel_bo_unmap(brw->batch.workaround_bo); + drm_intel_bo_unmap(brw->workaround_bo); result = success; @@ -120,10 +120,10 @@ can_write_oacontrol(struct brw_context *brw) /* Set a value in a BO to a known quantity. 
The workaround BO already * exists and doesn't contain anything important, so we may as well use it. */ - drm_intel_bo_map(brw->batch.workaround_bo, true); - data = brw->batch.workaround_bo->virtual; + drm_intel_bo_map(brw->workaround_bo, true); + data = brw->workaround_bo->virtual; data[offset] = 0xffffffff; - drm_intel_bo_unmap(brw->batch.workaround_bo); + drm_intel_bo_unmap(brw->workaround_bo); /* Write OACONTROL. */ BEGIN_BATCH(3); @@ -132,18 +132,18 @@ can_write_oacontrol(struct brw_context *brw) OUT_BATCH(expected_value); ADVANCE_BATCH(); - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); /* Save the register's value back to the buffer. */ BEGIN_BATCH(3); OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2)); OUT_BATCH(OACONTROL); - OUT_RELOC(brw->batch.workaround_bo, + OUT_RELOC(brw->workaround_bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, offset * sizeof(uint32_t)); ADVANCE_BATCH(); - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); /* Set OACONTROL back to zero (everything off). */ BEGIN_BATCH(3); @@ -155,10 +155,10 @@ can_write_oacontrol(struct brw_context *brw) intel_batchbuffer_flush(brw); /* Check whether the value got written. */ - drm_intel_bo_map(brw->batch.workaround_bo, false); - data = brw->batch.workaround_bo->virtual; + drm_intel_bo_map(brw->workaround_bo, false); + data = brw->workaround_bo->virtual; bool success = data[offset] == expected_value; - drm_intel_bo_unmap(brw->batch.workaround_bo); + drm_intel_bo_unmap(brw->workaround_bo); result = success; @@ -284,8 +284,6 @@ intelInitExtensions(struct gl_context *ctx) } if (brw->gen >= 6) { - uint64_t dummy; - ctx->Extensions.ARB_blend_func_extended = brw->optionCache.info == NULL || !driQueryOptionb(&brw->optionCache, "disable_blend_func_extended"); @@ -311,13 +309,14 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.OES_depth_texture_cube_map = true; /* Test if the kernel has the ioctl. */ - if (brw->bufmgr && drm_intel_reg_read(brw->bufmgr, TIMESTAMP, &dummy) == 0) + if (brw->intelScreen->hw_has_timestamp) ctx->Extensions.ARB_timer_query = true; /* Only enable this in core profile because other parts of Mesa behave * slightly differently when the extension is enabled. 
*/ if (ctx->API == API_OPENGL_CORE) { + ctx->Extensions.ARB_shader_subroutine = true; ctx->Extensions.ARB_viewport_array = true; ctx->Extensions.AMD_vertex_shader_viewport_index = true; } @@ -331,6 +330,7 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_framebuffer_no_attachments = true; ctx->Extensions.ARB_gpu_shader5 = true; ctx->Extensions.ARB_shader_atomic_counters = true; + ctx->Extensions.ARB_shader_image_load_store = true; ctx->Extensions.ARB_texture_compression_bptc = true; ctx->Extensions.ARB_texture_view = true; @@ -351,6 +351,7 @@ intelInitExtensions(struct gl_context *ctx) if (ctx->API == API_OPENGL_CORE) { ctx->Extensions.ARB_viewport_array = true; ctx->Extensions.AMD_vertex_shader_viewport_index = true; + ctx->Extensions.ARB_shader_subroutine = true; } } diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c index 1b3a72f3ec2..72648b01e33 100644 --- a/src/mesa/drivers/dri/i965/intel_fbo.c +++ b/src/mesa/drivers/dri/i965/intel_fbo.c @@ -310,7 +310,7 @@ intel_alloc_private_renderbuffer_storage(struct gl_context * ctx, struct gl_rend intel_miptree_release(&irb->mt); DBG("%s: %s: %s (%dx%d)\n", __func__, - _mesa_lookup_enum_by_nr(internalFormat), + _mesa_enum_to_string(internalFormat), _mesa_get_format_name(rb->Format), width, height); if (width == 0 || height == 0) @@ -551,10 +551,12 @@ intel_renderbuffer_update_wrapper(struct brw_context *brw, irb->mt_layer = layer_multiplier * layer; - if (layered) { - irb->layer_count = image->TexObject->NumLayers ?: mt->level[level].depth / layer_multiplier; - } else { + if (!layered) { irb->layer_count = 1; + } else if (image->TexObject->NumLayers > 0) { + irb->layer_count = image->TexObject->NumLayers; + } else { + irb->layer_count = mt->level[level].depth / layer_multiplier; } intel_miptree_reference(&irb->mt, mt); @@ -1020,6 +1022,9 @@ intel_renderbuffer_move_to_temp(struct brw_context *brw, struct intel_mipmap_tree *new_mt; int width, height, depth; + uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD | + MIPTREE_LAYOUT_TILING_ANY; + intel_miptree_get_dimensions_for_image(rb->TexImage, &width, &height, &depth); new_mt = intel_miptree_create(brw, rb->TexImage->TexObject->Target, @@ -1028,8 +1033,7 @@ intel_renderbuffer_move_to_temp(struct brw_context *brw, intel_image->base.Base.Level, width, height, depth, irb->mt->num_samples, - INTEL_MIPTREE_TILING_ANY, - MIPTREE_LAYOUT_ACCELERATED_UPLOAD); + layout_flags); if (intel_miptree_wants_hiz_buffer(brw, new_mt)) { intel_miptree_alloc_hiz(brw, new_mt); @@ -1076,7 +1080,7 @@ brw_render_cache_set_check_flush(struct brw_context *brw, drm_intel_bo *bo) if (!_mesa_set_search(brw->render_cache, bo)) return; - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); } /** diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c index 6aa969a4930..e85c3f00c7b 100644 --- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c +++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c @@ -272,7 +272,6 @@ intel_miptree_create_layout(struct brw_context *brw, GLuint height0, GLuint depth0, GLuint num_samples, - enum intel_miptree_tiling_mode requested, uint32_t layout_flags) { struct intel_mipmap_tree *mt = calloc(sizeof(*mt), 1); @@ -280,7 +279,7 @@ intel_miptree_create_layout(struct brw_context *brw, return NULL; DBG("%s target %s format %s level %d..%d slices %d <-- %p\n", __func__, - _mesa_lookup_enum_by_nr(target), + _mesa_enum_to_string(target), _mesa_get_format_name(format), first_level, 
last_level, depth0, mt); @@ -454,8 +453,10 @@ intel_miptree_create_layout(struct brw_context *brw, (brw->has_separate_stencil && intel_miptree_wants_hiz_buffer(brw, mt)))) { uint32_t stencil_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD; - if (brw->gen == 6) - stencil_flags |= MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD; + if (brw->gen == 6) { + stencil_flags |= MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD | + MIPTREE_LAYOUT_TILING_ANY; + } mt->stencil_mt = intel_miptree_create(brw, mt->target, @@ -466,7 +467,6 @@ intel_miptree_create_layout(struct brw_context *brw, mt->logical_height0, mt->logical_depth0, num_samples, - INTEL_MIPTREE_TILING_ANY, stencil_flags); if (!mt->stencil_mt) { @@ -510,7 +510,7 @@ intel_miptree_create_layout(struct brw_context *brw, assert((layout_flags & MIPTREE_LAYOUT_FORCE_HALIGN16) == 0); } - brw_miptree_layout(brw, mt, requested, layout_flags); + brw_miptree_layout(brw, mt, layout_flags); if (mt->disable_aux_buffers) assert(mt->msaa_layout != INTEL_MSAA_LAYOUT_CMS); @@ -558,6 +558,53 @@ intel_lower_compressed_format(struct brw_context *brw, mesa_format format) } } +/* This function computes Yf/Ys tiled bo size, alignment and pitch. */ +static unsigned long +intel_get_yf_ys_bo_size(struct intel_mipmap_tree *mt, unsigned *alignment, + unsigned long *pitch) +{ + const uint32_t bpp = mt->cpp * 8; + const uint32_t aspect_ratio = (bpp == 16 || bpp == 64) ? 2 : 1; + uint32_t tile_width, tile_height; + unsigned long stride, size, aligned_y; + + assert(mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE); + + switch (bpp) { + case 8: + tile_height = 64; + break; + case 16: + case 32: + tile_height = 32; + break; + case 64: + case 128: + tile_height = 16; + break; + default: + unreachable("not reached"); + } + + if (mt->tr_mode == INTEL_MIPTREE_TRMODE_YS) + tile_height *= 4; + + aligned_y = ALIGN(mt->total_height, tile_height); + stride = mt->total_width * mt->cpp; + tile_width = tile_height * mt->cpp * aspect_ratio; + stride = ALIGN(stride, tile_width); + size = stride * aligned_y; + + if (mt->tr_mode == INTEL_MIPTREE_TRMODE_YF) { + assert(size % 4096 == 0); + *alignment = 4096; + } else { + assert(size % (64 * 1024) == 0); + *alignment = 64 * 1024; + } + *pitch = stride; + return size; +} struct intel_mipmap_tree * intel_miptree_create(struct brw_context *brw, @@ -569,7 +616,6 @@ intel_miptree_create(struct brw_context *brw, GLuint height0, GLuint depth0, GLuint num_samples, - enum intel_miptree_tiling_mode requested_tiling, uint32_t layout_flags) { struct intel_mipmap_tree *mt; @@ -587,7 +633,7 @@ intel_miptree_create(struct brw_context *brw, mt = intel_miptree_create_layout(brw, target, format, first_level, last_level, width0, height0, depth0, num_samples, - requested_tiling, layout_flags); + layout_flags); /* * pitch == 0 || height == 0 indicates the null texture */ @@ -616,10 +662,22 @@ intel_miptree_create(struct brw_context *brw, alloc_flags |= BO_ALLOC_FOR_RENDER; unsigned long pitch; - mt->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "miptree", total_width, - total_height, mt->cpp, &mt->tiling, - &pitch, alloc_flags); mt->etc_format = etc_format; + + if (mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE) { + unsigned alignment = 0; + unsigned long size; + size = intel_get_yf_ys_bo_size(mt, &alignment, &pitch); + assert(size); + mt->bo = drm_intel_bo_alloc_for_render(brw->bufmgr, "miptree", + size, alignment); + } else { + mt->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "miptree", + total_width, total_height, mt->cpp, + &mt->tiling, &pitch, + alloc_flags); + } + mt->pitch = pitch; /* If the BO is too 
large to fit in the aperture, we need to use the @@ -698,17 +756,16 @@ intel_miptree_create_for_bo(struct brw_context *brw, target = depth > 1 ? GL_TEXTURE_2D_ARRAY : GL_TEXTURE_2D; - /* 'requested' parameter of intel_miptree_create_layout() is relevant - * only for non bo miptree. Tiling for bo is already computed above. - * So, the tiling requested (INTEL_MIPTREE_TILING_ANY) below is - * just a place holder and will not make any change to the miptree - * tiling format. + /* The BO already has a tiling format and we shouldn't confuse the lower + * layers by making it try to find a tiling format again. */ + assert((layout_flags & MIPTREE_LAYOUT_TILING_ANY) == 0); + assert((layout_flags & MIPTREE_LAYOUT_TILING_NONE) == 0); + layout_flags |= MIPTREE_LAYOUT_FOR_BO; mt = intel_miptree_create_layout(brw, target, format, 0, 0, width, height, depth, 0, - INTEL_MIPTREE_TILING_ANY, layout_flags); if (!mt) return NULL; @@ -816,11 +873,13 @@ intel_miptree_create_for_renderbuffer(struct brw_context *brw, uint32_t depth = 1; bool ok; GLenum target = num_samples > 1 ? GL_TEXTURE_2D_MULTISAMPLE : GL_TEXTURE_2D; + const uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD | + MIPTREE_LAYOUT_TILING_ANY; + mt = intel_miptree_create(brw, target, format, 0, 0, width, height, depth, num_samples, - INTEL_MIPTREE_TILING_ANY, - MIPTREE_LAYOUT_ACCELERATED_UPLOAD); + layout_flags); if (!mt) goto fail; @@ -1325,6 +1384,8 @@ intel_miptree_alloc_mcs(struct brw_context *brw, * * "The MCS surface must be stored as Tile Y." */ + const uint32_t mcs_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD | + MIPTREE_LAYOUT_TILING_Y; mt->mcs_mt = intel_miptree_create(brw, mt->target, format, @@ -1334,8 +1395,7 @@ intel_miptree_alloc_mcs(struct brw_context *brw, mt->logical_height0, mt->logical_depth0, 0 /* num_samples */, - INTEL_MIPTREE_TILING_Y, - MIPTREE_LAYOUT_ACCELERATED_UPLOAD); + mcs_flags); /* From the Ivy Bridge PRM, Vol 2 Part 1 p326: * @@ -1383,9 +1443,11 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw, unsigned mcs_height = ALIGN(mt->logical_height0, height_divisor) / height_divisor; assert(mt->logical_depth0 == 1); - uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD; - if (brw->gen >= 8) + uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD | + MIPTREE_LAYOUT_TILING_Y; + if (brw->gen >= 8) { layout_flags |= MIPTREE_LAYOUT_FORCE_HALIGN16; + } mt->mcs_mt = intel_miptree_create(brw, mt->target, format, @@ -1395,7 +1457,6 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw, mcs_height, mt->logical_depth0, 0 /* num_samples */, - INTEL_MIPTREE_TILING_Y, layout_flags); return mt->mcs_mt; @@ -1456,21 +1517,23 @@ intel_gen7_hiz_buf_create(struct brw_context *brw, /* Gen7 PRM Volume 2, Part 1, 11.5.3 "Hierarchical Depth Buffer" documents * adjustments required for Z_Height and Z_Width based on multisampling. 
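 *
 * Worked example (editorial, not part of the patch): a 1024x768 depth
 * miptree with 4 samples is sized as if it were 2048x1536 before the HiZ
 * dimensions are derived on Gen7/8; with the change below, Gen9 skips
 * this doubling entirely.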
*/ - switch (mt->num_samples) { - case 0: - case 1: - break; - case 2: - case 4: - z_width *= 2; - z_height *= 2; - break; - case 8: - z_width *= 4; - z_height *= 2; - break; - default: - unreachable("unsupported sample count"); + if (brw->gen < 9) { + switch (mt->num_samples) { + case 0: + case 1: + break; + case 2: + case 4: + z_width *= 2; + z_height *= 2; + break; + case 8: + z_width *= 4; + z_height *= 2; + break; + default: + unreachable("unsupported sample count"); + } } const unsigned vertical_align = 8; /* 'j' in the docs */ @@ -1646,6 +1709,7 @@ intel_hiz_miptree_buf_create(struct brw_context *brw, if (!buf) return NULL; + layout_flags |= MIPTREE_LAYOUT_TILING_ANY; buf->mt = intel_miptree_create(brw, mt->target, mt->format, @@ -1655,7 +1719,6 @@ intel_hiz_miptree_buf_create(struct brw_context *brw, mt->logical_height0, mt->logical_depth0, mt->num_samples, - INTEL_MIPTREE_TILING_ANY, layout_flags); if (!buf->mt) { free(buf); @@ -2086,7 +2149,7 @@ intel_miptree_map_blit(struct brw_context *brw, map->mt = intel_miptree_create(brw, GL_TEXTURE_2D, mt->format, 0, 0, map->w, map->h, 1, - 0, INTEL_MIPTREE_TILING_NONE, 0); + 0, MIPTREE_LAYOUT_TILING_NONE); if (!map->mt) { fprintf(stderr, "Failed to allocate blit temporary\n"); diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h index bde6daa4e2d..790d3129207 100644 --- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h +++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h @@ -516,12 +516,6 @@ struct intel_mipmap_tree GLuint refcount; }; -enum intel_miptree_tiling_mode { - INTEL_MIPTREE_TILING_ANY, - INTEL_MIPTREE_TILING_Y, - INTEL_MIPTREE_TILING_NONE, -}; - void intel_get_non_msrt_mcs_alignment(struct brw_context *brw, struct intel_mipmap_tree *mt, @@ -541,6 +535,11 @@ enum { MIPTREE_LAYOUT_FOR_BO = 1 << 2, MIPTREE_LAYOUT_DISABLE_AUX = 1 << 3, MIPTREE_LAYOUT_FORCE_HALIGN16 = 1 << 4, + + MIPTREE_LAYOUT_TILING_Y = 1 << 5, + MIPTREE_LAYOUT_TILING_NONE = 1 << 6, + MIPTREE_LAYOUT_TILING_ANY = MIPTREE_LAYOUT_TILING_Y | + MIPTREE_LAYOUT_TILING_NONE, }; struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw, @@ -552,7 +551,6 @@ struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw, GLuint height0, GLuint depth0, GLuint num_samples, - enum intel_miptree_tiling_mode, uint32_t flags); struct intel_mipmap_tree * @@ -771,7 +769,6 @@ brw_miptree_get_vertical_slice_pitch(const struct brw_context *brw, void brw_miptree_layout(struct brw_context *brw, struct intel_mipmap_tree *mt, - enum intel_miptree_tiling_mode requested, uint32_t layout_flags); void *intel_miptree_map_raw(struct brw_context *brw, diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c index 30380570d62..3fe506e3cf1 100644 --- a/src/mesa/drivers/dri/i965/intel_pixel_read.c +++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c @@ -247,7 +247,7 @@ intelReadPixels(struct gl_context * ctx, * rendered to via a PBO at any point, so it seems better to just * flush here unconditionally. */ - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); return; } diff --git a/src/mesa/drivers/dri/i965/intel_reg.h b/src/mesa/drivers/dri/i965/intel_reg.h index bd14e189da3..b4283da9633 100644 --- a/src/mesa/drivers/dri/i965/intel_reg.h +++ b/src/mesa/drivers/dri/i965/intel_reg.h @@ -47,6 +47,9 @@ /* Load a value from memory into a register. Only available on Gen7+. 
*/ #define GEN7_MI_LOAD_REGISTER_MEM (CMD_MI | (0x29 << 23)) # define MI_LOAD_REGISTER_MEM_USE_GGTT (1 << 22) +/* Haswell RS control */ +#define MI_RS_CONTROL (CMD_MI | (0x6 << 23)) +#define MI_RS_STORE_DATA_IMM (CMD_MI | (0x2b << 23)) /* Manipulate the predicate bit based on some register values. Only on Gen7+ */ #define GEN7_MI_PREDICATE (CMD_MI | (0xC << 23)) @@ -102,6 +105,8 @@ #define XY_SRC_COPY_BLT_CMD (CMD_2D | (0x53 << 22)) +#define XY_FAST_COPY_BLT_CMD (CMD_2D | (0x42 << 22)) + #define XY_TEXT_IMMEDIATE_BLIT_CMD (CMD_2D | (0x31 << 22)) # define XY_TEXT_BYTE_PACKED (1 << 16) @@ -111,10 +116,24 @@ #define XY_SRC_TILED (1 << 15) #define XY_DST_TILED (1 << 11) +/* BR00 */ +#define XY_FAST_SRC_TILED_64K (3 << 20) +#define XY_FAST_SRC_TILED_Y (2 << 20) +#define XY_FAST_SRC_TILED_X (1 << 20) + +#define XY_FAST_DST_TILED_64K (3 << 13) +#define XY_FAST_DST_TILED_Y (2 << 13) +#define XY_FAST_DST_TILED_X (1 << 13) + /* BR13 */ #define BR13_8 (0x0 << 24) #define BR13_565 (0x1 << 24) #define BR13_8888 (0x3 << 24) +#define BR13_16161616 (0x4 << 24) +#define BR13_32323232 (0x5 << 24) + +#define XY_FAST_SRC_TRMODE_YF (1 << 31) +#define XY_FAST_DST_TRMODE_YF (1 << 30) /* Pipeline Statistics Counter Registers */ #define IA_VERTICES_COUNT 0x2310 diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c index de14696bd76..a164c6985dc 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.c +++ b/src/mesa/drivers/dri/i965/intel_screen.c @@ -229,6 +229,12 @@ static struct intel_image_format intel_image_formats[] = { { __DRI_IMAGE_FOURCC_RGB565, __DRI_IMAGE_COMPONENTS_RGB, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_RGB565, 2 } } }, + { __DRI_IMAGE_FOURCC_R8, __DRI_IMAGE_COMPONENTS_R, 1, + { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, } }, + + { __DRI_IMAGE_FOURCC_GR88, __DRI_IMAGE_COMPONENTS_RG, 1, + { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR88, 2 }, } }, + { __DRI_IMAGE_FOURCC_YUV410, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 2, 2, __DRI_IMAGE_FORMAT_R8, 1 }, @@ -1123,6 +1129,50 @@ intel_detect_swizzling(struct intel_screen *screen) return true; } +static int +intel_detect_timestamp(struct intel_screen *screen) +{ + uint64_t dummy = 0, last = 0; + int upper, lower, loops; + + /* On 64bit systems, some old kernels trigger a hw bug resulting in the + * TIMESTAMP register being shifted and the low 32bits always zero. + * + * More recent kernels offer an interface to read the full 36bits + * everywhere. + */ + if (drm_intel_reg_read(screen->bufmgr, TIMESTAMP | 1, &dummy) == 0) + return 3; + + /* Determine if we have a 32bit or 64bit kernel by inspecting the + * upper 32bits for a rapidly changing timestamp. + */ + if (drm_intel_reg_read(screen->bufmgr, TIMESTAMP, &last)) + return 0; + + upper = lower = 0; + for (loops = 0; loops < 10; loops++) { + /* The TIMESTAMP should change every 80ns, so several round trips + * through the kernel should be enough to advance it. + */ + if (drm_intel_reg_read(screen->bufmgr, TIMESTAMP, &dummy)) + return 0; + + upper += (dummy >> 32) != (last >> 32); + if (upper > 1) /* beware 32bit counter overflow */ + return 2; /* upper dword holds the low 32bits of the timestamp */ + + lower += (dummy & 0xffffffff) != (last & 0xffffffff); + if (lower > 1) + return 1; /* timestamp is unshifted */ + + last = dummy; + } + + /* No advancement? No timestamp! */ + return 0; +} + /** * Return array of MSAA modes supported by the hardware. The array is * zero-terminated and sorted in decreasing order. 
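/* Editorial aside -- an illustrative sketch, not part of this commit: one way
 * a caller could consume the code stored in screen->hw_has_timestamp by
 * intel_detect_timestamp() above. The helper name and locals are invented;
 * only drm_intel_reg_read() and TIMESTAMP are taken from the code above.
 */
static uint64_t
read_gpu_timestamp(struct intel_screen *screen)
{
   uint64_t raw = 0;

   switch (screen->hw_has_timestamp) {
   case 3: /* kernel exposes the full 36-bit register */
      drm_intel_reg_read(screen->bufmgr, TIMESTAMP | 1, &raw);
      return raw;
   case 2: /* 64-bit kernel hit by the hw bug: the low 32 bits of the
            * counter land in the upper dword */
      drm_intel_reg_read(screen->bufmgr, TIMESTAMP, &raw);
      return raw >> 32;
   case 1: /* unshifted: the low dword is the counter */
      drm_intel_reg_read(screen->bufmgr, TIMESTAMP, &raw);
      return raw & 0xffffffff;
   default: /* 0: no usable TIMESTAMP register */
      return 0;
   }
}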
@@ -1309,11 +1359,6 @@ set_max_gl_versions(struct intel_screen *screen) } } -/* drop when libdrm 2.4.61 is released */ -#ifndef I915_PARAM_REVISION -#define I915_PARAM_REVISION 32 -#endif - static int brw_get_revision(int fd) { @@ -1332,6 +1377,11 @@ brw_get_revision(int fd) return revision; } +/* Drop when RS headers get pulled to libdrm */ +#ifndef I915_PARAM_HAS_RESOURCE_STREAMER +#define I915_PARAM_HAS_RESOURCE_STREAMER 36 +#endif + /** * This is the driver specific part of the createNewScreen entry point. * Called when using DRI2. @@ -1378,6 +1428,7 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp) intelScreen->hw_must_use_separate_stencil = intelScreen->devinfo->gen >= 7; intelScreen->hw_has_swizzling = intel_detect_swizzling(intelScreen); + intelScreen->hw_has_timestamp = intel_detect_timestamp(intelScreen); const char *force_msaa = getenv("INTEL_FORCE_MSAA"); if (force_msaa) { @@ -1423,6 +1474,15 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp) intelScreen->compiler = brw_compiler_create(intelScreen, intelScreen->devinfo); + if (intelScreen->devinfo->has_resource_streamer) { + int val = -1; + getparam.param = I915_PARAM_HAS_RESOURCE_STREAMER; + getparam.value = &val; + + drmIoctl(psp->fd, DRM_IOCTL_I915_GETPARAM, &getparam); + intelScreen->has_resource_streamer = val > 0; + } + return (const __DRIconfig**) intel_screen_make_configs(psp); } diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h index 742b3d30eee..fd5143eecba 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.h +++ b/src/mesa/drivers/dri/i965/intel_screen.h @@ -52,6 +52,13 @@ struct intel_screen bool hw_has_swizzling; + int hw_has_timestamp; + + /** + * Does the kernel support resource streamer? + */ + bool has_resource_streamer; + /** * Does the kernel support context reset notifications? 
*/ diff --git a/src/mesa/drivers/dri/i965/intel_syncobj.c b/src/mesa/drivers/dri/i965/intel_syncobj.c index 3cfa7e593ab..c44c4beceef 100644 --- a/src/mesa/drivers/dri/i965/intel_syncobj.c +++ b/src/mesa/drivers/dri/i965/intel_syncobj.c @@ -69,7 +69,7 @@ brw_fence_insert(struct brw_context *brw, struct brw_fence *fence) assert(!fence->batch_bo); assert(!fence->signalled); - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); fence->batch_bo = brw->batch.bo; drm_intel_bo_reference(fence->batch_bo); intel_batchbuffer_flush(brw); diff --git a/src/mesa/drivers/dri/i965/intel_tex.c b/src/mesa/drivers/dri/i965/intel_tex.c index b0181ad1d75..e16b0def0d4 100644 --- a/src/mesa/drivers/dri/i965/intel_tex.c +++ b/src/mesa/drivers/dri/i965/intel_tex.c @@ -145,7 +145,7 @@ intel_alloc_texture_storage(struct gl_context *ctx, 0, levels - 1, width, height, depth, num_samples, - INTEL_MIPTREE_TILING_ANY, 0); + MIPTREE_LAYOUT_TILING_ANY); if (intel_texobj->mt == NULL) { return false; diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c index ebe84b664d4..93a8cdee0cb 100644 --- a/src/mesa/drivers/dri/i965/intel_tex_image.c +++ b/src/mesa/drivers/dri/i965/intel_tex_image.c @@ -80,8 +80,7 @@ intel_miptree_create_for_teximage(struct brw_context *brw, height, depth, intelImage->base.Base.NumSamples, - INTEL_MIPTREE_TILING_ANY, - layout_flags); + layout_flags | MIPTREE_LAYOUT_TILING_ANY); } static void @@ -98,8 +97,8 @@ intelTexImage(struct gl_context * ctx, DBG("%s mesa_format %s target %s format %s type %s level %d %dx%dx%d\n", __func__, _mesa_get_format_name(texImage->TexFormat), - _mesa_lookup_enum_by_nr(texImage->TexObject->Target), - _mesa_lookup_enum_by_nr(format), _mesa_lookup_enum_by_nr(type), + _mesa_enum_to_string(texImage->TexObject->Target), + _mesa_enum_to_string(format), _mesa_enum_to_string(type), texImage->Level, texImage->Width, texImage->Height, texImage->Depth); /* Allocate storage for texture data. */ @@ -472,39 +471,44 @@ intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx, } static void -intel_get_tex_image(struct gl_context *ctx, - GLenum format, GLenum type, GLvoid *pixels, - struct gl_texture_image *texImage) { +intel_get_tex_sub_image(struct gl_context *ctx, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLint depth, + GLenum format, GLenum type, GLvoid *pixels, + struct gl_texture_image *texImage) +{ struct brw_context *brw = brw_context(ctx); bool ok; DBG("%s\n", __func__); if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) { - if (_mesa_meta_pbo_GetTexSubImage(ctx, 3, texImage, 0, 0, 0, - texImage->Width, texImage->Height, - texImage->Depth, format, type, + if (_mesa_meta_pbo_GetTexSubImage(ctx, 3, texImage, + xoffset, yoffset, zoffset, + width, height, depth, format, type, pixels, &ctx->Pack)) { /* Flush to guarantee coherency between the render cache and other * caches the PBO could potentially be bound to after this point. * See the related comment in intelReadPixels() for a more detailed * explanation. 
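 *
 * For example (editorial note, not in the patch): the PBO just written by
 * the blit may immediately be re-bound as a pixel unpack buffer and read
 * by a texture upload, or mapped by the CPU, so the render cache must be
 * flushed before those reads can be trusted.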
*/ - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); return; } perf_debug("%s: fallback to CPU mapping in PBO case\n", __func__); } - ok = intel_gettexsubimage_tiled_memcpy(ctx, texImage, 0, 0, - texImage->Width, texImage->Height, + ok = intel_gettexsubimage_tiled_memcpy(ctx, texImage, xoffset, yoffset, + width, height, format, type, pixels, &ctx->Pack); if(ok) return; - _mesa_meta_GetTexImage(ctx, format, type, pixels, texImage); + _mesa_meta_GetTexSubImage(ctx, xoffset, yoffset, zoffset, + width, height, depth, + format, type, pixels, texImage); DBG("%s - DONE\n", __func__); } @@ -515,5 +519,5 @@ intelInitTextureImageFuncs(struct dd_function_table *functions) functions->TexImage = intelTexImage; functions->EGLImageTargetTexture2D = intel_image_target_texture_2d; functions->BindRenderbufferTexImage = intel_bind_renderbuffer_tex_image; - functions->GetTexImage = intel_get_tex_image; + functions->GetTexSubImage = intel_get_tex_sub_image; } diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c index 7507f7669a0..31e511f0b7b 100644 --- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c +++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c @@ -206,8 +206,8 @@ intelTexSubImage(struct gl_context * ctx, DBG("%s mesa_format %s target %s format %s type %s level %d %dx%dx%d\n", __func__, _mesa_get_format_name(texImage->TexFormat), - _mesa_lookup_enum_by_nr(texImage->TexObject->Target), - _mesa_lookup_enum_by_nr(format), _mesa_lookup_enum_by_nr(type), + _mesa_enum_to_string(texImage->TexObject->Target), + _mesa_enum_to_string(format), _mesa_enum_to_string(type), texImage->Level, texImage->Width, texImage->Height, texImage->Depth); ok = _mesa_meta_pbo_TexSubImage(ctx, dims, texImage, diff --git a/src/mesa/drivers/dri/i965/intel_tex_validate.c b/src/mesa/drivers/dri/i965/intel_tex_validate.c index 4991c2997ef..d3fb252b5d5 100644 --- a/src/mesa/drivers/dri/i965/intel_tex_validate.c +++ b/src/mesa/drivers/dri/i965/intel_tex_validate.c @@ -136,6 +136,8 @@ intel_finalize_mipmap_tree(struct brw_context *brw, GLuint unit) _mesa_get_format_name(firstImage->base.Base.TexFormat), width, height, depth, validate_last_level + 1); + const uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD | + MIPTREE_LAYOUT_TILING_ANY; intelObj->mt = intel_miptree_create(brw, intelObj->base.Target, firstImage->base.Base.TexFormat, @@ -145,8 +147,7 @@ intel_finalize_mipmap_tree(struct brw_context *brw, GLuint unit) height, depth, 0 /* num_samples */, - INTEL_MIPTREE_TILING_ANY, - MIPTREE_LAYOUT_ACCELERATED_UPLOAD); + layout_flags); if (!intelObj->mt) return false; } diff --git a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp index 8010fb4f610..ba67bc59e19 100644 --- a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp +++ b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp @@ -283,10 +283,10 @@ TEST_F(cmod_propagation_test, intervening_dest_write) fs_reg src1 = v->vgrf(glsl_type::float_type); fs_reg src2 = v->vgrf(glsl_type::vec2_type); fs_reg zero(0.0f); - bld.ADD(offset(dest, 2), src0, src1); + bld.ADD(offset(dest, bld, 2), src0, src1); bld.emit(SHADER_OPCODE_TEX, dest, src2) ->regs_written = 4; - bld.CMP(bld.null_reg_f(), offset(dest, 2), zero, BRW_CONDITIONAL_GE); + bld.CMP(bld.null_reg_f(), offset(dest, bld, 2), zero, BRW_CONDITIONAL_GE); /* = Before = * diff --git a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp 
b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp index 3ef0cb319eb..1caa0b50ec6 100644 --- a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp +++ b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp @@ -367,10 +367,10 @@ TEST_F(saturate_propagation_test, intervening_dest_write) fs_reg src0 = v->vgrf(glsl_type::float_type); fs_reg src1 = v->vgrf(glsl_type::float_type); fs_reg src2 = v->vgrf(glsl_type::vec2_type); - bld.ADD(offset(dst0, 2), src0, src1); + bld.ADD(offset(dst0, bld, 2), src0, src1); bld.emit(SHADER_OPCODE_TEX, dst0, src2) ->regs_written = 4; - set_saturate(true, bld.MOV(dst1, offset(dst0, 2))); + set_saturate(true, bld.MOV(dst1, offset(dst0, bld, 2))); /* = Before = * diff --git a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp index 84e43fa75cd..fbd9fa8f19b 100644 --- a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp @@ -53,7 +53,8 @@ public: } protected: - virtual dst_reg *make_reg_for_system_value(ir_variable *ir) + virtual dst_reg *make_reg_for_system_value(int location, + const glsl_type *type) { unreachable("Not reached"); } diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp index de2afd39cfe..a3055fcc851 100644 --- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp +++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp @@ -56,7 +56,8 @@ public: } protected: - virtual dst_reg *make_reg_for_system_value(ir_variable *ir) + virtual dst_reg *make_reg_for_system_value(int location, + const glsl_type *type) { unreachable("Not reached"); } diff --git a/src/mesa/drivers/dri/nouveau/Makefile.am b/src/mesa/drivers/dri/nouveau/Makefile.am index 61af95a7dbc..01e34a8e3c3 100644 --- a/src/mesa/drivers/dri/nouveau/Makefile.am +++ b/src/mesa/drivers/dri/nouveau/Makefile.am @@ -38,8 +38,8 @@ AM_CFLAGS = \ -I$(top_srcdir)/src/mesa/drivers/dri/common \ $(DEFINES) \ $(VISIBILITY_CFLAGS) \ - $(NOUVEAU_CFLAGS) + $(NVVIEUX_CFLAGS) noinst_LTLIBRARIES = libnouveau_dri.la libnouveau_dri_la_SOURCES = $(NOUVEAU_C_FILES) -libnouveau_dri_la_LIBADD = $(NOUVEAU_LIBS) +libnouveau_dri_la_LIBADD = $(NVVIEUX_LIBS) diff --git a/src/mesa/drivers/dri/nouveau/nouveau_swtnl_t.c b/src/mesa/drivers/dri/nouveau/nouveau_swtnl_t.c index 0753c3a0019..755de2c4b68 100644 --- a/src/mesa/drivers/dri/nouveau/nouveau_swtnl_t.c +++ b/src/mesa/drivers/dri/nouveau/nouveau_swtnl_t.c @@ -338,7 +338,6 @@ TAG(swtnl_init)(struct gl_context *ctx) NUM_VERTEX_ATTRS * 4 * sizeof(GLfloat)); _tnl_need_projected_coords(ctx, GL_FALSE); _tnl_allow_vertex_fog(ctx, GL_FALSE); - _tnl_wakeup(ctx); swtnl_alloc_vertices(ctx); } diff --git a/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c b/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c index c85acec1268..a3fbad07e66 100644 --- a/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c +++ b/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c @@ -223,6 +223,7 @@ TAG(vbo_render_prims)(struct gl_context *ctx, GLboolean index_bounds_valid, GLuint min_index, GLuint max_index, struct gl_transform_feedback_object *tfb_vertcount, + unsigned stream, struct gl_buffer_object *indirect); static GLboolean @@ -455,6 +456,7 @@ TAG(vbo_render_prims)(struct gl_context *ctx, GLboolean index_bounds_valid, GLuint min_index, GLuint max_index, struct gl_transform_feedback_object *tfb_vertcount, + unsigned stream, struct gl_buffer_object *indirect) { struct nouveau_render_state 
*render = to_render_state(ctx); @@ -492,6 +494,7 @@ TAG(vbo_check_render_prims)(struct gl_context *ctx, GLboolean index_bounds_valid, GLuint min_index, GLuint max_index, struct gl_transform_feedback_object *tfb_vertcount, + unsigned stream, struct gl_buffer_object *indirect) { struct nouveau_context *nctx = to_nouveau_context(ctx); @@ -501,12 +504,12 @@ TAG(vbo_check_render_prims)(struct gl_context *ctx, if (nctx->fallback == HWTNL) TAG(vbo_render_prims)(ctx, prims, nr_prims, ib, index_bounds_valid, min_index, max_index, - tfb_vertcount, indirect); + tfb_vertcount, stream, indirect); if (nctx->fallback == SWTNL) _tnl_draw_prims(ctx, prims, nr_prims, ib, index_bounds_valid, min_index, max_index, - tfb_vertcount, indirect); + tfb_vertcount, stream, indirect); } void diff --git a/src/mesa/drivers/dri/nouveau/nv04_render.c b/src/mesa/drivers/dri/nouveau/nv04_render.c index 30e9f9aad96..3b7f7829044 100644 --- a/src/mesa/drivers/dri/nouveau/nv04_render.c +++ b/src/mesa/drivers/dri/nouveau/nv04_render.c @@ -285,7 +285,6 @@ nv04_render_init(struct gl_context *ctx) _tnl_init_vertices(ctx, tnl->vb.Size, NUM_VERTEX_ATTRS * 4 * sizeof(GLfloat)); _tnl_allow_pixel_fog(ctx, GL_FALSE); - _tnl_wakeup(ctx); } void diff --git a/src/mesa/drivers/dri/r200/r200_blit.c b/src/mesa/drivers/dri/r200/r200_blit.c index 3adc69423cd..d68a53e67f7 100644 --- a/src/mesa/drivers/dri/r200/r200_blit.c +++ b/src/mesa/drivers/dri/r200/r200_blit.c @@ -28,6 +28,7 @@ #include "radeon_common.h" #include "r200_context.h" #include "r200_blit.h" +#include "r200_tex.h" static inline uint32_t cmdpacket0(struct radeon_screen *rscrn, int reg, int count) @@ -40,22 +41,42 @@ static inline uint32_t cmdpacket0(struct radeon_screen *rscrn, /* common formats supported as both textures and render targets */ unsigned r200_check_blit(mesa_format mesa_format, uint32_t dst_pitch) { - /* XXX others? BE/LE? */ - switch (mesa_format) { - case MESA_FORMAT_B8G8R8A8_UNORM: - case MESA_FORMAT_B8G8R8X8_UNORM: - case MESA_FORMAT_B5G6R5_UNORM: - case MESA_FORMAT_B4G4R4A4_UNORM: - case MESA_FORMAT_B5G5R5A1_UNORM: - case MESA_FORMAT_A_UNORM8: - case MESA_FORMAT_L_UNORM8: - case MESA_FORMAT_I_UNORM8: - /* swizzled */ - case MESA_FORMAT_A8B8G8R8_UNORM: - case MESA_FORMAT_R8G8B8A8_UNORM: + /* XXX others? */ + if (_mesa_little_endian()) { + switch (mesa_format) { + case MESA_FORMAT_B8G8R8A8_UNORM: + case MESA_FORMAT_B8G8R8X8_UNORM: + case MESA_FORMAT_B5G6R5_UNORM: + case MESA_FORMAT_B4G4R4A4_UNORM: + case MESA_FORMAT_B5G5R5A1_UNORM: + case MESA_FORMAT_A_UNORM8: + case MESA_FORMAT_L_UNORM8: + case MESA_FORMAT_I_UNORM8: + /* swizzled - probably can't happen with the disabled Choose8888TexFormat code */ + case MESA_FORMAT_A8B8G8R8_UNORM: + case MESA_FORMAT_R8G8B8A8_UNORM: break; - default: + default: return 0; + } + } + else { + switch (mesa_format) { + case MESA_FORMAT_A8R8G8B8_UNORM: + case MESA_FORMAT_X8R8G8B8_UNORM: + case MESA_FORMAT_R5G6B5_UNORM: + case MESA_FORMAT_A4R4G4B4_UNORM: + case MESA_FORMAT_A1R5G5B5_UNORM: + case MESA_FORMAT_A_UNORM8: + case MESA_FORMAT_L_UNORM8: + case MESA_FORMAT_I_UNORM8: + /* swizzled - probably can't happen with the disabled Choose8888TexFormat code */ + case MESA_FORMAT_R8G8B8A8_UNORM: + case MESA_FORMAT_A8B8G8R8_UNORM: + break; + default: + return 0; + } } /* Rendering to small buffer doesn't work. @@ -112,41 +133,11 @@ static void inline emit_tx_setup(struct r200_context *r200, assert(height <= 2048); assert(offset % 32 == 0); - /* XXX others? BE/LE? 
*/ - switch (src_mesa_format) { - case MESA_FORMAT_B8G8R8A8_UNORM: - txformat |= R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP; - break; - case MESA_FORMAT_A8B8G8R8_UNORM: - txformat |= R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP; - break; - case MESA_FORMAT_R8G8B8A8_UNORM: - txformat |= R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP; - break; - case MESA_FORMAT_B8G8R8X8_UNORM: - txformat |= R200_TXFORMAT_ARGB8888; - break; - case MESA_FORMAT_B5G6R5_UNORM: - txformat |= R200_TXFORMAT_RGB565; - break; - case MESA_FORMAT_B4G4R4A4_UNORM: - txformat |= R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP; - break; - case MESA_FORMAT_B5G5R5A1_UNORM: - txformat |= R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP; - break; - case MESA_FORMAT_A_UNORM8: - case MESA_FORMAT_I_UNORM8: - txformat |= R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP; - break; - case MESA_FORMAT_L_UNORM8: - txformat |= R200_TXFORMAT_I8; - break; - case MESA_FORMAT_L8A8_UNORM: - txformat |= R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP; - break; - default: - break; + if (_mesa_little_endian()) { + txformat |= tx_table_le[src_mesa_format].format; + } + else { + txformat |= tx_table_be[src_mesa_format].format; } if (bo->flags & RADEON_BO_FLAGS_MACRO_TILE) @@ -155,11 +146,19 @@ static void inline emit_tx_setup(struct r200_context *r200, offset |= R200_TXO_MICRO_TILE; switch (dst_mesa_format) { + /* le */ case MESA_FORMAT_B8G8R8A8_UNORM: case MESA_FORMAT_B8G8R8X8_UNORM: case MESA_FORMAT_B5G6R5_UNORM: case MESA_FORMAT_B4G4R4A4_UNORM: case MESA_FORMAT_B5G5R5A1_UNORM: + /* be */ + case MESA_FORMAT_A8R8G8B8_UNORM: + case MESA_FORMAT_X8R8G8B8_UNORM: + case MESA_FORMAT_R5G6B5_UNORM: + case MESA_FORMAT_A4R4G4B4_UNORM: + case MESA_FORMAT_A1R5G5B5_UNORM: + /* little and big */ case MESA_FORMAT_A_UNORM8: case MESA_FORMAT_L_UNORM8: case MESA_FORMAT_I_UNORM8: @@ -183,6 +182,9 @@ static void inline emit_tx_setup(struct r200_context *r200, END_BATCH(); break; case MESA_FORMAT_A8B8G8R8_UNORM: + case MESA_FORMAT_R8G8B8A8_UNORM: + if ((dst_mesa_format == MESA_FORMAT_A8B8G8R8_UNORM && _mesa_little_endian()) || + (dst_mesa_format == MESA_FORMAT_R8G8B8A8_UNORM && !_mesa_little_endian())) { BEGIN_BATCH(10); OUT_BATCH_REGVAL(RADEON_PP_CNTL, (RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE)); @@ -190,6 +192,8 @@ static void inline emit_tx_setup(struct r200_context *r200, R200_TXC_ARG_B_ZERO | R200_TXC_ARG_C_R0_COLOR | R200_TXC_OP_MADD)); + /* XXX I don't think this can work. This is output rotation, and alpha contains + * red, not alpha (we'd write gbrr). */ OUT_BATCH_REGVAL(R200_PP_TXCBLEND2_0, (R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_ROTATE_GBA | R200_TXC_OUTPUT_REG_R0)); @@ -201,8 +205,16 @@ static void inline emit_tx_setup(struct r200_context *r200, (R200_TXA_REPL_RED << R200_TXA_REPL_ARG_C_SHIFT) | R200_TXA_OUTPUT_REG_R0)); END_BATCH(); - break; - case MESA_FORMAT_R8G8B8A8_UNORM: + } + else { + /* XXX pretty sure could do this with just 2 instead of 4 instructions. + * Like so: + * 1st: use RGA output rotation, rgb arg replicate b, a arg r, write mask rb. + * That's just one instruction in fact but I'm not entirely sure it works + * if some of those incoming r0 components are never written (due to mask) + * in the shader itself to r0. + * In any case this case (and the one above) may not be reachable with + * disabled Choose8888TexFormat code. 
*/ BEGIN_BATCH(34); OUT_BATCH_REGVAL(RADEON_PP_CNTL, (RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE | @@ -272,7 +284,8 @@ static void inline emit_tx_setup(struct r200_context *r200, OUT_BATCH_REGVAL(R200_PP_TXABLEND2_3, (R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0)); END_BATCH(); - break; + } + break; } BEGIN_BATCH(18); @@ -306,21 +319,27 @@ static inline void emit_cb_setup(struct r200_context *r200, uint32_t dst_format = 0; BATCH_LOCALS(&r200->radeon); - /* XXX others? BE/LE? */ switch (mesa_format) { + /* The first of each pair is for little, the second for big endian */ case MESA_FORMAT_B8G8R8A8_UNORM: + case MESA_FORMAT_A8R8G8B8_UNORM: case MESA_FORMAT_B8G8R8X8_UNORM: + case MESA_FORMAT_X8R8G8B8_UNORM: + /* These two are valid both for little and big endian (swizzled) */ case MESA_FORMAT_A8B8G8R8_UNORM: case MESA_FORMAT_R8G8B8A8_UNORM: dst_format = RADEON_COLOR_FORMAT_ARGB8888; break; case MESA_FORMAT_B5G6R5_UNORM: + case MESA_FORMAT_R5G6B5_UNORM: dst_format = RADEON_COLOR_FORMAT_RGB565; break; case MESA_FORMAT_B4G4R4A4_UNORM: + case MESA_FORMAT_A4R4G4B4_UNORM: dst_format = RADEON_COLOR_FORMAT_ARGB4444; break; case MESA_FORMAT_B5G5R5A1_UNORM: + case MESA_FORMAT_A1R5G5B5_UNORM: dst_format = RADEON_COLOR_FORMAT_ARGB1555; break; case MESA_FORMAT_A_UNORM8: @@ -547,5 +566,21 @@ unsigned r200_blit(struct gl_context *ctx, radeonFlush(ctx); + /* We submitted those packets outside our state atom mechanism. Thus + * make sure the atoms are resubmitted the next time. */ + r200->hw.cst.dirty = GL_TRUE; + r200->hw.ctx.dirty = GL_TRUE; + r200->hw.vap.dirty = GL_TRUE; + r200->hw.msk.dirty = GL_TRUE; + r200->hw.pix[0].dirty = GL_TRUE; + r200->hw.pix[1].dirty = GL_TRUE; + r200->hw.pix[2].dirty = GL_TRUE; + r200->hw.pix[3].dirty = GL_TRUE; + r200->hw.sci.dirty = GL_TRUE; + r200->hw.set.dirty = GL_TRUE; + r200->hw.tex[0].dirty = GL_TRUE; + r200->hw.vte.dirty = GL_TRUE; + r200->hw.vtx.dirty = GL_TRUE; + return GL_TRUE; } diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c index fb15082114f..2a42ab3f4c8 100644 --- a/src/mesa/drivers/dri/r200/r200_context.c +++ b/src/mesa/drivers/dri/r200/r200_context.c @@ -225,18 +225,9 @@ GLboolean r200CreateContext( gl_api api, rmesa->radeon.initialMaxAnisotropy = driQueryOptionf(&rmesa->radeon.optionCache, "def_max_anisotropy"); - if ( sPriv->drm_version.major == 1 - && driQueryOptionb( &rmesa->radeon.optionCache, "hyperz" ) ) { - if ( sPriv->drm_version.minor < 13 ) - fprintf( stderr, "DRM version 1.%d too old to support HyperZ, " - "disabling.\n", sPriv->drm_version.minor ); - else - rmesa->using_hyperz = GL_TRUE; - } + if (driQueryOptionb( &rmesa->radeon.optionCache, "hyperz")) + rmesa->using_hyperz = GL_TRUE; - if ( sPriv->drm_version.minor >= 15 ) - rmesa->texmicrotile = GL_TRUE; - /* Init default driver functions then plug in our R200-specific functions * (the texture functions are especially important) */ diff --git a/src/mesa/drivers/dri/r200/r200_context.h b/src/mesa/drivers/dri/r200/r200_context.h index eb498f7406b..c02a4f399ee 100644 --- a/src/mesa/drivers/dri/r200/r200_context.h +++ b/src/mesa/drivers/dri/r200/r200_context.h @@ -109,7 +109,6 @@ struct r200_texture_state { #define CTX_RB3D_COLOROFFSET 11 #define CTX_CMD_2 12 /* why */ #define CTX_RB3D_COLORPITCH 13 /* why */ -#define CTX_STATE_SIZE_OLDDRM 14 #define CTX_CMD_3 14 #define CTX_RB3D_BLENDCOLOR 15 #define CTX_RB3D_ABLENDCNTL 16 @@ -167,9 +166,6 @@ struct r200_texture_state { #define TEX_PP_TXSIZE 4 /*2c0c*/ #define TEX_PP_TXPITCH 5 /*2c10*/ #define 
TEX_PP_BORDER_COLOR 6 /*2c14*/ -#define TEX_CMD_1_OLDDRM 7 -#define TEX_PP_TXOFFSET_OLDDRM 8 /*2d00 */ -#define TEX_STATE_SIZE_OLDDRM 9 #define TEX_PP_CUBIC_FACES 7 #define TEX_PP_TXMULTI_CTL 8 #define TEX_CMD_1_NEWDRM 9 @@ -618,7 +614,6 @@ struct r200_context { struct r200_swtcl_info swtcl; GLboolean using_hyperz; - GLboolean texmicrotile; struct ati_fragment_shader *afs_loaded; }; diff --git a/src/mesa/drivers/dri/r200/r200_state.c b/src/mesa/drivers/dri/r200/r200_state.c index 6fe70b5c9d0..cca176d7f9b 100644 --- a/src/mesa/drivers/dri/r200/r200_state.c +++ b/src/mesa/drivers/dri/r200/r200_state.c @@ -1546,7 +1546,7 @@ void r200UpdateWindow( struct gl_context *ctx ) GLfloat xoffset = 0; GLfloat yoffset = dPriv ? (GLfloat) dPriv->h : 0; const GLboolean render_to_fbo = (ctx->DrawBuffer ? _mesa_is_user_fbo(ctx->DrawBuffer) : 0); - double scale[3], translate[3]; + float scale[3], translate[3]; GLfloat y_scale, y_bias; if (render_to_fbo) { @@ -1669,7 +1669,7 @@ static void r200Enable( struct gl_context *ctx, GLenum cap, GLboolean state ) if ( R200_DEBUG & RADEON_STATE ) fprintf( stderr, "%s( %s = %s )\n", __func__, - _mesa_lookup_enum_by_nr( cap ), + _mesa_enum_to_string( cap ), state ? "GL_TRUE" : "GL_FALSE" ); switch ( cap ) { diff --git a/src/mesa/drivers/dri/r200/r200_state_init.c b/src/mesa/drivers/dri/r200/r200_state_init.c index d9d1a0ed227..ad64f788b9f 100644 --- a/src/mesa/drivers/dri/r200/r200_state_init.c +++ b/src/mesa/drivers/dri/r200/r200_state_init.c @@ -254,7 +254,7 @@ CHECK( never, GL_FALSE, 0 ) CHECK( tex_any, ctx->Texture._MaxEnabledTexImageUnit != -1, 0 ) CHECK( tf, (ctx->Texture._MaxEnabledTexImageUnit != -1 && !ctx->ATIFragmentShader._Enabled), 0 ); CHECK( pix_zero, !ctx->ATIFragmentShader._Enabled, 0 ) - CHECK( texenv, (rmesa->state.envneeded & (1 << (atom->idx)) && !ctx->ATIFragmentShader._Enabled), 0 ) +CHECK( texenv, (rmesa->state.envneeded & (1 << (atom->idx)) && !ctx->ATIFragmentShader._Enabled), 0 ) CHECK( afs_pass1, (ctx->ATIFragmentShader._Enabled && (ctx->ATIFragmentShader.Current->NumPasses > 1)), 0 ) CHECK( afs, ctx->ATIFragmentShader._Enabled, 0 ) CHECK( tex_cube, rmesa->state.texture.unit[atom->idx].unitneeded & TEXTURE_CUBE_BIT, 3 + 3*5 - CUBE_STATE_SIZE ) @@ -453,12 +453,15 @@ static void ctx_emit_cs(struct gl_context *ctx, struct radeon_state_atom *atom) atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888; else switch (rrb->base.Base.Format) { case MESA_FORMAT_B5G6R5_UNORM: + case MESA_FORMAT_R5G6B5_UNORM: atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565; break; case MESA_FORMAT_B4G4R4A4_UNORM: + case MESA_FORMAT_A4R4G4B4_UNORM: atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB4444; break; case MESA_FORMAT_B5G5R5A1_UNORM: + case MESA_FORMAT_A1R5G5B5_UNORM: atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB1555; break; default: diff --git a/src/mesa/drivers/dri/r200/r200_tex.c b/src/mesa/drivers/dri/r200/r200_tex.c index 083a1840d9e..feee0b2ba3f 100644 --- a/src/mesa/drivers/dri/r200/r200_tex.c +++ b/src/mesa/drivers/dri/r200/r200_tex.c @@ -68,9 +68,9 @@ static void r200SetTexWrap( radeonTexObjPtr t, GLenum swrap, GLenum twrap, GLenu radeon_print(RADEON_TEXTURE, RADEON_TRACE, "%s(tex %p) sw %s, tw %s, rw %s\n", __func__, t, - _mesa_lookup_enum_by_nr(swrap), - _mesa_lookup_enum_by_nr(twrap), - _mesa_lookup_enum_by_nr(rwrap)); + _mesa_enum_to_string(swrap), + _mesa_enum_to_string(twrap), + _mesa_enum_to_string(rwrap)); t->pp_txfilter &= ~(R200_CLAMP_S_MASK | R200_CLAMP_T_MASK | R200_BORDER_MODE_D3D); @@ -225,8 +225,8 @@ static void 
r200SetTexFilter( radeonTexObjPtr t, GLenum minf, GLenum magf ) radeon_print(RADEON_TEXTURE, RADEON_TRACE, "%s(tex %p) minf %s, maxf %s, anisotropy %d.\n", __func__, t, - _mesa_lookup_enum_by_nr(minf), - _mesa_lookup_enum_by_nr(magf), + _mesa_enum_to_string(minf), + _mesa_enum_to_string(magf), anisotropy); if ( anisotropy == R200_MAX_ANISO_1_TO_1 ) { @@ -302,7 +302,7 @@ static void r200TexEnv( struct gl_context *ctx, GLenum target, struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit]; radeon_print(RADEON_TEXTURE | RADEON_STATE, RADEON_VERBOSE, "%s( %s )\n", - __func__, _mesa_lookup_enum_by_nr( pname ) ); + __func__, _mesa_enum_to_string( pname ) ); /* This is incorrect: Need to maintain this data for each of * GL_TEXTURE_{123}D, GL_TEXTURE_RECTANGLE_NV, etc, and switch @@ -384,7 +384,7 @@ static void r200TexParameter( struct gl_context *ctx, radeon_print(RADEON_TEXTURE | RADEON_STATE, RADEON_VERBOSE, "%s(%p, tex %p) pname %s\n", __func__, ctx, texObj, - _mesa_lookup_enum_by_nr( pname ) ); + _mesa_enum_to_string( pname ) ); switch ( pname ) { case GL_TEXTURE_MIN_FILTER: @@ -415,7 +415,7 @@ static void r200DeleteTexture(struct gl_context * ctx, struct gl_texture_object radeon_print(RADEON_TEXTURE | RADEON_STATE, RADEON_NORMAL, "%s( %p (target = %s) )\n", __func__, (void *)texObj, - _mesa_lookup_enum_by_nr(texObj->Target)); + _mesa_enum_to_string(texObj->Target)); if (rmesa) { int i; @@ -473,7 +473,7 @@ static struct gl_texture_object *r200NewTextureObject(struct gl_context * ctx, radeon_print(RADEON_STATE | RADEON_TEXTURE, RADEON_NORMAL, "%s(%p) target %s, new texture %p.\n", __func__, ctx, - _mesa_lookup_enum_by_nr(target), t); + _mesa_enum_to_string(target), t); _mesa_initialize_texture_object(ctx, &t->base, name, target); t->base.Sampler.MaxAnisotropy = rmesa->radeon.initialMaxAnisotropy; diff --git a/src/mesa/drivers/dri/r200/r200_tex.h b/src/mesa/drivers/dri/r200/r200_tex.h index d7e91d1a0c8..a8c31b741ed 100644 --- a/src/mesa/drivers/dri/r200/r200_tex.h +++ b/src/mesa/drivers/dri/r200/r200_tex.h @@ -52,4 +52,68 @@ extern void r200TexUpdateParameters(struct gl_context *ctx, GLuint unit); extern void set_re_cntl_d3d( struct gl_context *ctx, int unit, GLboolean use_d3d ); +struct tx_table { + GLuint format, filter; +}; + +/* Note the tables (have to) contain invalid entries (if they are only valid + * for either be/le) */ +static const struct tx_table tx_table_be[] = +{ + [ MESA_FORMAT_A8B8G8R8_UNORM ] = { R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_R8G8B8A8_UNORM ] = { R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_B8G8R8A8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_A8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_BGR_UNORM8 ] = { 0xffffffff, 0 }, + [ MESA_FORMAT_B5G6R5_UNORM ] = { R200_TXFORMAT_RGB565, 0 }, + [ MESA_FORMAT_R5G6B5_UNORM ] = { R200_TXFORMAT_RGB565, 0 }, + [ MESA_FORMAT_B4G4R4A4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_A4R4G4B4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_B5G5R5A1_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_A1R5G5B5_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_L8A8_UNORM ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_A8L8_UNORM ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_A_UNORM8 ] 
= { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_L_UNORM8 ] = { R200_TXFORMAT_I8, 0 }, + [ MESA_FORMAT_I_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_YCBCR ] = { R200_TXFORMAT_YVYU422, R200_YUV_TO_RGB }, + [ MESA_FORMAT_YCBCR_REV ] = { R200_TXFORMAT_VYUY422, R200_YUV_TO_RGB }, + [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 }, + [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 }, + [ MESA_FORMAT_RGB_DXT1 ] = { R200_TXFORMAT_DXT1, 0 }, + [ MESA_FORMAT_RGBA_DXT1 ] = { R200_TXFORMAT_DXT1 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_RGBA_DXT3 ] = { R200_TXFORMAT_DXT23 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_RGBA_DXT5 ] = { R200_TXFORMAT_DXT45 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, +}; + +static const struct tx_table tx_table_le[] = +{ + [ MESA_FORMAT_A8B8G8R8_UNORM ] = { R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_R8G8B8A8_UNORM ] = { R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_B8G8R8A8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_A8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_BGR_UNORM8 ] = { R200_TXFORMAT_ARGB8888, 0 }, + [ MESA_FORMAT_B5G6R5_UNORM ] = { R200_TXFORMAT_RGB565, 0 }, + [ MESA_FORMAT_R5G6B5_UNORM ] = { R200_TXFORMAT_RGB565, 0 }, + [ MESA_FORMAT_B4G4R4A4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_A4R4G4B4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_B5G5R5A1_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_A1R5G5B5_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_L8A8_UNORM ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_A8L8_UNORM ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_A_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_L_UNORM8 ] = { R200_TXFORMAT_I8, 0 }, + [ MESA_FORMAT_I_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_YCBCR ] = { R200_TXFORMAT_YVYU422, R200_YUV_TO_RGB }, + [ MESA_FORMAT_YCBCR_REV ] = { R200_TXFORMAT_VYUY422, R200_YUV_TO_RGB }, + [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 }, + [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 }, + [ MESA_FORMAT_RGB_DXT1 ] = { R200_TXFORMAT_DXT1, 0 }, + [ MESA_FORMAT_RGBA_DXT1 ] = { R200_TXFORMAT_DXT1 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_RGBA_DXT3 ] = { R200_TXFORMAT_DXT23 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_RGBA_DXT5 ] = { R200_TXFORMAT_DXT45 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, +}; + + + #endif /* __R200_TEX_H__ */ diff --git a/src/mesa/drivers/dri/r200/r200_texstate.c b/src/mesa/drivers/dri/r200/r200_texstate.c index ab84d1752ba..441ac730d4c 100644 --- a/src/mesa/drivers/dri/r200/r200_texstate.c +++ b/src/mesa/drivers/dri/r200/r200_texstate.c @@ -49,80 +49,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
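As the comment above notes, both tables intentionally contain 0xffffffff placeholder entries for formats that are only meaningful on one byte order, so lookups need a validity guard before the value is used. A sketch of such a guard, modelled on the VALID_FORMAT macro that r200_texstate.c keeps (shown just below); the function name is illustrative and the declarations are assumed to come from r200_tex.h:

/* Reject formats beyond the table range as well as sentinel entries. */
static GLboolean r200_txformat_supported(mesa_format f)
{
   const struct tx_table *tab =
      _mesa_little_endian() ? tx_table_le : tx_table_be;

   return f <= MESA_FORMAT_RGBA_DXT5 && tab[f].format != 0xffffffff;
}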
#include "r200_tex.h" #include "r200_tcl.h" - -#define R200_TXFORMAT_A8 R200_TXFORMAT_I8 -#define R200_TXFORMAT_L8 R200_TXFORMAT_I8 -#define R200_TXFORMAT_AL88 R200_TXFORMAT_AI88 -#define R200_TXFORMAT_YCBCR R200_TXFORMAT_YVYU422 -#define R200_TXFORMAT_YCBCR_REV R200_TXFORMAT_VYUY422 -#define R200_TXFORMAT_RGB_DXT1 R200_TXFORMAT_DXT1 -#define R200_TXFORMAT_RGBA_DXT1 R200_TXFORMAT_DXT1 -#define R200_TXFORMAT_RGBA_DXT3 R200_TXFORMAT_DXT23 -#define R200_TXFORMAT_RGBA_DXT5 R200_TXFORMAT_DXT45 - #define VALID_FORMAT(f) ( ((f) <= MESA_FORMAT_RGBA_DXT5) \ && (tx_table_be[f].format != 0xffffffff) ) -struct tx_table { - GLuint format, filter; -}; - -static const struct tx_table tx_table_be[] = -{ - [ MESA_FORMAT_A8B8G8R8_UNORM ] = { R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_R8G8B8A8_UNORM ] = { R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_B8G8R8A8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_BGR_UNORM8 ] = { 0xffffffff, 0 }, - [ MESA_FORMAT_B5G6R5_UNORM ] = { R200_TXFORMAT_RGB565, 0 }, - [ MESA_FORMAT_R5G6B5_UNORM ] = { R200_TXFORMAT_RGB565, 0 }, - [ MESA_FORMAT_B4G4R4A4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A4R4G4B4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_B5G5R5A1_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A1R5G5B5_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_L8A8_UNORM ] = { R200_TXFORMAT_AL88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A8L8_UNORM ] = { R200_TXFORMAT_AL88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A_UNORM8 ] = { R200_TXFORMAT_A8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_L_UNORM8 ] = { R200_TXFORMAT_L8, 0 }, - [ MESA_FORMAT_I_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_YCBCR ] = { R200_TXFORMAT_YCBCR, R200_YUV_TO_RGB }, - [ MESA_FORMAT_YCBCR_REV ] = { R200_TXFORMAT_YCBCR_REV, R200_YUV_TO_RGB }, - [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 }, - [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 }, - [ MESA_FORMAT_RGB_DXT1 ] = { R200_TXFORMAT_RGB_DXT1, 0 }, - [ MESA_FORMAT_RGBA_DXT1 ] = { R200_TXFORMAT_RGBA_DXT1 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_RGBA_DXT3 ] = { R200_TXFORMAT_RGBA_DXT3 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_RGBA_DXT5 ] = { R200_TXFORMAT_RGBA_DXT5 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, -}; - -static const struct tx_table tx_table_le[] = -{ - [ MESA_FORMAT_A8B8G8R8_UNORM ] = { R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_R8G8B8A8_UNORM ] = { R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_B8G8R8A8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_BGR_UNORM8 ] = { R200_TXFORMAT_ARGB8888, 0 }, - [ MESA_FORMAT_B5G6R5_UNORM ] = { R200_TXFORMAT_RGB565, 0 }, - [ MESA_FORMAT_R5G6B5_UNORM ] = { R200_TXFORMAT_RGB565, 0 }, - [ MESA_FORMAT_B4G4R4A4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A4R4G4B4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_B5G5R5A1_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A1R5G5B5_UNORM ] = { 
R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_L8A8_UNORM ] = { R200_TXFORMAT_AL88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A8L8_UNORM ] = { R200_TXFORMAT_AL88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A_UNORM8 ] = { R200_TXFORMAT_A8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_L_UNORM8 ] = { R200_TXFORMAT_L8, 0 }, - [ MESA_FORMAT_I_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_YCBCR ] = { R200_TXFORMAT_YCBCR, R200_YUV_TO_RGB }, - [ MESA_FORMAT_YCBCR_REV ] = { R200_TXFORMAT_YCBCR_REV, R200_YUV_TO_RGB }, - [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 }, - [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 }, - [ MESA_FORMAT_RGB_DXT1 ] = { R200_TXFORMAT_RGB_DXT1, 0 }, - [ MESA_FORMAT_RGBA_DXT1 ] = { R200_TXFORMAT_RGBA_DXT1 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_RGBA_DXT3 ] = { R200_TXFORMAT_RGBA_DXT3 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_RGBA_DXT5 ] = { R200_TXFORMAT_RGBA_DXT5 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, -}; - /* ================================================================ * Texture combine functions */ diff --git a/src/mesa/drivers/dri/radeon/radeon_blit.c b/src/mesa/drivers/dri/radeon/radeon_blit.c index 0de17514e05..0b0f06f0edb 100644 --- a/src/mesa/drivers/dri/radeon/radeon_blit.c +++ b/src/mesa/drivers/dri/radeon/radeon_blit.c @@ -28,6 +28,7 @@ #include "radeon_common.h" #include "radeon_context.h" #include "radeon_blit.h" +#include "radeon_tex.h" static inline uint32_t cmdpacket0(struct radeon_screen *rscrn, int reg, int count) @@ -40,19 +41,36 @@ static inline uint32_t cmdpacket0(struct radeon_screen *rscrn, /* common formats supported as both textures and render targets */ unsigned r100_check_blit(mesa_format mesa_format, uint32_t dst_pitch) { - /* XXX others? BE/LE? */ - switch (mesa_format) { - case MESA_FORMAT_B8G8R8A8_UNORM: - case MESA_FORMAT_B8G8R8X8_UNORM: - case MESA_FORMAT_B5G6R5_UNORM: - case MESA_FORMAT_B4G4R4A4_UNORM: - case MESA_FORMAT_B5G5R5A1_UNORM: - case MESA_FORMAT_A_UNORM8: - case MESA_FORMAT_L_UNORM8: - case MESA_FORMAT_I_UNORM8: + /* XXX others? */ + if (_mesa_little_endian()) { + switch (mesa_format) { + case MESA_FORMAT_B8G8R8A8_UNORM: + case MESA_FORMAT_B8G8R8X8_UNORM: + case MESA_FORMAT_B5G6R5_UNORM: + case MESA_FORMAT_B4G4R4A4_UNORM: + case MESA_FORMAT_B5G5R5A1_UNORM: + case MESA_FORMAT_A_UNORM8: + case MESA_FORMAT_L_UNORM8: + case MESA_FORMAT_I_UNORM8: break; - default: + default: + return 0; + } + } + else { + switch (mesa_format) { + case MESA_FORMAT_A8R8G8B8_UNORM: + case MESA_FORMAT_X8R8G8B8_UNORM: + case MESA_FORMAT_R5G6B5_UNORM: + case MESA_FORMAT_A4R4G4B4_UNORM: + case MESA_FORMAT_A1R5G5B5_UNORM: + case MESA_FORMAT_A_UNORM8: + case MESA_FORMAT_L_UNORM8: + case MESA_FORMAT_I_UNORM8: + break; + default: return 0; + } } /* Rendering to small buffer doesn't work. @@ -106,40 +124,8 @@ static void inline emit_tx_setup(struct r100_context *r100, assert(height <= 2048); assert(offset % 32 == 0); - /* XXX others? BE/LE? 
*/ - switch (mesa_format) { - case MESA_FORMAT_B8G8R8A8_UNORM: - txformat |= RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP; - break; - case MESA_FORMAT_A8B8G8R8_UNORM: - txformat |= RADEON_TXFORMAT_RGBA8888 | RADEON_TXFORMAT_ALPHA_IN_MAP; - break; - case MESA_FORMAT_B8G8R8X8_UNORM: - txformat |= RADEON_TXFORMAT_ARGB8888; - break; - case MESA_FORMAT_B5G6R5_UNORM: - txformat |= RADEON_TXFORMAT_RGB565; - break; - case MESA_FORMAT_B4G4R4A4_UNORM: - txformat |= RADEON_TXFORMAT_ARGB4444 | RADEON_TXFORMAT_ALPHA_IN_MAP; - break; - case MESA_FORMAT_B5G5R5A1_UNORM: - txformat |= RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP; - break; - case MESA_FORMAT_A_UNORM8: - case MESA_FORMAT_I_UNORM8: - txformat |= RADEON_TXFORMAT_I8 | RADEON_TXFORMAT_ALPHA_IN_MAP; - break; - case MESA_FORMAT_L_UNORM8: - txformat |= RADEON_TXFORMAT_I8; - break; - case MESA_FORMAT_L8A8_UNORM: - txformat |= RADEON_TXFORMAT_AI88 | RADEON_TXFORMAT_ALPHA_IN_MAP; - break; - default: - break; - } - + txformat |= tx_table[mesa_format].format; + if (bo->flags & RADEON_BO_FLAGS_MACRO_TILE) offset |= RADEON_TXO_MACRO_TILE; if (bo->flags & RADEON_BO_FLAGS_MICRO_TILE) @@ -184,19 +170,25 @@ static inline void emit_cb_setup(struct r100_context *r100, uint32_t dst_format = 0; BATCH_LOCALS(&r100->radeon); - /* XXX others? BE/LE? */ + /* XXX others? */ switch (mesa_format) { + /* The first of each pair is for little, the second for big endian. */ case MESA_FORMAT_B8G8R8A8_UNORM: + case MESA_FORMAT_A8R8G8B8_UNORM: case MESA_FORMAT_B8G8R8X8_UNORM: + case MESA_FORMAT_X8R8G8B8_UNORM: dst_format = RADEON_COLOR_FORMAT_ARGB8888; break; case MESA_FORMAT_B5G6R5_UNORM: + case MESA_FORMAT_R5G6B5_UNORM: dst_format = RADEON_COLOR_FORMAT_RGB565; break; case MESA_FORMAT_B4G4R4A4_UNORM: + case MESA_FORMAT_A4R4G4B4_UNORM: dst_format = RADEON_COLOR_FORMAT_ARGB4444; break; case MESA_FORMAT_B5G5R5A1_UNORM: + case MESA_FORMAT_A1R5G5B5_UNORM: dst_format = RADEON_COLOR_FORMAT_ARGB1555; break; case MESA_FORMAT_A_UNORM8: @@ -425,5 +417,13 @@ unsigned r100_blit(struct gl_context *ctx, radeonFlush(ctx); + /* We submitted those packets outside our state atom mechanism. Thus + * make sure they are all resubmitted the next time. */ + r100->hw.ctx.dirty = GL_TRUE; + r100->hw.msk.dirty = GL_TRUE; + r100->hw.set.dirty = GL_TRUE; + r100->hw.tex[0].dirty = GL_TRUE; + r100->hw.txr[0].dirty = GL_TRUE; + return GL_TRUE; } diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c index 2a8bd6c9edc..fde89214ed2 100644 --- a/src/mesa/drivers/dri/radeon/radeon_common.c +++ b/src/mesa/drivers/dri/radeon/radeon_common.c @@ -164,7 +164,7 @@ uint32_t radeonGetAge(radeonContextPtr radeon) gp.param = RADEON_PARAM_LAST_CLEAR; gp.value = (int *)&age; - ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM, + ret = drmCommandWriteRead(radeon->radeonScreen->driScreen->fd, DRM_RADEON_GETPARAM, &gp, sizeof(gp)); if (ret) { fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __func__, @@ -343,7 +343,7 @@ void radeonDrawBuffer( struct gl_context *ctx, GLenum mode ) { if (RADEON_DEBUG & RADEON_DRI) fprintf(stderr, "%s %s\n", __func__, - _mesa_lookup_enum_by_nr( mode )); + _mesa_enum_to_string( mode )); if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) { radeonContextPtr radeon = RADEON_CONTEXT(ctx); @@ -358,8 +358,8 @@ void radeonDrawBuffer( struct gl_context *ctx, GLenum mode ) * that the front-buffer has actually been allocated. 
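The dirty flags set at the end of r100_blit() above (and their r200 counterpart earlier in this patch) follow one rule: every state atom whose registers the blit programmed directly, bypassing the atom tracking, is flagged so the next validation pass re-emits it. A condensed sketch of that idea; the helper name is made up and the per-atom comments are interpretive:

/* Mark the atoms clobbered by the out-of-band blit packets so they are
 * resubmitted on the next state validation. */
static void r100_blit_mark_atoms_dirty(struct r100_context *r100)
{
   r100->hw.ctx.dirty    = GL_TRUE;   /* color buffer / context registers */
   r100->hw.msk.dirty    = GL_TRUE;   /* write masks */
   r100->hw.set.dirty    = GL_TRUE;   /* setup state */
   r100->hw.tex[0].dirty = GL_TRUE;   /* texture unit 0, the blit source */
   r100->hw.txr[0].dirty = GL_TRUE;   /* its offset registers */
}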
*/ if (!was_front_buffer_rendering && radeon->is_front_buffer_rendering) { - radeon_update_renderbuffers(radeon->dri.context, - radeon->dri.context->driDrawablePriv, GL_FALSE); + radeon_update_renderbuffers(radeon->driContext, + radeon->driContext->driDrawablePriv, GL_FALSE); } } @@ -375,8 +375,8 @@ void radeonReadBuffer( struct gl_context *ctx, GLenum mode ) || (mode == GL_FRONT); if (!was_front_buffer_reading && rmesa->is_front_buffer_reading) { - radeon_update_renderbuffers(rmesa->dri.context, - rmesa->dri.context->driReadablePriv, GL_FALSE); + radeon_update_renderbuffers(rmesa->driContext, + rmesa->driContext->driReadablePriv, GL_FALSE); } } /* nothing, until we implement h/w glRead/CopyPixels or CopyTexImage */ @@ -399,7 +399,7 @@ void radeon_window_moved(radeonContextPtr radeon) void radeon_viewport(struct gl_context *ctx) { radeonContextPtr radeon = RADEON_CONTEXT(ctx); - __DRIcontext *driContext = radeon->dri.context; + __DRIcontext *driContext = radeon->driContext; void (*old_viewport)(struct gl_context *ctx); if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) { @@ -693,6 +693,7 @@ void rcommonInitCmdBuf(radeonContextPtr rmesa) { GLuint size; struct drm_radeon_gem_info mminfo = { 0 }; + int fd = rmesa->radeonScreen->driScreen->fd; /* Initialize command buffer */ size = 256 * driQueryOptioni(&rmesa->optionCache, @@ -711,8 +712,7 @@ void rcommonInitCmdBuf(radeonContextPtr rmesa) "Allocating %d bytes command buffer (max state is %d bytes)\n", size * 4, rmesa->hw.max_state_size * 4); - rmesa->cmdbuf.csm = - radeon_cs_manager_gem_ctor(rmesa->radeonScreen->driScreen->fd); + rmesa->cmdbuf.csm = radeon_cs_manager_gem_ctor(fd); if (rmesa->cmdbuf.csm == NULL) { /* FIXME: fatal error */ return; @@ -725,7 +725,7 @@ void rcommonInitCmdBuf(radeonContextPtr rmesa) (void (*)(void *))rmesa->glCtx.Driver.Flush, &rmesa->glCtx); - if (!drmCommandWriteRead(rmesa->dri.fd, DRM_RADEON_GEM_INFO, + if (!drmCommandWriteRead(fd, DRM_RADEON_GEM_INFO, &mminfo, sizeof(mminfo))) { radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_VRAM, mminfo.vram_visible); diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c index 9699dcbfcdc..4660d98c9a2 100644 --- a/src/mesa/drivers/dri/radeon/radeon_common_context.c +++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c @@ -162,10 +162,7 @@ GLboolean radeonInitContext(radeonContextPtr radeon, _mesa_meta_init(ctx); /* DRI fields */ - radeon->dri.context = driContextPriv; - radeon->dri.screen = sPriv; - radeon->dri.fd = sPriv->fd; - radeon->dri.drmMinor = sPriv->drm_version.minor; + radeon->driContext = driContextPriv; /* Setup IRQs */ fthrottle_mode = driQueryOptioni(&radeon->optionCache, "fthrottle_mode"); @@ -194,6 +191,29 @@ GLboolean radeonInitContext(radeonContextPtr radeon, radeon_init_dma(radeon); + /* _mesa_initialize_context calls _mesa_init_queryobj which + * initializes all of the counter sizes to 64. The counters on r100 + * and r200 are only 32-bits for occlusion queries. Those are the + * only counters, so set the other sizes to zero. 
+ */ + radeon->glCtx.Const.QueryCounterBits.SamplesPassed = 32; + + radeon->glCtx.Const.QueryCounterBits.TimeElapsed = 0; + radeon->glCtx.Const.QueryCounterBits.Timestamp = 0; + radeon->glCtx.Const.QueryCounterBits.PrimitivesGenerated = 0; + radeon->glCtx.Const.QueryCounterBits.PrimitivesWritten = 0; + radeon->glCtx.Const.QueryCounterBits.VerticesSubmitted = 0; + radeon->glCtx.Const.QueryCounterBits.PrimitivesSubmitted = 0; + radeon->glCtx.Const.QueryCounterBits.VsInvocations = 0; + radeon->glCtx.Const.QueryCounterBits.TessPatches = 0; + radeon->glCtx.Const.QueryCounterBits.TessInvocations = 0; + radeon->glCtx.Const.QueryCounterBits.GsInvocations = 0; + radeon->glCtx.Const.QueryCounterBits.GsPrimitives = 0; + radeon->glCtx.Const.QueryCounterBits.FsInvocations = 0; + radeon->glCtx.Const.QueryCounterBits.ComputeInvocations = 0; + radeon->glCtx.Const.QueryCounterBits.ClInPrimitives = 0; + radeon->glCtx.Const.QueryCounterBits.ClOutPrimitives = 0; + return GL_TRUE; } @@ -302,7 +322,7 @@ radeon_bits_per_pixel(const struct radeon_renderbuffer *rb) */ void radeon_prepare_render(radeonContextPtr radeon) { - __DRIcontext *driContext = radeon->dri.context; + __DRIcontext *driContext = radeon->driContext; __DRIdrawable *drawable; __DRIscreen *screen; diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.h b/src/mesa/drivers/dri/radeon/radeon_common_context.h index dc72592b90c..d142a871b40 100644 --- a/src/mesa/drivers/dri/radeon/radeon_common_context.h +++ b/src/mesa/drivers/dri/radeon/radeon_common_context.h @@ -342,17 +342,6 @@ struct radeon_store { int elts_start; }; -struct radeon_dri_mirror { - __DRIcontext *context; /* DRI context */ - __DRIscreen *screen; /* DRI screen */ - - drm_context_t hwContext; - drm_hw_lock_t *hwLock; - int hwLockCount; - int fd; - int drmMinor; -}; - typedef void (*radeon_tri_func) (radeonContextPtr, radeonVertex *, radeonVertex *, radeonVertex *); @@ -385,6 +374,7 @@ struct radeon_cmdbuf { struct radeon_context { struct gl_context glCtx; /**< base class, must be first */ + __DRIcontext *driContext; /* DRI context */ radeonScreenPtr radeonScreen; /* Screen private DRI data */ /* Texture object bookkeeping @@ -407,9 +397,6 @@ struct radeon_context { /* Drawable information */ unsigned int lastStamp; - /* Mirrors of some DRI state */ - struct radeon_dri_mirror dri; - /* Busy waiting */ GLuint do_usleeps; GLuint do_irqs; @@ -502,12 +489,12 @@ static inline radeonContextPtr RADEON_CONTEXT(struct gl_context *ctx) static inline __DRIdrawable* radeon_get_drawable(radeonContextPtr radeon) { - return radeon->dri.context->driDrawablePriv; + return radeon->driContext->driDrawablePriv; } static inline __DRIdrawable* radeon_get_readable(radeonContextPtr radeon) { - return radeon->dri.context->driReadablePriv; + return radeon->driContext->driReadablePriv; } extern const char const *radeonVendorString; diff --git a/src/mesa/drivers/dri/radeon/radeon_context.c b/src/mesa/drivers/dri/radeon/radeon_context.c index d4d19354b6d..a9e2ab563d3 100644 --- a/src/mesa/drivers/dri/radeon/radeon_context.c +++ b/src/mesa/drivers/dri/radeon/radeon_context.c @@ -191,16 +191,8 @@ r100CreateContext( gl_api api, rmesa->radeon.initialMaxAnisotropy = driQueryOptionf(&rmesa->radeon.optionCache, "def_max_anisotropy"); - if ( driQueryOptionb( &rmesa->radeon.optionCache, "hyperz" ) ) { - if ( sPriv->drm_version.minor < 13 ) - fprintf( stderr, "DRM version 1.%d too old to support HyperZ, " - "disabling.\n", sPriv->drm_version.minor ); - else - rmesa->using_hyperz = GL_TRUE; - } - - if ( 
sPriv->drm_version.minor >= 15 ) - rmesa->texmicrotile = GL_TRUE; + if (driQueryOptionb(&rmesa->radeon.optionCache, "hyperz")) + rmesa->using_hyperz = GL_TRUE; /* Init default driver functions then plug in our Radeon-specific functions * (the texture functions are especially important) diff --git a/src/mesa/drivers/dri/radeon/radeon_context.h b/src/mesa/drivers/dri/radeon/radeon_context.h index 40325327813..badabd9508c 100644 --- a/src/mesa/drivers/dri/radeon/radeon_context.h +++ b/src/mesa/drivers/dri/radeon/radeon_context.h @@ -426,7 +426,6 @@ struct r100_context { struct r100_swtcl_info swtcl; GLboolean using_hyperz; - GLboolean texmicrotile; /* Performance counters */ diff --git a/src/mesa/drivers/dri/radeon/radeon_fbo.c b/src/mesa/drivers/dri/radeon/radeon_fbo.c index ef62d097bae..5eece518c95 100644 --- a/src/mesa/drivers/dri/radeon/radeon_fbo.c +++ b/src/mesa/drivers/dri/radeon/radeon_fbo.c @@ -169,6 +169,7 @@ radeon_map_renderbuffer_s8z24(struct gl_context *ctx, rrb->map_buffer = malloc(w * h * 4); ret = radeon_bo_map(rrb->bo, !!(mode & GL_MAP_WRITE_BIT)); assert(!ret); + (void) ret; untiled_s8z24_map = rrb->map_buffer; tiled_s8z24_map = rrb->bo->ptr; @@ -207,6 +208,7 @@ radeon_map_renderbuffer_z16(struct gl_context *ctx, rrb->map_buffer = malloc(w * h * 2); ret = radeon_bo_map(rrb->bo, !!(mode & GL_MAP_WRITE_BIT)); assert(!ret); + (void) ret; untiled_z16_map = rrb->map_buffer; tiled_z16_map = rrb->bo->ptr; @@ -324,6 +326,7 @@ radeon_map_renderbuffer(struct gl_context *ctx, ret = radeon_bo_map(rrb->bo, !!(mode & GL_MAP_WRITE_BIT)); assert(!ret); + (void) ret; map = rrb->bo->ptr; stride = rrb->map_pitch; @@ -416,7 +419,6 @@ radeon_unmap_renderbuffer(struct gl_context *ctx, { struct radeon_context *const rmesa = RADEON_CONTEXT(ctx); struct radeon_renderbuffer *rrb = radeon_renderbuffer(rb); - GLboolean ok; if ((rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_DEPTH_ALWAYS_TILED) && !rrb->has_surface) { if (rb->Format == MESA_FORMAT_Z24_UNORM_S8_UINT || rb->Format == MESA_FORMAT_Z24_UNORM_X8_UINT) { @@ -438,6 +440,7 @@ radeon_unmap_renderbuffer(struct gl_context *ctx, radeon_bo_unmap(rrb->map_bo); if (rrb->map_mode & GL_MAP_WRITE_BIT) { + GLboolean ok; ok = rmesa->vtbl.blit(ctx, rrb->map_bo, 0, rb->Format, rrb->map_pitch / rrb->cpp, rrb->map_w, rrb->map_h, @@ -449,6 +452,7 @@ radeon_unmap_renderbuffer(struct gl_context *ctx, rrb->map_w, rrb->map_h, GL_FALSE); assert(ok); + (void) ok; } radeon_bo_unref(rrb->map_bo); @@ -700,7 +704,7 @@ radeon_bind_framebuffer(struct gl_context * ctx, GLenum target, radeon_print(RADEON_TEXTURE, RADEON_TRACE, "%s(%p, fb %p, target %s) \n", __func__, ctx, fb, - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); if (target == GL_FRAMEBUFFER_EXT || target == GL_DRAW_FRAMEBUFFER_EXT) { radeon_draw_buffer(ctx, fb); diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c index 28591cad895..c71766d0a3e 100644 --- a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c +++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c @@ -276,7 +276,7 @@ static void calculate_min_max_lod(struct gl_sampler_object *samp, struct gl_text radeon_print(RADEON_TEXTURE, RADEON_TRACE, "%s(%p) target %s, min %d, max %d.\n", __func__, tObj, - _mesa_lookup_enum_by_nr(tObj->Target), + _mesa_enum_to_string(tObj->Target), minLod, maxLod); /* save these values */ diff --git a/src/mesa/drivers/dri/radeon/radeon_pixel_read.c b/src/mesa/drivers/dri/radeon/radeon_pixel_read.c index 6998444fb66..e115b749da5 100644 --- 
a/src/mesa/drivers/dri/radeon/radeon_pixel_read.c +++ b/src/mesa/drivers/dri/radeon/radeon_pixel_read.c @@ -212,7 +212,7 @@ radeonReadPixels(struct gl_context * ctx, */ radeon_print(RADEON_FALLBACKS, RADEON_NORMAL, "Falling back to sw for ReadPixels (format %s, type %s)\n", - _mesa_lookup_enum_by_nr(format), _mesa_lookup_enum_by_nr(type)); + _mesa_enum_to_string(format), _mesa_enum_to_string(type)); if (ctx->NewState) _mesa_update_state(ctx); diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c index 45d9b2b8c0b..98b4741b456 100644 --- a/src/mesa/drivers/dri/radeon/radeon_screen.c +++ b/src/mesa/drivers/dri/radeon/radeon_screen.c @@ -135,36 +135,26 @@ DRI_CONF_END static int radeonGetParam(__DRIscreen *sPriv, int param, void *value) { - int ret; - drm_radeon_getparam_t gp = { 0 }; struct drm_radeon_info info = { 0 }; - if (sPriv->drm_version.major >= 2) { - info.value = (uint64_t)(uintptr_t)value; - switch (param) { - case RADEON_PARAM_DEVICE_ID: - info.request = RADEON_INFO_DEVICE_ID; - break; - case RADEON_PARAM_NUM_GB_PIPES: - info.request = RADEON_INFO_NUM_GB_PIPES; - break; - case RADEON_PARAM_NUM_Z_PIPES: - info.request = RADEON_INFO_NUM_Z_PIPES; - break; - case RADEON_INFO_TILE_CONFIG: - info.request = RADEON_INFO_TILE_CONFIG; - break; - default: - return -EINVAL; - } - ret = drmCommandWriteRead(sPriv->fd, DRM_RADEON_INFO, &info, sizeof(info)); - } else { - gp.param = param; - gp.value = value; - - ret = drmCommandWriteRead(sPriv->fd, DRM_RADEON_GETPARAM, &gp, sizeof(gp)); + info.value = (uint64_t)(uintptr_t)value; + switch (param) { + case RADEON_PARAM_DEVICE_ID: + info.request = RADEON_INFO_DEVICE_ID; + break; + case RADEON_PARAM_NUM_GB_PIPES: + info.request = RADEON_INFO_NUM_GB_PIPES; + break; + case RADEON_PARAM_NUM_Z_PIPES: + info.request = RADEON_INFO_NUM_Z_PIPES; + break; + case RADEON_INFO_TILE_CONFIG: + info.request = RADEON_INFO_TILE_CONFIG; + break; + default: + return -EINVAL; } - return ret; + return drmCommandWriteRead(sPriv->fd, DRM_RADEON_INFO, &info, sizeof(info)); } #if defined(RADEON_R100) diff --git a/src/mesa/drivers/dri/radeon/radeon_state.c b/src/mesa/drivers/dri/radeon/radeon_state.c index cba3d9c9689..74c1fc6c902 100644 --- a/src/mesa/drivers/dri/radeon/radeon_state.c +++ b/src/mesa/drivers/dri/radeon/radeon_state.c @@ -1354,7 +1354,7 @@ void radeonUpdateWindow( struct gl_context *ctx ) GLfloat xoffset = 0.0; GLfloat yoffset = dPriv ? (GLfloat) dPriv->h : 0; const GLboolean render_to_fbo = (ctx->DrawBuffer ? _mesa_is_user_fbo(ctx->DrawBuffer) : 0); - double scale[3], translate[3]; + float scale[3], translate[3]; GLfloat y_scale, y_bias; if (render_to_fbo) { @@ -1452,7 +1452,7 @@ static void radeonEnable( struct gl_context *ctx, GLenum cap, GLboolean state ) if ( RADEON_DEBUG & RADEON_STATE ) fprintf( stderr, "%s( %s = %s )\n", __func__, - _mesa_lookup_enum_by_nr( cap ), + _mesa_enum_to_string( cap ), state ? 
"GL_TRUE" : "GL_FALSE" ); switch ( cap ) { diff --git a/src/mesa/drivers/dri/radeon/radeon_state_init.c b/src/mesa/drivers/dri/radeon/radeon_state_init.c index c800edfc7be..5e2f41fdb4a 100644 --- a/src/mesa/drivers/dri/radeon/radeon_state_init.c +++ b/src/mesa/drivers/dri/radeon/radeon_state_init.c @@ -336,12 +336,15 @@ static void ctx_emit_cs(struct gl_context *ctx, struct radeon_state_atom *atom) atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888; else switch (rrb->base.Base.Format) { case MESA_FORMAT_B5G6R5_UNORM: + case MESA_FORMAT_R5G6B5_UNORM: atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565; break; case MESA_FORMAT_B4G4R4A4_UNORM: + case MESA_FORMAT_A4R4G4B4_UNORM: atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB4444; break; case MESA_FORMAT_B5G5R5A1_UNORM: + case MESA_FORMAT_A1R5G5B5_UNORM: atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB1555; break; default: diff --git a/src/mesa/drivers/dri/radeon/radeon_swtcl.c b/src/mesa/drivers/dri/radeon/radeon_swtcl.c index 8a1fbab39f8..2fbd353297b 100644 --- a/src/mesa/drivers/dri/radeon/radeon_swtcl.c +++ b/src/mesa/drivers/dri/radeon/radeon_swtcl.c @@ -442,7 +442,7 @@ static GLboolean radeon_run_render( struct gl_context *ctx, radeon_print(RADEON_SWRENDER, RADEON_NORMAL, "radeon_render.c: prim %s %d..%d\n", - _mesa_lookup_enum_by_nr(prim & PRIM_MODE_MASK), + _mesa_enum_to_string(prim & PRIM_MODE_MASK), start, start+length); if (length) diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.c b/src/mesa/drivers/dri/radeon/radeon_tex.c index 353fdb00ec8..0955a135de8 100644 --- a/src/mesa/drivers/dri/radeon/radeon_tex.c +++ b/src/mesa/drivers/dri/radeon/radeon_tex.c @@ -263,7 +263,7 @@ static void radeonTexEnv( struct gl_context *ctx, GLenum target, if ( RADEON_DEBUG & RADEON_STATE ) { fprintf( stderr, "%s( %s )\n", - __func__, _mesa_lookup_enum_by_nr( pname ) ); + __func__, _mesa_enum_to_string( pname ) ); } switch ( pname ) { @@ -335,7 +335,7 @@ static void radeonTexParameter( struct gl_context *ctx, radeonTexObj* t = radeon_tex_obj(texObj); radeon_print(RADEON_TEXTURE, RADEON_VERBOSE, "%s( %s )\n", __func__, - _mesa_lookup_enum_by_nr( pname ) ); + _mesa_enum_to_string( pname ) ); switch ( pname ) { case GL_TEXTURE_BASE_LEVEL: @@ -359,7 +359,7 @@ static void radeonDeleteTexture( struct gl_context *ctx, radeon_print(RADEON_TEXTURE, RADEON_NORMAL, "%s( %p (target = %s) )\n", __func__, (void *)texObj, - _mesa_lookup_enum_by_nr( texObj->Target ) ); + _mesa_enum_to_string( texObj->Target ) ); if ( rmesa ) { radeon_firevertices(&rmesa->radeon); diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.h b/src/mesa/drivers/dri/radeon/radeon_tex.h index fa57c08987d..f8ec432755a 100644 --- a/src/mesa/drivers/dri/radeon/radeon_tex.h +++ b/src/mesa/drivers/dri/radeon/radeon_tex.h @@ -51,4 +51,39 @@ extern void radeonTexUpdateParameters(struct gl_context *ctx, GLuint unit); extern void radeonInitTextureFuncs( radeonContextPtr radeon, struct dd_function_table *functions ); +struct tx_table { + GLuint format, filter; +}; + +/* XXX verify this table against MESA_FORMAT_x values */ +static const struct tx_table tx_table[] = +{ + [ MESA_FORMAT_NONE ] = { 0xffffffff, 0 }, + [ MESA_FORMAT_A8B8G8R8_UNORM ] = { RADEON_TXFORMAT_RGBA8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_R8G8B8A8_UNORM ] = { RADEON_TXFORMAT_RGBA8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_B8G8R8A8_UNORM ] = { RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_A8R8G8B8_UNORM ] = { RADEON_TXFORMAT_ARGB8888 | 
RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_BGR_UNORM8 ] = { RADEON_TXFORMAT_ARGB8888, 0 }, + [ MESA_FORMAT_B5G6R5_UNORM ] = { RADEON_TXFORMAT_RGB565, 0 }, + [ MESA_FORMAT_R5G6B5_UNORM ] = { RADEON_TXFORMAT_RGB565, 0 }, + [ MESA_FORMAT_B4G4R4A4_UNORM ] = { RADEON_TXFORMAT_ARGB4444 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_A4R4G4B4_UNORM ] = { RADEON_TXFORMAT_ARGB4444 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_B5G5R5A1_UNORM ] = { RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_A1R5G5B5_UNORM ] = { RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_L8A8_UNORM ] = { RADEON_TXFORMAT_AI88 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_A8L8_UNORM ] = { RADEON_TXFORMAT_AI88 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_A_UNORM8 ] = { RADEON_TXFORMAT_I8 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_L_UNORM8 ] = { RADEON_TXFORMAT_I8, 0 }, + [ MESA_FORMAT_I_UNORM8 ] = { RADEON_TXFORMAT_I8 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_YCBCR ] = { RADEON_TXFORMAT_YVYU422, RADEON_YUV_TO_RGB }, + [ MESA_FORMAT_YCBCR_REV ] = { RADEON_TXFORMAT_VYUY422, RADEON_YUV_TO_RGB }, + [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 }, + [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 }, + [ MESA_FORMAT_RGB_DXT1 ] = { RADEON_TXFORMAT_DXT1, 0 }, + [ MESA_FORMAT_RGBA_DXT1 ] = { RADEON_TXFORMAT_DXT1 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_RGBA_DXT3 ] = { RADEON_TXFORMAT_DXT23 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_RGBA_DXT5 ] = { RADEON_TXFORMAT_DXT45 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, +}; + + #endif /* __RADEON_TEX_H__ */ diff --git a/src/mesa/drivers/dri/radeon/radeon_texstate.c b/src/mesa/drivers/dri/radeon/radeon_texstate.c index 45667efb65f..ec835f248eb 100644 --- a/src/mesa/drivers/dri/radeon/radeon_texstate.c +++ b/src/mesa/drivers/dri/radeon/radeon_texstate.c @@ -53,53 +53,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
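The table above (and the r200 pair earlier in this patch) relies on C99 designated array initializers keyed by mesa_format values, so any format that is not listed ends up as a zero-filled entry. A standalone illustration of the pattern; the enum and register values here are invented for the example and unrelated to the real RADEON_TXFORMAT_* bits:

#include <stdio.h>

enum fmt { FMT_NONE, FMT_RGB565, FMT_ARGB8888, FMT_COUNT };

struct fmt_entry { unsigned hw_bits, filter_bits; };

/* Entries are placed by enum value; anything not named (FMT_NONE here)
 * is implicitly zero-initialized. */
static const struct fmt_entry fmt_table[FMT_COUNT] = {
   [FMT_RGB565]   = { 0x03, 0 },
   [FMT_ARGB8888] = { 0x06, 0 },
};

int main(void)
{
   printf("RGB565 -> 0x%02x\n", fmt_table[FMT_RGB565].hw_bits);   /* 0x03 */
   printf("NONE   -> 0x%02x\n", fmt_table[FMT_NONE].hw_bits);     /* 0x00 */
   return 0;
}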
#include "radeon_tcl.h" -#define RADEON_TXFORMAT_A8 RADEON_TXFORMAT_I8 -#define RADEON_TXFORMAT_L8 RADEON_TXFORMAT_I8 -#define RADEON_TXFORMAT_AL88 RADEON_TXFORMAT_AI88 -#define RADEON_TXFORMAT_YCBCR RADEON_TXFORMAT_YVYU422 -#define RADEON_TXFORMAT_YCBCR_REV RADEON_TXFORMAT_VYUY422 -#define RADEON_TXFORMAT_RGB_DXT1 RADEON_TXFORMAT_DXT1 -#define RADEON_TXFORMAT_RGBA_DXT1 RADEON_TXFORMAT_DXT1 -#define RADEON_TXFORMAT_RGBA_DXT3 RADEON_TXFORMAT_DXT23 -#define RADEON_TXFORMAT_RGBA_DXT5 RADEON_TXFORMAT_DXT45 - #define VALID_FORMAT(f) ( ((f) <= MESA_FORMAT_RGBA_DXT5) \ && (tx_table[f].format != 0xffffffff) ) -struct tx_table { - GLuint format, filter; -}; - -/* XXX verify this table against MESA_FORMAT_x values */ -static const struct tx_table tx_table[] = -{ - [ MESA_FORMAT_NONE ] = { 0xffffffff, 0 }, - [ MESA_FORMAT_A8B8G8R8_UNORM ] = { RADEON_TXFORMAT_RGBA8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_R8G8B8A8_UNORM ] = { RADEON_TXFORMAT_RGBA8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_B8G8R8A8_UNORM ] = { RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A8R8G8B8_UNORM ] = { RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_BGR_UNORM8 ] = { RADEON_TXFORMAT_ARGB8888, 0 }, - [ MESA_FORMAT_B5G6R5_UNORM ] = { RADEON_TXFORMAT_RGB565, 0 }, - [ MESA_FORMAT_R5G6B5_UNORM ] = { RADEON_TXFORMAT_RGB565, 0 }, - [ MESA_FORMAT_B4G4R4A4_UNORM ] = { RADEON_TXFORMAT_ARGB4444 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A4R4G4B4_UNORM ] = { RADEON_TXFORMAT_ARGB4444 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_B5G5R5A1_UNORM ] = { RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A1R5G5B5_UNORM ] = { RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_L8A8_UNORM ] = { RADEON_TXFORMAT_AL88 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A8L8_UNORM ] = { RADEON_TXFORMAT_AL88 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A_UNORM8 ] = { RADEON_TXFORMAT_A8 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_L_UNORM8 ] = { RADEON_TXFORMAT_L8, 0 }, - [ MESA_FORMAT_I_UNORM8 ] = { RADEON_TXFORMAT_I8 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_YCBCR ] = { RADEON_TXFORMAT_YCBCR, RADEON_YUV_TO_RGB }, - [ MESA_FORMAT_YCBCR_REV ] = { RADEON_TXFORMAT_YCBCR_REV, RADEON_YUV_TO_RGB }, - [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 }, - [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 }, - [ MESA_FORMAT_RGB_DXT1 ] = { RADEON_TXFORMAT_RGB_DXT1, 0 }, - [ MESA_FORMAT_RGBA_DXT1 ] = { RADEON_TXFORMAT_RGBA_DXT1 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_RGBA_DXT3 ] = { RADEON_TXFORMAT_RGBA_DXT3 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_RGBA_DXT5 ] = { RADEON_TXFORMAT_RGBA_DXT5 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, -}; - /* ================================================================ * Texture combine functions */ diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c index edfd48b283b..4794ddae069 100644 --- a/src/mesa/drivers/dri/radeon/radeon_texture.c +++ b/src/mesa/drivers/dri/radeon/radeon_texture.c @@ -224,7 +224,19 @@ static mesa_format radeonChoose8888TexFormat(radeonContextPtr rmesa, const GLuint ui = 1; const GLubyte littleEndian = *((const GLubyte *)&ui); - if (fbo) + + /* Unfortunately, regardless the fbo flag, we might still be asked to + * attach a texture to a fbo later, which then won't succeed if we chose + * one which isn't renderable. 
And unlike more exotic formats, apps aren't + * really prepared for the incomplete framebuffer this results in (they'd + * have to retry with same internalFormat even, just different + * srcFormat/srcType, which can't really be expected anyway). + * Ideally, we'd defer format selection until later (if the texture is + * used as a rt it's likely there's never data uploaded to it before attached + * to a fbo), but this isn't really possible, so for now just always use + * a renderable format. + */ + if (1 || fbo) return _radeon_texformat_argb8888; if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8) || @@ -267,8 +279,8 @@ mesa_format radeonChooseTextureFormat(struct gl_context * ctx, radeon_print(RADEON_TEXTURE, RADEON_TRACE, "%s InternalFormat=%s(%d) type=%s format=%s\n", __func__, - _mesa_lookup_enum_by_nr(internalFormat), internalFormat, - _mesa_lookup_enum_by_nr(type), _mesa_lookup_enum_by_nr(format)); + _mesa_enum_to_string(internalFormat), internalFormat, + _mesa_enum_to_string(type), _mesa_enum_to_string(format)); radeon_print(RADEON_TEXTURE, RADEON_TRACE, "%s do32bpt=%d force16bpt=%d\n", __func__, do32bpt, force16bpt); @@ -531,7 +543,7 @@ void radeon_image_target_texture_2d(struct gl_context *ctx, GLenum target, __DRIscreen *screen; __DRIimage *image; - screen = radeon->dri.screen; + screen = radeon->radeonScreen->driScreen; image = screen->dri2.image->lookupEGLImage(screen, image_handle, screen->loaderPrivate); if (image == NULL) diff --git a/src/mesa/drivers/dri/swrast/Makefile.am b/src/mesa/drivers/dri/swrast/Makefile.am index bfc3c10e334..9d21d9ea4dc 100644 --- a/src/mesa/drivers/dri/swrast/Makefile.am +++ b/src/mesa/drivers/dri/swrast/Makefile.am @@ -24,7 +24,6 @@ include Makefile.sources AM_CFLAGS = \ - -D__NOT_HAVE_DRM_H \ -I$(top_srcdir)/include \ -I$(top_srcdir)/src/ \ -I$(top_srcdir)/src/mapi \ @@ -33,6 +32,7 @@ AM_CFLAGS = \ -I$(top_srcdir)/src/gallium/auxiliary \ -I$(top_srcdir)/src/mesa/drivers/dri/common \ -I$(top_builddir)/src/mesa/drivers/dri/common \ + $(LIBDRM_CFLAGS) \ $(DEFINES) \ $(VISIBILITY_CFLAGS) diff --git a/src/mesa/drivers/osmesa/osmesa.c b/src/mesa/drivers/osmesa/osmesa.c index 022523eb00b..5c7dcac3841 100644 --- a/src/mesa/drivers/osmesa/osmesa.c +++ b/src/mesa/drivers/osmesa/osmesa.c @@ -1124,7 +1124,7 @@ static struct name_function functions[] = { { "OSMesaDestroyContext", (OSMESAproc) OSMesaDestroyContext }, { "OSMesaMakeCurrent", (OSMESAproc) OSMesaMakeCurrent }, { "OSMesaGetCurrentContext", (OSMESAproc) OSMesaGetCurrentContext }, - { "OSMesaPixelsStore", (OSMESAproc) OSMesaPixelStore }, + { "OSMesaPixelStore", (OSMESAproc) OSMesaPixelStore }, { "OSMesaGetIntegerv", (OSMESAproc) OSMesaGetIntegerv }, { "OSMesaGetDepthBuffer", (OSMESAproc) OSMesaGetDepthBuffer }, { "OSMesaGetColorBuffer", (OSMESAproc) OSMesaGetColorBuffer }, diff --git a/src/mesa/main/api_validate.c b/src/mesa/main/api_validate.c index 9c2e29e6472..53c8fb893b5 100644 --- a/src/mesa/main/api_validate.c +++ b/src/mesa/main/api_validate.c @@ -69,6 +69,25 @@ check_valid_to_render(struct gl_context *ctx, const char *function) return false; } + /* The spec argues that this is allowed because a tess ctrl shader + * without a tess eval shader can be used with transform feedback. + * However, glBeginTransformFeedback doesn't allow GL_PATCHES and + * therefore doesn't allow tessellation. 
+ * + * Further investigation showed that this is indeed a spec bug and + * a tess ctrl shader without a tess eval shader shouldn't have been + * allowed, because there is no API in GL 4.0 that can make use this + * to produce something useful. + * + * Also, all vendors except one don't support a tess ctrl shader without + * a tess eval shader anyway. + */ + if (ctx->TessCtrlProgram._Current && !ctx->TessEvalProgram._Current) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "%s(tess eval shader is missing)", function); + return false; + } + /* Section 7.3 (Program Objects) of the OpenGL 4.5 Core Profile spec * says: * @@ -127,6 +146,9 @@ _mesa_is_valid_prim_mode(struct gl_context *ctx, GLenum mode) if (mode <= GL_TRIANGLE_STRIP_ADJACENCY) return _mesa_has_geometry_shaders(ctx); + if (mode == GL_PATCHES) + return _mesa_has_tessellation(ctx); + return false; } @@ -136,6 +158,7 @@ _mesa_is_valid_prim_mode(struct gl_context *ctx, GLenum mode) * etc? Also, do additional checking related to transformation feedback. * Note: this function cannot be called during glNewList(GL_COMPILE) because * this code depends on current transform feedback state. + * Also, do additional checking related to tessellation shaders. */ GLboolean _mesa_valid_prim_mode(struct gl_context *ctx, GLenum mode, const char *name) @@ -170,11 +193,29 @@ _mesa_valid_prim_mode(struct gl_context *ctx, GLenum mode, const char *name) * TRIANGLES_ADJACENCY_ARB and <mode> is not * TRIANGLES_ADJACENCY_ARB or TRIANGLE_STRIP_ADJACENCY_ARB. * + * The GL spec doesn't mention any interaction with tessellation, which + * is clearly a spec bug. The same rule should apply, but instead of + * the draw primitive mode, the tessellation evaluation shader primitive + * mode should be used for the checking. */ if (ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY]) { const GLenum geom_mode = ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY]->Geom.InputType; - switch (mode) { + struct gl_shader_program *tes = + ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL]; + GLenum mode_before_gs = mode; + + if (tes) { + if (tes->TessEval.PointMode) + mode_before_gs = GL_POINTS; + else if (tes->TessEval.PrimitiveMode == GL_ISOLINES) + mode_before_gs = GL_LINES; + else + /* the GL_QUADS mode generates triangles too */ + mode_before_gs = GL_TRIANGLES; + } + + switch (mode_before_gs) { case GL_POINTS: valid_enum = (geom_mode == GL_POINTS); break; @@ -209,12 +250,42 @@ _mesa_valid_prim_mode(struct gl_context *ctx, GLenum mode, const char *name) _mesa_error(ctx, GL_INVALID_OPERATION, "%s(mode=%s vs geometry shader input %s)", name, - _mesa_lookup_prim_by_nr(mode), + _mesa_lookup_prim_by_nr(mode_before_gs), _mesa_lookup_prim_by_nr(geom_mode)); return GL_FALSE; } } + /* From the OpenGL 4.0 (Core Profile) spec (section 2.12): + * + * "Tessellation operates only on patch primitives. If tessellation is + * active, any command that transfers vertices to the GL will + * generate an INVALID_OPERATION error if the primitive mode is not + * PATCHES. + * Patch primitives are not supported by pipeline stages below the + * tessellation evaluation shader. If there is no active program + * object or the active program object does not contain a tessellation + * evaluation shader, the error INVALID_OPERATION is generated by any + * command that transfers vertices to the GL if the primitive mode is + * PATCHES." 
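The geometry shader compatibility check above is now made against mode_before_gs, the primitive type that tessellation actually feeds to the next stage, rather than the GL_PATCHES mode passed to the draw call; the same mapping is reused further down for the transform feedback mode check. A compact sketch of that mapping using the fields referenced in the patch; the helper function itself is illustrative:

/* Primitive type leaving tessellation, as used for both the geometry
 * shader input check and the transform feedback mode check. */
static GLenum prim_mode_after_tess(const struct gl_shader_program *tes,
                                   GLenum draw_mode)
{
   if (!tes)
      return draw_mode;                 /* no tess eval shader bound */
   if (tes->TessEval.PointMode)
      return GL_POINTS;                 /* point mode wins */
   if (tes->TessEval.PrimitiveMode == GL_ISOLINES)
      return GL_LINES;
   return GL_TRIANGLES;                 /* triangles and quads both yield triangles */
}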
+ * + */ + if (ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL] || + ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL]) { + if (mode != GL_PATCHES) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "only GL_PATCHES valid with tessellation"); + return GL_FALSE; + } + } + else { + if (mode == GL_PATCHES) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "GL_PATCHES only valid with tessellation"); + return GL_FALSE; + } + } + /* From the GL_EXT_transform_feedback spec: * * "The error INVALID_OPERATION is generated if Begin, or any command @@ -247,6 +318,17 @@ _mesa_valid_prim_mode(struct gl_context *ctx, GLenum mode, const char *name) pass = GL_FALSE; } } + else if (ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL]) { + struct gl_shader_program *tes = + ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL]; + + if (tes->TessEval.PointMode) + pass = ctx->TransformFeedback.Mode == GL_POINTS; + else if (tes->TessEval.PrimitiveMode == GL_ISOLINES) + pass = ctx->TransformFeedback.Mode == GL_LINES; + else + pass = ctx->TransformFeedback.Mode == GL_TRIANGLES; + } else { switch (mode) { case GL_POINTS: @@ -291,7 +373,7 @@ valid_elements_type(struct gl_context *ctx, GLenum type, const char *name) default: _mesa_error(ctx, GL_INVALID_ENUM, "%s(type = %s)", name, - _mesa_lookup_enum_by_nr(type)); + _mesa_enum_to_string(type)); return false; } } diff --git a/src/mesa/main/atifragshader.c b/src/mesa/main/atifragshader.c index 9fc35520a38..935ba05b7cc 100644 --- a/src/mesa/main/atifragshader.c +++ b/src/mesa/main/atifragshader.c @@ -132,21 +132,21 @@ static void debug_op(GLint optype, GLuint arg_count, GLenum op, GLuint dst, op_name = atifs_ops[(arg_count-1)+(optype?3:0)]; - fprintf(stderr, "%s(%s, %s", op_name, _mesa_lookup_enum_by_nr(op), - _mesa_lookup_enum_by_nr(dst)); + fprintf(stderr, "%s(%s, %s", op_name, _mesa_enum_to_string(op), + _mesa_enum_to_string(dst)); if (!optype) fprintf(stderr, ", %d", dstMask); fprintf(stderr, ", %s", create_dst_mod_str(dstMod)); - fprintf(stderr, ", %s, %s, %d", _mesa_lookup_enum_by_nr(arg1), - _mesa_lookup_enum_by_nr(arg1Rep), arg1Mod); + fprintf(stderr, ", %s, %s, %d", _mesa_enum_to_string(arg1), + _mesa_enum_to_string(arg1Rep), arg1Mod); if (arg_count>1) - fprintf(stderr, ", %s, %s, %d", _mesa_lookup_enum_by_nr(arg2), - _mesa_lookup_enum_by_nr(arg2Rep), arg2Mod); + fprintf(stderr, ", %s, %s, %d", _mesa_enum_to_string(arg2), + _mesa_enum_to_string(arg2Rep), arg2Mod); if (arg_count>2) - fprintf(stderr, ", %s, %s, %d", _mesa_lookup_enum_by_nr(arg3), - _mesa_lookup_enum_by_nr(arg3Rep), arg3Mod); + fprintf(stderr, ", %s, %s, %d", _mesa_enum_to_string(arg3), + _mesa_enum_to_string(arg3Rep), arg3Mod); fprintf(stderr,")\n"); @@ -383,7 +383,7 @@ _mesa_EndFragmentShaderATI(void) for (j = 0; j < MAX_NUM_PASSES_ATI; j++) { for (i = 0; i < MAX_NUM_FRAGMENT_REGISTERS_ATI; i++) { GLuint op = curProg->SetupInst[j][i].Opcode; - const char *op_enum = op > 5 ? _mesa_lookup_enum_by_nr(op) : "0"; + const char *op_enum = op > 5 ? _mesa_enum_to_string(op) : "0"; GLuint src = curProg->SetupInst[j][i].src; GLuint swizzle = curProg->SetupInst[j][i].swizzle; fprintf(stderr, "%2d %04X %s %d %04X\n", i, op, op_enum, src, @@ -392,8 +392,8 @@ _mesa_EndFragmentShaderATI(void) for (i = 0; i < curProg->numArithInstr[j]; i++) { GLuint op0 = curProg->Instructions[j][i].Opcode[0]; GLuint op1 = curProg->Instructions[j][i].Opcode[1]; - const char *op0_enum = op0 > 5 ? _mesa_lookup_enum_by_nr(op0) : "0"; - const char *op1_enum = op1 > 5 ? _mesa_lookup_enum_by_nr(op1) : "0"; + const char *op0_enum = op0 > 5 ? 
_mesa_enum_to_string(op0) : "0"; + const char *op1_enum = op1 > 5 ? _mesa_enum_to_string(op1) : "0"; GLuint count0 = curProg->Instructions[j][i].ArgCount[0]; GLuint count1 = curProg->Instructions[j][i].ArgCount[1]; fprintf(stderr, "%2d %04X %s %d %04X %s %d\n", i, op0, op0_enum, count0, @@ -477,8 +477,8 @@ _mesa_PassTexCoordATI(GLuint dst, GLuint coord, GLenum swizzle) #if MESA_DEBUG_ATI_FS _mesa_debug(ctx, "%s(%s, %s, %s)\n", __func__, - _mesa_lookup_enum_by_nr(dst), _mesa_lookup_enum_by_nr(coord), - _mesa_lookup_enum_by_nr(swizzle)); + _mesa_enum_to_string(dst), _mesa_enum_to_string(coord), + _mesa_enum_to_string(swizzle)); #endif } @@ -550,8 +550,8 @@ _mesa_SampleMapATI(GLuint dst, GLuint interp, GLenum swizzle) #if MESA_DEBUG_ATI_FS _mesa_debug(ctx, "%s(%s, %s, %s)\n", __func__, - _mesa_lookup_enum_by_nr(dst), _mesa_lookup_enum_by_nr(interp), - _mesa_lookup_enum_by_nr(swizzle)); + _mesa_enum_to_string(dst), _mesa_enum_to_string(interp), + _mesa_enum_to_string(swizzle)); #endif } diff --git a/src/mesa/main/attrib.c b/src/mesa/main/attrib.c index 53626e38be9..08f13178f84 100644 --- a/src/mesa/main/attrib.c +++ b/src/mesa/main/attrib.c @@ -937,7 +937,7 @@ _mesa_PopAttrib(void) if (MESA_VERBOSE & VERBOSE_API) { _mesa_debug(ctx, "glPopAttrib %s\n", - _mesa_lookup_enum_by_nr(attr->kind)); + _mesa_enum_to_string(attr->kind)); } switch (attr->kind) { diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c index d869fa2aa09..4fc32962425 100644 --- a/src/mesa/main/blend.c +++ b/src/mesa/main/blend.c @@ -128,28 +128,28 @@ validate_blend_factors(struct gl_context *ctx, const char *func, if (!legal_src_factor(ctx, sfactorRGB)) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(sfactorRGB = %s)", func, - _mesa_lookup_enum_by_nr(sfactorRGB)); + _mesa_enum_to_string(sfactorRGB)); return GL_FALSE; } if (!legal_dst_factor(ctx, dfactorRGB)) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(dfactorRGB = %s)", func, - _mesa_lookup_enum_by_nr(dfactorRGB)); + _mesa_enum_to_string(dfactorRGB)); return GL_FALSE; } if (sfactorA != sfactorRGB && !legal_src_factor(ctx, sfactorA)) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(sfactorA = %s)", func, - _mesa_lookup_enum_by_nr(sfactorA)); + _mesa_enum_to_string(sfactorA)); return GL_FALSE; } if (dfactorA != dfactorRGB && !legal_dst_factor(ctx, dfactorA)) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(dfactorA = %s)", func, - _mesa_lookup_enum_by_nr(dfactorA)); + _mesa_enum_to_string(dfactorA)); return GL_FALSE; } @@ -208,10 +208,10 @@ _mesa_BlendFuncSeparate( GLenum sfactorRGB, GLenum dfactorRGB, if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glBlendFuncSeparate %s %s %s %s\n", - _mesa_lookup_enum_by_nr(sfactorRGB), - _mesa_lookup_enum_by_nr(dfactorRGB), - _mesa_lookup_enum_by_nr(sfactorA), - _mesa_lookup_enum_by_nr(dfactorA)); + _mesa_enum_to_string(sfactorRGB), + _mesa_enum_to_string(dfactorRGB), + _mesa_enum_to_string(sfactorA), + _mesa_enum_to_string(dfactorA)); if (!validate_blend_factors(ctx, "glBlendFuncSeparate", sfactorRGB, dfactorRGB, @@ -342,7 +342,7 @@ _mesa_BlendEquation( GLenum mode ) if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glBlendEquation(%s)\n", - _mesa_lookup_enum_by_nr(mode)); + _mesa_enum_to_string(mode)); if (!legal_blend_equation(ctx, mode)) { _mesa_error(ctx, GL_INVALID_ENUM, "glBlendEquation"); @@ -385,7 +385,7 @@ _mesa_BlendEquationiARB(GLuint buf, GLenum mode) if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glBlendEquationi(%u, %s)\n", - buf, _mesa_lookup_enum_by_nr(mode)); + buf, _mesa_enum_to_string(mode)); if (buf >= ctx->Const.MaxDrawBuffers) { 
_mesa_error(ctx, GL_INVALID_VALUE, "glBlendFuncSeparatei(buffer=%u)", @@ -421,8 +421,8 @@ _mesa_BlendEquationSeparate( GLenum modeRGB, GLenum modeA ) if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glBlendEquationSeparateEXT(%s %s)\n", - _mesa_lookup_enum_by_nr(modeRGB), - _mesa_lookup_enum_by_nr(modeA)); + _mesa_enum_to_string(modeRGB), + _mesa_enum_to_string(modeA)); if ( (modeRGB != modeA) && !ctx->Extensions.EXT_blend_equation_separate ) { _mesa_error(ctx, GL_INVALID_OPERATION, @@ -476,8 +476,8 @@ _mesa_BlendEquationSeparateiARB(GLuint buf, GLenum modeRGB, GLenum modeA) if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glBlendEquationSeparatei(%u, %s %s)\n", buf, - _mesa_lookup_enum_by_nr(modeRGB), - _mesa_lookup_enum_by_nr(modeA)); + _mesa_enum_to_string(modeRGB), + _mesa_enum_to_string(modeA)); if (buf >= ctx->Const.MaxDrawBuffers) { _mesa_error(ctx, GL_INVALID_VALUE, "glBlendEquationSeparatei(buffer=%u)", @@ -567,7 +567,10 @@ _mesa_AlphaFunc( GLenum func, GLclampf ref ) if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glAlphaFunc(%s, %f)\n", - _mesa_lookup_enum_by_nr(func), ref); + _mesa_enum_to_string(func), ref); + + if (ctx->Color.AlphaFunc == func && ctx->Color.AlphaRefUnclamped == ref) + return; /* no change */ switch (func) { case GL_NEVER: @@ -578,9 +581,6 @@ _mesa_AlphaFunc( GLenum func, GLclampf ref ) case GL_NOTEQUAL: case GL_GEQUAL: case GL_ALWAYS: - if (ctx->Color.AlphaFunc == func && ctx->Color.AlphaRefUnclamped == ref) - return; /* no change */ - FLUSH_VERTICES(ctx, _NEW_COLOR); ctx->Color.AlphaFunc = func; ctx->Color.AlphaRefUnclamped = ref; @@ -613,7 +613,7 @@ _mesa_LogicOp( GLenum opcode ) GET_CURRENT_CONTEXT(ctx); if (MESA_VERBOSE & VERBOSE_API) - _mesa_debug(ctx, "glLogicOp(%s)\n", _mesa_lookup_enum_by_nr(opcode)); + _mesa_debug(ctx, "glLogicOp(%s)\n", _mesa_enum_to_string(opcode)); switch (opcode) { case GL_CLEAR: @@ -790,7 +790,7 @@ _mesa_ClampColor(GLenum target, GLenum clamp) invalid_enum: _mesa_error(ctx, GL_INVALID_ENUM, "glClampColor(%s)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); } static GLboolean @@ -930,12 +930,10 @@ void _mesa_init_color( struct gl_context * ctx ) ctx->Color._ClampFragmentColor = GL_FALSE; ctx->Color.ClampReadColor = GL_FIXED_ONLY_ARB; - if (ctx->API == API_OPENGLES2) { - /* GLES 3 behaves as though GL_FRAMEBUFFER_SRGB is always enabled. */ - ctx->Color.sRGBEnabled = GL_TRUE; - } else { - ctx->Color.sRGBEnabled = GL_FALSE; - } + /* GLES 1/2/3 behaves as though GL_FRAMEBUFFER_SRGB is always enabled + * if EGL_KHR_gl_colorspace has been used to request sRGB. 
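
A usage sketch, not part of this change: the GLES sRGB behaviour referred to above only applies when the window-system surface was created with an sRGB colorspace via EGL_KHR_gl_colorspace. The helper below is illustrative; the display, config and native window handles are assumed to come from the usual EGL setup.

#include <EGL/egl.h>
#include <EGL/eglext.h>

/* Illustrative only: ask EGL for an sRGB-encoded window surface so that the
 * "GL_FRAMEBUFFER_SRGB always enabled" GLES behaviour described above applies. */
static EGLSurface
create_srgb_window_surface(EGLDisplay dpy, EGLConfig cfg, EGLNativeWindowType win)
{
   static const EGLint attribs[] = {
      EGL_GL_COLORSPACE_KHR, EGL_GL_COLORSPACE_SRGB_KHR,
      EGL_NONE
   };
   return eglCreateWindowSurface(dpy, cfg, win, attribs);
}
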
+ */ + ctx->Color.sRGBEnabled = _mesa_is_gles(ctx); } /*@}*/ diff --git a/src/mesa/main/blit.c b/src/mesa/main/blit.c index db8fee5a414..a32f1a42aea 100644 --- a/src/mesa/main/blit.c +++ b/src/mesa/main/blit.c @@ -37,6 +37,7 @@ #include "framebuffer.h" #include "glformats.h" #include "mtypes.h" +#include "macros.h" #include "state.h" @@ -59,6 +60,31 @@ find_attachment(const struct gl_framebuffer *fb, /** + * \return true if two regions overlap, false otherwise + */ +bool +_mesa_regions_overlap(int srcX0, int srcY0, + int srcX1, int srcY1, + int dstX0, int dstY0, + int dstX1, int dstY1) +{ + if (MAX2(srcX0, srcX1) < MIN2(dstX0, dstX1)) + return false; /* dst completely right of src */ + + if (MAX2(dstX0, dstX1) < MIN2(srcX0, srcX1)) + return false; /* dst completely left of src */ + + if (MAX2(srcY0, srcY1) < MIN2(dstY0, dstY1)) + return false; /* dst completely above src */ + + if (MAX2(dstY0, dstY1) < MIN2(srcY0, srcY1)) + return false; /* dst completely below src */ + + return true; /* some overlap */ +} + + +/** * Helper function for checking if the datatypes of color buffers are * compatible for glBlitFramebuffer. From the 3.1 spec, page 198: * @@ -186,7 +212,7 @@ _mesa_blit_framebuffer(struct gl_context *ctx, if (!is_valid_blit_filter(ctx, filter)) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid filter %s)", func, - _mesa_lookup_enum_by_nr(filter)); + _mesa_enum_to_string(filter)); return; } @@ -194,7 +220,7 @@ _mesa_blit_framebuffer(struct gl_context *ctx, filter == GL_SCALED_RESOLVE_NICEST_EXT) && (readFb->Visual.samples == 0 || drawFb->Visual.samples > 0)) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(%s: invalid samples)", func, - _mesa_lookup_enum_by_nr(filter)); + _mesa_enum_to_string(filter)); return; } @@ -522,7 +548,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, " %d, %d, %d, %d, 0x%x, %s)\n", srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, - mask, _mesa_lookup_enum_by_nr(filter)); + mask, _mesa_enum_to_string(filter)); _mesa_blit_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer, srcX0, srcY0, srcX1, srcY1, @@ -547,7 +573,7 @@ _mesa_BlitNamedFramebuffer(GLuint readFramebuffer, GLuint drawFramebuffer, readFramebuffer, drawFramebuffer, srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, - mask, _mesa_lookup_enum_by_nr(filter)); + mask, _mesa_enum_to_string(filter)); /* * According to PDF page 533 of the OpenGL 4.5 core spec (30.10.2014, diff --git a/src/mesa/main/blit.h b/src/mesa/main/blit.h index 54b946e3192..88dd4a9ec8d 100644 --- a/src/mesa/main/blit.h +++ b/src/mesa/main/blit.h @@ -28,6 +28,12 @@ #include "glheader.h" +extern bool +_mesa_regions_overlap(int srcX0, int srcY0, + int srcX1, int srcY1, + int dstX0, int dstY0, + int dstX1, int dstY1); + extern void _mesa_blit_framebuffer(struct gl_context *ctx, struct gl_framebuffer *readFb, diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c index 66dee680258..1cdea937f91 100644 --- a/src/mesa/main/bufferobj.c +++ b/src/mesa/main/bufferobj.c @@ -91,8 +91,9 @@ get_buffer_target(struct gl_context *ctx, GLenum target) case GL_COPY_WRITE_BUFFER: return &ctx->CopyWriteBuffer; case GL_DRAW_INDIRECT_BUFFER: - if (ctx->API == API_OPENGL_CORE && - ctx->Extensions.ARB_draw_indirect) { + if ((ctx->API == API_OPENGL_CORE && + ctx->Extensions.ARB_draw_indirect) || + _mesa_is_gles31(ctx)) { return &ctx->DrawIndirectBuffer; } break; @@ -112,6 +113,11 @@ get_buffer_target(struct gl_context *ctx, GLenum target) return &ctx->UniformBuffer; } break; + case GL_SHADER_STORAGE_BUFFER: + if 
(ctx->Extensions.ARB_shader_storage_buffer_object) { + return &ctx->ShaderStorageBuffer; + } + break; case GL_ATOMIC_COUNTER_BUFFER: if (ctx->Extensions.ARB_shader_atomic_counters) { return &ctx->AtomicBuffer; @@ -831,6 +837,9 @@ _mesa_init_buffer_objects( struct gl_context *ctx ) _mesa_reference_buffer_object(ctx, &ctx->UniformBuffer, ctx->Shared->NullBufferObj); + _mesa_reference_buffer_object(ctx, &ctx->ShaderStorageBuffer, + ctx->Shared->NullBufferObj); + _mesa_reference_buffer_object(ctx, &ctx->AtomicBuffer, ctx->Shared->NullBufferObj); @@ -845,6 +854,14 @@ _mesa_init_buffer_objects( struct gl_context *ctx ) ctx->UniformBufferBindings[i].Size = -1; } + for (i = 0; i < MAX_COMBINED_SHADER_STORAGE_BUFFERS; i++) { + _mesa_reference_buffer_object(ctx, + &ctx->ShaderStorageBufferBindings[i].BufferObject, + ctx->Shared->NullBufferObj); + ctx->ShaderStorageBufferBindings[i].Offset = -1; + ctx->ShaderStorageBufferBindings[i].Size = -1; + } + for (i = 0; i < MAX_COMBINED_ATOMIC_BUFFERS; i++) { _mesa_reference_buffer_object(ctx, &ctx->AtomicBufferBindings[i].BufferObject, @@ -867,6 +884,8 @@ _mesa_free_buffer_objects( struct gl_context *ctx ) _mesa_reference_buffer_object(ctx, &ctx->UniformBuffer, NULL); + _mesa_reference_buffer_object(ctx, &ctx->ShaderStorageBuffer, NULL); + _mesa_reference_buffer_object(ctx, &ctx->AtomicBuffer, NULL); _mesa_reference_buffer_object(ctx, &ctx->DrawIndirectBuffer, NULL); @@ -877,6 +896,12 @@ _mesa_free_buffer_objects( struct gl_context *ctx ) NULL); } + for (i = 0; i < MAX_COMBINED_SHADER_STORAGE_BUFFERS; i++) { + _mesa_reference_buffer_object(ctx, + &ctx->ShaderStorageBufferBindings[i].BufferObject, + NULL); + } + for (i = 0; i < MAX_COMBINED_ATOMIC_BUFFERS; i++) { _mesa_reference_buffer_object(ctx, &ctx->AtomicBufferBindings[i].BufferObject, @@ -1158,7 +1183,7 @@ _mesa_BindBuffer(GLenum target, GLuint buffer) if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glBindBuffer(%s, %u)\n", - _mesa_lookup_enum_by_nr(target), buffer); + _mesa_enum_to_string(target), buffer); bind_buffer_object(ctx, target, buffer); } @@ -1240,6 +1265,17 @@ _mesa_DeleteBuffers(GLsizei n, const GLuint *ids) _mesa_BindBuffer( GL_UNIFORM_BUFFER, 0 ); } + /* unbind SSBO binding points */ + for (j = 0; j < ctx->Const.MaxShaderStorageBufferBindings; j++) { + if (ctx->ShaderStorageBufferBindings[j].BufferObject == bufObj) { + _mesa_BindBufferBase(GL_SHADER_STORAGE_BUFFER, j, 0); + } + } + + if (ctx->ShaderStorageBuffer == bufObj) { + _mesa_BindBuffer(GL_SHADER_STORAGE_BUFFER, 0); + } + /* unbind Atomci Buffer binding points */ for (j = 0; j < ctx->Const.MaxAtomicBufferBindings; j++) { if (ctx->AtomicBufferBindings[j].BufferObject == bufObj) { @@ -1500,9 +1536,9 @@ _mesa_buffer_data(struct gl_context *ctx, struct gl_buffer_object *bufObj, if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "%s(%s, %ld, %p, %s)\n", func, - _mesa_lookup_enum_by_nr(target), + _mesa_enum_to_string(target), (long int) size, data, - _mesa_lookup_enum_by_nr(usage)); + _mesa_enum_to_string(usage)); if (size < 0) { _mesa_error(ctx, GL_INVALID_VALUE, "%s(size < 0)", func); @@ -1535,7 +1571,7 @@ _mesa_buffer_data(struct gl_context *ctx, struct gl_buffer_object *bufObj, if (!valid_usage) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid usage: %s)", func, - _mesa_lookup_enum_by_nr(usage)); + _mesa_enum_to_string(usage)); return; } @@ -1990,7 +2026,7 @@ get_buffer_parameter(struct gl_context *ctx, invalid_pname: _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid pname: %s)", func, - _mesa_lookup_enum_by_nr(pname)); + 
_mesa_enum_to_string(pname)); return false; } @@ -2337,7 +2373,7 @@ _mesa_map_buffer_range(struct gl_context *ctx, if (offset + length > bufObj->Size) { _mesa_error(ctx, GL_INVALID_VALUE, - "%s(offset %ld + length %ld > buffer_size %ld)", func, + "%s(offset %td + length %td > buffer_size %td)", func, offset, length, bufObj->Size); return NULL; } @@ -2999,6 +3035,33 @@ set_ubo_binding(struct gl_context *ctx, } /** + * Binds a buffer object to a shader storage buffer binding point. + * + * The caller is responsible for flushing vertices and updating + * NewDriverState. + */ +static void +set_ssbo_binding(struct gl_context *ctx, + struct gl_shader_storage_buffer_binding *binding, + struct gl_buffer_object *bufObj, + GLintptr offset, + GLsizeiptr size, + GLboolean autoSize) +{ + _mesa_reference_buffer_object(ctx, &binding->BufferObject, bufObj); + + binding->Offset = offset; + binding->Size = size; + binding->AutomaticSize = autoSize; + + /* If this is a real buffer object, mark it has having been used + * at some point as a SSBO. + */ + if (size >= 0) + bufObj->UsageHistory |= USAGE_SHADER_STORAGE_BUFFER; +} + +/** * Binds a buffer object to a uniform buffer binding point. * * Unlike set_ubo_binding(), this function also flushes vertices @@ -3030,6 +3093,37 @@ bind_uniform_buffer(struct gl_context *ctx, } /** + * Binds a buffer object to a shader storage buffer binding point. + * + * Unlike set_ssbo_binding(), this function also flushes vertices + * and updates NewDriverState. It also checks if the binding + * has actually changed before updating it. + */ +static void +bind_shader_storage_buffer(struct gl_context *ctx, + GLuint index, + struct gl_buffer_object *bufObj, + GLintptr offset, + GLsizeiptr size, + GLboolean autoSize) +{ + struct gl_shader_storage_buffer_binding *binding = + &ctx->ShaderStorageBufferBindings[index]; + + if (binding->BufferObject == bufObj && + binding->Offset == offset && + binding->Size == size && + binding->AutomaticSize == autoSize) { + return; + } + + FLUSH_VERTICES(ctx, 0); + ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer; + + set_ssbo_binding(ctx, binding, bufObj, offset, size, autoSize); +} + +/** * Bind a region of a buffer object to a uniform block binding point. * \param index the uniform buffer binding point index * \param bufObj the buffer object @@ -3064,6 +3158,40 @@ bind_buffer_range_uniform_buffer(struct gl_context *ctx, bind_uniform_buffer(ctx, index, bufObj, offset, size, GL_FALSE); } +/** + * Bind a region of a buffer object to a shader storage block binding point. 
+ * \param index the shader storage buffer binding point index + * \param bufObj the buffer object + * \param offset offset to the start of buffer object region + * \param size size of the buffer object region + */ +static void +bind_buffer_range_shader_storage_buffer(struct gl_context *ctx, + GLuint index, + struct gl_buffer_object *bufObj, + GLintptr offset, + GLsizeiptr size) +{ + if (index >= ctx->Const.MaxShaderStorageBufferBindings) { + _mesa_error(ctx, GL_INVALID_VALUE, "glBindBufferRange(index=%d)", index); + return; + } + + if (offset & (ctx->Const.ShaderStorageBufferOffsetAlignment - 1)) { + _mesa_error(ctx, GL_INVALID_VALUE, + "glBindBufferRange(offset misaligned %d/%d)", (int) offset, + ctx->Const.ShaderStorageBufferOffsetAlignment); + return; + } + + if (bufObj == ctx->Shared->NullBufferObj) { + offset = -1; + size = -1; + } + + _mesa_reference_buffer_object(ctx, &ctx->ShaderStorageBuffer, bufObj); + bind_shader_storage_buffer(ctx, index, bufObj, offset, size, GL_FALSE); +} /** * Bind a buffer object to a uniform block binding point. @@ -3088,6 +3216,28 @@ bind_buffer_base_uniform_buffer(struct gl_context *ctx, } /** + * Bind a buffer object to a shader storage block binding point. + * As above, but offset = 0. + */ +static void +bind_buffer_base_shader_storage_buffer(struct gl_context *ctx, + GLuint index, + struct gl_buffer_object *bufObj) +{ + if (index >= ctx->Const.MaxShaderStorageBufferBindings) { + _mesa_error(ctx, GL_INVALID_VALUE, "glBindBufferBase(index=%d)", index); + return; + } + + _mesa_reference_buffer_object(ctx, &ctx->ShaderStorageBuffer, bufObj); + + if (bufObj == ctx->Shared->NullBufferObj) + bind_shader_storage_buffer(ctx, index, bufObj, -1, -1, GL_TRUE); + else + bind_shader_storage_buffer(ctx, index, bufObj, 0, 0, GL_TRUE); +} + +/** * Binds a buffer object to an atomic buffer binding point. * * The caller is responsible for validating the offset, @@ -3219,6 +3369,35 @@ error_check_bind_uniform_buffers(struct gl_context *ctx, return true; } +static bool +error_check_bind_shader_storage_buffers(struct gl_context *ctx, + GLuint first, GLsizei count, + const char *caller) +{ + if (!ctx->Extensions.ARB_shader_storage_buffer_object) { + _mesa_error(ctx, GL_INVALID_ENUM, + "%s(target=GL_SHADER_STORAGE_BUFFER)", caller); + return false; + } + + /* The ARB_multi_bind_spec says: + * + * "An INVALID_OPERATION error is generated if <first> + <count> is + * greater than the number of target-specific indexed binding points, + * as described in section 6.7.1." 
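
A client-side usage sketch, not part of this change, of the range binding validated here: offsets passed to glBindBufferRange(GL_SHADER_STORAGE_BUFFER, ...) must be multiples of GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT. The helper name and the choice of binding point 0 are illustrative.

#include <epoxy/gl.h>   /* any loader exposing the GL 4.3 prototypes will do */

/* Illustrative only: bind the upper half of an SSBO to binding point 0,
 * rounding the offset up to the queried alignment (assumed to be a power of
 * two and much smaller than buf_size). */
static void
bind_upper_half_as_ssbo(GLuint ssbo, GLsizeiptr buf_size)
{
   GLint align = 1;
   glGetIntegerv(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT, &align);

   GLintptr offset = (buf_size / 2 + align - 1) & ~((GLintptr) align - 1);
   glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ssbo,
                     offset, buf_size - offset);
}
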
+ */ + if (first + count > ctx->Const.MaxShaderStorageBufferBindings) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "%s(first=%u + count=%d > the value of " + "GL_MAX_SHADER_STORAGE_BUFFER_BINDINGS=%u)", + caller, first, count, + ctx->Const.MaxShaderStorageBufferBindings); + return false; + } + + return true; +} + /** * Unbind all uniform buffers in the range * <first> through <first>+<count>-1 @@ -3234,6 +3413,22 @@ unbind_uniform_buffers(struct gl_context *ctx, GLuint first, GLsizei count) bufObj, -1, -1, GL_TRUE); } +/** + * Unbind all shader storage buffers in the range + * <first> through <first>+<count>-1 + */ +static void +unbind_shader_storage_buffers(struct gl_context *ctx, GLuint first, + GLsizei count) +{ + struct gl_buffer_object *bufObj = ctx->Shared->NullBufferObj; + GLint i; + + for (i = 0; i < count; i++) + set_ssbo_binding(ctx, &ctx->ShaderStorageBufferBindings[first + i], + bufObj, -1, -1, GL_TRUE); +} + static void bind_uniform_buffers_base(struct gl_context *ctx, GLuint first, GLsizei count, const GLuint *buffers) @@ -3301,6 +3496,73 @@ bind_uniform_buffers_base(struct gl_context *ctx, GLuint first, GLsizei count, } static void +bind_shader_storage_buffers_base(struct gl_context *ctx, GLuint first, + GLsizei count, const GLuint *buffers) +{ + GLint i; + + if (!error_check_bind_shader_storage_buffers(ctx, first, count, + "glBindBuffersBase")) + return; + + /* Assume that at least one binding will be changed */ + FLUSH_VERTICES(ctx, 0); + ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer; + + if (!buffers) { + /* The ARB_multi_bind spec says: + * + * "If <buffers> is NULL, all bindings from <first> through + * <first>+<count>-1 are reset to their unbound (zero) state." + */ + unbind_shader_storage_buffers(ctx, first, count); + return; + } + + /* Note that the error semantics for multi-bind commands differ from + * those of other GL commands. + * + * The Issues section in the ARB_multi_bind spec says: + * + * "(11) Typically, OpenGL specifies that if an error is generated by a + * command, that command has no effect. This is somewhat + * unfortunate for multi-bind commands, because it would require a + * first pass to scan the entire list of bound objects for errors + * and then a second pass to actually perform the bindings. + * Should we have different error semantics? + * + * RESOLVED: Yes. In this specification, when the parameters for + * one of the <count> binding points are invalid, that binding point + * is not updated and an error will be generated. However, other + * binding points in the same command will be updated if their + * parameters are valid and no other error occurs." 
+ */ + + _mesa_begin_bufferobj_lookups(ctx); + + for (i = 0; i < count; i++) { + struct gl_shader_storage_buffer_binding *binding = + &ctx->ShaderStorageBufferBindings[first + i]; + struct gl_buffer_object *bufObj; + + if (binding->BufferObject && binding->BufferObject->Name == buffers[i]) + bufObj = binding->BufferObject; + else + bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i, + "glBindBuffersBase"); + + if (bufObj) { + if (bufObj == ctx->Shared->NullBufferObj) + set_ssbo_binding(ctx, binding, bufObj, -1, -1, GL_TRUE); + else + set_ssbo_binding(ctx, binding, bufObj, 0, 0, GL_TRUE); + } + } + + _mesa_end_bufferobj_lookups(ctx); +} + +static void bind_uniform_buffers_range(struct gl_context *ctx, GLuint first, GLsizei count, const GLuint *buffers, const GLintptr *offsets, const GLsizeiptr *sizes) @@ -3405,6 +3667,112 @@ bind_uniform_buffers_range(struct gl_context *ctx, GLuint first, GLsizei count, _mesa_end_bufferobj_lookups(ctx); } +static void +bind_shader_storage_buffers_range(struct gl_context *ctx, GLuint first, + GLsizei count, const GLuint *buffers, + const GLintptr *offsets, + const GLsizeiptr *sizes) +{ + GLint i; + + if (!error_check_bind_shader_storage_buffers(ctx, first, count, + "glBindBuffersRange")) + return; + + /* Assume that at least one binding will be changed */ + FLUSH_VERTICES(ctx, 0); + ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer; + + if (!buffers) { + /* The ARB_multi_bind spec says: + * + * "If <buffers> is NULL, all bindings from <first> through + * <first>+<count>-1 are reset to their unbound (zero) state. + * In this case, the offsets and sizes associated with the + * binding points are set to default values, ignoring + * <offsets> and <sizes>." + */ + unbind_shader_storage_buffers(ctx, first, count); + return; + } + + /* Note that the error semantics for multi-bind commands differ from + * those of other GL commands. + * + * The Issues section in the ARB_multi_bind spec says: + * + * "(11) Typically, OpenGL specifies that if an error is generated by a + * command, that command has no effect. This is somewhat + * unfortunate for multi-bind commands, because it would require a + * first pass to scan the entire list of bound objects for errors + * and then a second pass to actually perform the bindings. + * Should we have different error semantics? + * + * RESOLVED: Yes. In this specification, when the parameters for + * one of the <count> binding points are invalid, that binding point + * is not updated and an error will be generated. However, other + * binding points in the same command will be updated if their + * parameters are valid and no other error occurs." + */ + + _mesa_begin_bufferobj_lookups(ctx); + + for (i = 0; i < count; i++) { + struct gl_shader_storage_buffer_binding *binding = + &ctx->ShaderStorageBufferBindings[first + i]; + struct gl_buffer_object *bufObj; + + if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes)) + continue; + + /* The ARB_multi_bind spec says: + * + * "An INVALID_VALUE error is generated by BindBuffersRange if any + * pair of values in <offsets> and <sizes> does not respectively + * satisfy the constraints described for those parameters for the + * specified target, as described in section 6.7.1 (per binding)." + * + * Section 6.7.1 refers to table 6.5, which says: + * + * "┌───────────────────────────────────────────────────────────────┐ + * │ Shader storage buffer array bindings (see sec. 7.8) │ + * ├─────────────────────┬─────────────────────────────────────────┤ + * │ ... │ ... 
│ + * │ offset restriction │ multiple of value of SHADER_STORAGE_- │ + * │ │ BUFFER_OFFSET_ALIGNMENT │ + * │ ... │ ... │ + * │ size restriction │ none │ + * └─────────────────────┴─────────────────────────────────────────┘" + */ + if (offsets[i] & (ctx->Const.ShaderStorageBufferOffsetAlignment - 1)) { + _mesa_error(ctx, GL_INVALID_VALUE, + "glBindBuffersRange(offsets[%u]=%" PRId64 + " is misaligned; it must be a multiple of the value of " + "GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT=%u when " + "target=GL_SHADER_STORAGE_BUFFER)", + i, (int64_t) offsets[i], + ctx->Const.ShaderStorageBufferOffsetAlignment); + continue; + } + + if (binding->BufferObject && binding->BufferObject->Name == buffers[i]) + bufObj = binding->BufferObject; + else + bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i, + "glBindBuffersRange"); + + if (bufObj) { + if (bufObj == ctx->Shared->NullBufferObj) + set_ssbo_binding(ctx, binding, bufObj, -1, -1, GL_FALSE); + else + set_ssbo_binding(ctx, binding, bufObj, + offsets[i], sizes[i], GL_FALSE); + } + } + + _mesa_end_bufferobj_lookups(ctx); +} + static bool error_check_bind_xfb_buffers(struct gl_context *ctx, struct gl_transform_feedback_object *tfObj, @@ -3894,6 +4262,9 @@ _mesa_BindBufferRange(GLenum target, GLuint index, case GL_UNIFORM_BUFFER: bind_buffer_range_uniform_buffer(ctx, index, bufObj, offset, size); return; + case GL_SHADER_STORAGE_BUFFER: + bind_buffer_range_shader_storage_buffer(ctx, index, bufObj, offset, size); + return; case GL_ATOMIC_COUNTER_BUFFER: bind_atomic_buffer(ctx, index, bufObj, offset, size, "glBindBufferRange"); @@ -3960,6 +4331,9 @@ _mesa_BindBufferBase(GLenum target, GLuint index, GLuint buffer) case GL_UNIFORM_BUFFER: bind_buffer_base_uniform_buffer(ctx, index, bufObj); return; + case GL_SHADER_STORAGE_BUFFER: + bind_buffer_base_shader_storage_buffer(ctx, index, bufObj); + return; case GL_ATOMIC_COUNTER_BUFFER: bind_atomic_buffer(ctx, index, bufObj, 0, 0, "glBindBufferBase"); @@ -3984,13 +4358,17 @@ _mesa_BindBuffersRange(GLenum target, GLuint first, GLsizei count, case GL_UNIFORM_BUFFER: bind_uniform_buffers_range(ctx, first, count, buffers, offsets, sizes); return; + case GL_SHADER_STORAGE_BUFFER: + bind_shader_storage_buffers_range(ctx, first, count, buffers, offsets, + sizes); + return; case GL_ATOMIC_COUNTER_BUFFER: bind_atomic_buffers_range(ctx, first, count, buffers, offsets, sizes); return; default: _mesa_error(ctx, GL_INVALID_ENUM, "glBindBuffersRange(target=%s)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); break; } } @@ -4008,12 +4386,15 @@ _mesa_BindBuffersBase(GLenum target, GLuint first, GLsizei count, case GL_UNIFORM_BUFFER: bind_uniform_buffers_base(ctx, first, count, buffers); return; + case GL_SHADER_STORAGE_BUFFER: + bind_shader_storage_buffers_base(ctx, first, count, buffers); + return; case GL_ATOMIC_COUNTER_BUFFER: bind_atomic_buffers_base(ctx, first, count, buffers); return; default: _mesa_error(ctx, GL_INVALID_ENUM, "glBindBuffersBase(target=%s)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); break; } } diff --git a/src/mesa/main/buffers.c b/src/mesa/main/buffers.c index 0536266d756..93588a2ee18 100644 --- a/src/mesa/main/buffers.c +++ b/src/mesa/main/buffers.c @@ -251,7 +251,7 @@ _mesa_draw_buffer(struct gl_context *ctx, struct gl_framebuffer *fb, FLUSH_VERTICES(ctx, 0); if (MESA_VERBOSE & VERBOSE_API) { - _mesa_debug(ctx, "%s %s\n", caller, _mesa_lookup_enum_by_nr(buffer)); + _mesa_debug(ctx, "%s %s\n", caller, _mesa_enum_to_string(buffer)); } if (buffer == 
GL_NONE) { @@ -264,14 +264,14 @@ _mesa_draw_buffer(struct gl_context *ctx, struct gl_framebuffer *fb, if (destMask == BAD_MASK) { /* totally bogus buffer */ _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)", caller, - _mesa_lookup_enum_by_nr(buffer)); + _mesa_enum_to_string(buffer)); return; } destMask &= supportedMask; if (destMask == 0x0) { /* none of the named color buffers exist! */ _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid buffer %s)", - caller, _mesa_lookup_enum_by_nr(buffer)); + caller, _mesa_enum_to_string(buffer)); return; } } @@ -411,7 +411,7 @@ _mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb, */ if (destMask[output] == BAD_MASK) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)", - caller, _mesa_lookup_enum_by_nr(buffers[output])); + caller, _mesa_enum_to_string(buffers[output])); return; } @@ -427,7 +427,7 @@ _mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb, */ if (_mesa_bitcount(destMask[output]) > 1) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)", - caller, _mesa_lookup_enum_by_nr(buffers[output])); + caller, _mesa_enum_to_string(buffers[output])); return; } @@ -445,7 +445,7 @@ _mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb, if (destMask[output] == 0) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(unsupported buffer %s)", - caller, _mesa_lookup_enum_by_nr(buffers[output])); + caller, _mesa_enum_to_string(buffers[output])); return; } @@ -459,7 +459,7 @@ _mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb, buffers[output] != GL_COLOR_ATTACHMENT0 + output) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(unsupported buffer %s)", - caller, _mesa_lookup_enum_by_nr(buffers[output])); + caller, _mesa_enum_to_string(buffers[output])); return; } @@ -471,7 +471,7 @@ _mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb, if (destMask[output] & usedBufferMask) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(duplicated buffer %s)", - caller, _mesa_lookup_enum_by_nr(buffers[output])); + caller, _mesa_enum_to_string(buffers[output])); return; } @@ -700,7 +700,7 @@ _mesa_read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb, FLUSH_VERTICES(ctx, 0); if (MESA_VERBOSE & VERBOSE_API) - _mesa_debug(ctx, "%s %s\n", caller, _mesa_lookup_enum_by_nr(buffer)); + _mesa_debug(ctx, "%s %s\n", caller, _mesa_enum_to_string(buffer)); if (buffer == GL_NONE) { /* This is legal--it means that no buffer should be bound for reading. */ @@ -712,14 +712,14 @@ _mesa_read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb, if (srcBuffer == -1) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)", caller, - _mesa_lookup_enum_by_nr(buffer)); + _mesa_enum_to_string(buffer)); return; } supportedMask = supported_buffer_bitmask(ctx, fb); if (((1 << srcBuffer) & supportedMask) == 0) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid buffer %s)", caller, - _mesa_lookup_enum_by_nr(buffer)); + _mesa_enum_to_string(buffer)); return; } } diff --git a/src/mesa/main/clear.c b/src/mesa/main/clear.c index 426caea4709..3bfcc5c0e39 100644 --- a/src/mesa/main/clear.c +++ b/src/mesa/main/clear.c @@ -325,6 +325,18 @@ _mesa_ClearBufferiv(GLenum buffer, GLint drawbuffer, const GLint *value) _mesa_update_state( ctx ); } + /* Page 498 of the PDF, section '17.4.3.1 Clearing Individual Buffers' + * of the OpenGL 4.5 spec states: + * + * "An INVALID_ENUM error is generated by ClearBufferiv and + * ClearNamedFramebufferiv if buffer is not COLOR or STENCIL." 
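
A usage sketch, not part of this change, matching the spec language quoted above: integer clears of depth or packed depth/stencil are rejected, so applications use the fv/fi entry points for those aspects. The function name is illustrative; the bound framebuffer is assumed to have color, depth and stencil attachments.

#include <epoxy/gl.h>   /* any loader exposing the GL 3.0 prototypes will do */

/* Illustrative only: glClearBufferiv handles COLOR (integer formats) and
 * STENCIL; depth and depth+stencil go through glClearBufferfv/fi. */
static void
clear_attachments(void)
{
   const GLfloat black[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
   const GLint zero = 0;

   glClearBufferfv(GL_COLOR, 0, black);            /* draw buffer 0 */
   glClearBufferiv(GL_STENCIL, 0, &zero);          /* stencil only */
   glClearBufferfi(GL_DEPTH_STENCIL, 0, 1.0f, 0);  /* depth + stencil */
}
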
+ */ + if (buffer == GL_DEPTH || buffer == GL_DEPTH_STENCIL) { + _mesa_error(ctx, GL_INVALID_ENUM, + "glClearBufferiv(buffer=GL_DEPTH || GL_DEPTH_STENCIL)"); + return; + } + switch (buffer) { case GL_STENCIL: /* Page 264 (page 280 of the PDF) of the OpenGL 3.0 spec says: @@ -395,7 +407,7 @@ _mesa_ClearBufferiv(GLenum buffer, GLint drawbuffer, const GLint *value) return; default: _mesa_error(ctx, GL_INVALID_ENUM, "glClearBufferiv(buffer=%s)", - _mesa_lookup_enum_by_nr(buffer)); + _mesa_enum_to_string(buffer)); return; } } @@ -485,7 +497,7 @@ _mesa_ClearBufferuiv(GLenum buffer, GLint drawbuffer, const GLuint *value) return; default: _mesa_error(ctx, GL_INVALID_ENUM, "glClearBufferuiv(buffer=%s)", - _mesa_lookup_enum_by_nr(buffer)); + _mesa_enum_to_string(buffer)); return; } } @@ -596,7 +608,7 @@ _mesa_ClearBufferfv(GLenum buffer, GLint drawbuffer, const GLfloat *value) return; default: _mesa_error(ctx, GL_INVALID_ENUM, "glClearBufferfv(buffer=%s)", - _mesa_lookup_enum_by_nr(buffer)); + _mesa_enum_to_string(buffer)); return; } } @@ -636,7 +648,7 @@ _mesa_ClearBufferfi(GLenum buffer, GLint drawbuffer, if (buffer != GL_DEPTH_STENCIL) { _mesa_error(ctx, GL_INVALID_ENUM, "glClearBufferfi(buffer=%s)", - _mesa_lookup_enum_by_nr(buffer)); + _mesa_enum_to_string(buffer)); return; } diff --git a/src/mesa/main/condrender.c b/src/mesa/main/condrender.c index 77e4b95ee8f..46c6036d2a5 100644 --- a/src/mesa/main/condrender.c +++ b/src/mesa/main/condrender.c @@ -87,7 +87,7 @@ _mesa_BeginConditionalRender(GLuint queryId, GLenum mode) /* fallthrough - invalid */ default: _mesa_error(ctx, GL_INVALID_ENUM, "glBeginConditionalRender(mode=%s)", - _mesa_lookup_enum_by_nr(mode)); + _mesa_enum_to_string(mode)); return; } @@ -184,7 +184,7 @@ _mesa_check_conditional_render(struct gl_context *ctx) default: _mesa_problem(ctx, "Bad cond render mode %s in " " _mesa_check_conditional_render()", - _mesa_lookup_enum_by_nr(ctx->Query.CondRenderMode)); + _mesa_enum_to_string(ctx->Query.CondRenderMode)); return GL_TRUE; } } diff --git a/src/mesa/main/config.h b/src/mesa/main/config.h index 9c3baf4c6aa..b35031db3c9 100644 --- a/src/mesa/main/config.h +++ b/src/mesa/main/config.h @@ -171,8 +171,10 @@ #define MAX_PROGRAM_LOCAL_PARAMS 4096 #define MAX_UNIFORMS 4096 #define MAX_UNIFORM_BUFFERS 15 /* + 1 default uniform buffer */ +#define MAX_SHADER_STORAGE_BUFFERS 7 /* + 1 default shader storage buffer */ /* 6 is for vertex, hull, domain, geometry, fragment, and compute shader. */ #define MAX_COMBINED_UNIFORM_BUFFERS (MAX_UNIFORM_BUFFERS * 6) +#define MAX_COMBINED_SHADER_STORAGE_BUFFERS (MAX_SHADER_STORAGE_BUFFERS * 6) #define MAX_ATOMIC_COUNTERS 4096 /* 6 is for vertex, hull, domain, geometry, fragment, and compute shader. 
*/ #define MAX_COMBINED_ATOMIC_BUFFERS (MAX_UNIFORM_BUFFERS * 6) @@ -272,6 +274,12 @@ #define MAX_VERTEX_STREAMS 4 /*@}*/ +/** For GL_ARB_shader_subroutine */ +/*@{*/ +#define MAX_SUBROUTINES 256 +#define MAX_SUBROUTINE_UNIFORM_LOCATIONS 1024 +/*@}*/ + /** For GL_INTEL_performance_query */ /*@{*/ #define MAX_PERFQUERY_QUERY_NAME_LENGTH 256 @@ -294,6 +302,14 @@ /** For GL_ARB_pipeline_statistics_query */ #define MAX_PIPELINE_STATISTICS 11 +/** For GL_ARB_tessellation_shader */ +/*@{*/ +#define MAX_TESS_GEN_LEVEL 64 +#define MAX_PATCH_VERTICES 32 +#define MAX_TESS_PATCH_COMPONENTS 120 +#define MAX_TESS_CONTROL_TOTAL_OUTPUT_COMPONENTS 4096 +/*@}*/ + /* * Color channel component order * diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c index 79fa01849e0..888c461d1c2 100644 --- a/src/mesa/main/context.c +++ b/src/mesa/main/context.c @@ -120,6 +120,7 @@ #include "shaderobj.h" #include "shaderimage.h" #include "util/simple_list.h" +#include "util/strtod.h" #include "state.h" #include "stencil.h" #include "texcompress_s3tc.h" @@ -338,31 +339,6 @@ _mesa_destroy_visual( struct gl_config *vis ) /** - * This is lame. gdb only seems to recognize enum types that are - * actually used somewhere. We want to be able to print/use enum - * values such as TEXTURE_2D_INDEX in gdb. But we don't actually use - * the gl_texture_index type anywhere. Thus, this lame function. - */ -static void -dummy_enum_func(void) -{ - gl_buffer_index bi = BUFFER_FRONT_LEFT; - gl_face_index fi = FACE_POS_X; - gl_frag_result fr = FRAG_RESULT_DEPTH; - gl_texture_index ti = TEXTURE_2D_ARRAY_INDEX; - gl_vert_attrib va = VERT_ATTRIB_POS; - gl_varying_slot vs = VARYING_SLOT_POS; - - (void) bi; - (void) fi; - (void) fr; - (void) ti; - (void) va; - (void) vs; -} - - -/** * One-time initialization mutex lock. * * \sa Used by one_time_init(). @@ -370,6 +346,16 @@ dummy_enum_func(void) mtx_t OneTimeLock = _MTX_INITIALIZER_NP; +/** + * Calls all the various one-time-fini functions in Mesa + */ + +static void +one_time_fini(void) +{ + _mesa_destroy_shader_compiler(); + _mesa_locale_fini(); +} /** * Calls all the various one-time-init functions in Mesa. @@ -391,13 +377,14 @@ one_time_init( struct gl_context *ctx ) if (!api_init_mask) { GLuint i; - /* do some implementation tests */ - assert( sizeof(GLbyte) == 1 ); - assert( sizeof(GLubyte) == 1 ); - assert( sizeof(GLshort) == 2 ); - assert( sizeof(GLushort) == 2 ); - assert( sizeof(GLint) == 4 ); - assert( sizeof(GLuint) == 4 ); + STATIC_ASSERT(sizeof(GLbyte) == 1); + STATIC_ASSERT(sizeof(GLubyte) == 1); + STATIC_ASSERT(sizeof(GLshort) == 2); + STATIC_ASSERT(sizeof(GLushort) == 2); + STATIC_ASSERT(sizeof(GLint) == 4); + STATIC_ASSERT(sizeof(GLuint) == 4); + + _mesa_locale_init(); _mesa_one_time_init_extension_overrides(); @@ -407,6 +394,8 @@ one_time_init( struct gl_context *ctx ) _mesa_ubyte_to_float_color_tab[i] = (float) i / 255.0F; } + atexit(one_time_fini); + #if defined(DEBUG) && defined(__DATE__) && defined(__TIME__) if (MESA_VERBOSE != 0) { _mesa_debug(ctx, "Mesa %s DEBUG build %s %s\n", @@ -429,13 +418,6 @@ one_time_init( struct gl_context *ctx ) api_init_mask |= 1 << ctx->API; mtx_unlock(&OneTimeLock); - - /* Hopefully atexit() is widely available. If not, we may need some - * #ifdef tests here. 
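
A generic sketch, not part of this change, of the once-guarded init/teardown pattern being introduced here, written with plain C11 primitives rather than Mesa's wrappers; all names below are placeholders.

#include <stdlib.h>
#include <threads.h>

static void
module_fini(void)
{
   /* free process-wide caches here */
}

static void
module_init_once(void)
{
   /* build process-wide tables here, then register the teardown so it
    * runs exactly once, at process exit */
   atexit(module_fini);
}

static once_flag module_once = ONCE_FLAG_INIT;

static void
module_init(void)
{
   call_once(&module_once, module_init_once);
}
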
- */ - atexit(_mesa_destroy_shader_compiler); - - dummy_enum_func(); } @@ -496,6 +478,8 @@ init_program_limits(struct gl_constants *consts, gl_shader_stage stage, prog->MaxInputComponents = 16 * 4; /* old limit not to break tnl and swrast */ prog->MaxOutputComponents = 0; /* value not used */ break; + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_TESS_EVAL: case MESA_SHADER_GEOMETRY: prog->MaxParameters = MAX_VERTEX_PROGRAM_PARAMS; prog->MaxAttribs = MAX_VERTEX_GENERIC_ATTRIBS; @@ -554,6 +538,8 @@ init_program_limits(struct gl_constants *consts, gl_shader_stage stage, prog->MaxAtomicBuffers = 0; prog->MaxAtomicCounters = 0; + + prog->MaxShaderStorageBlocks = 8; } @@ -615,6 +601,12 @@ _mesa_init_constants(struct gl_constants *consts, gl_api api) consts->MaxUniformBlockSize = 16384; consts->UniformBufferOffsetAlignment = 1; + /** GL_ARB_shader_storage_buffer_object */ + consts->MaxCombinedShaderStorageBlocks = 8; + consts->MaxShaderStorageBufferBindings = 8; + consts->MaxShaderStorageBlockSize = 128 * 1024 * 1024; /* 2^27 */ + consts->ShaderStorageBufferOffsetAlignment = 256; + /* GL_ARB_explicit_uniform_location, GL_MAX_UNIFORM_LOCATIONS */ consts->MaxUserAssignableUniformLocations = 4 * MESA_SHADER_STAGES * MAX_UNIFORMS; @@ -724,6 +716,14 @@ _mesa_init_constants(struct gl_constants *consts, gl_api api) /** GL_KHR_context_flush_control */ consts->ContextReleaseBehavior = GL_CONTEXT_RELEASE_BEHAVIOR_FLUSH; + + /** GL_ARB_tessellation_shader */ + consts->MaxTessGenLevel = MAX_TESS_GEN_LEVEL; + consts->MaxPatchVertices = MAX_PATCH_VERTICES; + consts->Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits = MAX_TEXTURE_IMAGE_UNITS; + consts->Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits = MAX_TEXTURE_IMAGE_UNITS; + consts->MaxTessPatchComponents = MAX_TESS_PATCH_COMPONENTS; + consts->MaxTessControlTotalOutputComponents = MAX_TESS_CONTROL_TOTAL_OUTPUT_COMPONENTS; } @@ -1331,6 +1331,8 @@ _mesa_free_context_data( struct gl_context *ctx ) _mesa_reference_vertprog(ctx, &ctx->VertexProgram._Current, NULL); _mesa_reference_vertprog(ctx, &ctx->VertexProgram._TnlProgram, NULL); + _mesa_reference_tesscprog(ctx, &ctx->TessCtrlProgram._Current, NULL); + _mesa_reference_tesseprog(ctx, &ctx->TessEvalProgram._Current, NULL); _mesa_reference_geomprog(ctx, &ctx->GeometryProgram._Current, NULL); _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current, NULL); diff --git a/src/mesa/main/context.h b/src/mesa/main/context.h index 6f3c941016f..0f7529ad975 100644 --- a/src/mesa/main/context.h +++ b/src/mesa/main/context.h @@ -343,6 +343,26 @@ _mesa_has_compute_shaders(const struct gl_context *ctx) (ctx->API == API_OPENGLES2 && ctx->Version >= 31); } +/** + * Checks if the context supports shader subroutines. + */ +static inline bool +_mesa_has_shader_subroutine(const struct gl_context *ctx) +{ + return ctx->API == API_OPENGL_CORE && + (ctx->Version >= 40 || ctx->Extensions.ARB_shader_subroutine); +} + +/** + * Checks if the context supports tessellation. 
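
A client-side usage sketch, not part of this change, of the tessellation rules wired up in this series (GL_PATCHES-only drawing, the GL_MAX_PATCH_VERTICES limit). The program/VAO parameters and the patch size of 3 are illustrative.

#include <epoxy/gl.h>   /* any loader exposing the GL 4.0 prototypes will do */

/* Illustrative only: with TCS/TES stages in the current program, GL_PATCHES
 * is the only valid primitive type; anything else is GL_INVALID_OPERATION. */
static void
draw_patches(GLuint tess_program, GLuint vao, GLsizei vertex_count)
{
   GLint max_patch_vertices = 0;
   glGetIntegerv(GL_MAX_PATCH_VERTICES, &max_patch_vertices);

   glUseProgram(tess_program);
   glBindVertexArray(vao);
   glPatchParameteri(GL_PATCH_VERTICES, 3);   /* must not exceed the max above */
   glDrawArrays(GL_PATCHES, 0, vertex_count);
}
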
+ */ +static inline GLboolean +_mesa_has_tessellation(const struct gl_context *ctx) +{ + return ctx->API == API_OPENGL_CORE && + ctx->Extensions.ARB_tessellation_shader; +} + #ifdef __cplusplus } diff --git a/src/mesa/main/copyimage.c b/src/mesa/main/copyimage.c index e8732c6175b..05bc50dd2c6 100644 --- a/src/mesa/main/copyimage.c +++ b/src/mesa/main/copyimage.c @@ -93,7 +93,7 @@ prepare_target(struct gl_context *ctx, GLuint name, GLenum *target, int level, default: _mesa_error(ctx, GL_INVALID_ENUM, "glCopyImageSubData(%sTarget = %s)", dbg_prefix, - _mesa_lookup_enum_by_nr(*target)); + _mesa_enum_to_string(*target)); return false; } @@ -159,7 +159,7 @@ prepare_target(struct gl_context *ctx, GLuint name, GLenum *target, int level, if ((*tex_obj)->Target != *target) { _mesa_error(ctx, GL_INVALID_ENUM, "glCopyImageSubData(%sTarget = %s)", dbg_prefix, - _mesa_lookup_enum_by_nr(*target)); + _mesa_enum_to_string(*target)); return false; } @@ -416,9 +416,9 @@ _mesa_CopyImageSubData(GLuint srcName, GLenum srcTarget, GLint srcLevel, _mesa_debug(ctx, "glCopyImageSubData(%u, %s, %d, %d, %d, %d, " "%u, %s, %d, %d, %d, %d, " "%d, %d, %d)\n", - srcName, _mesa_lookup_enum_by_nr(srcTarget), srcLevel, + srcName, _mesa_enum_to_string(srcTarget), srcLevel, srcX, srcY, srcZ, - dstName, _mesa_lookup_enum_by_nr(dstTarget), dstLevel, + dstName, _mesa_enum_to_string(dstTarget), dstLevel, dstX, dstY, dstZ, srcWidth, srcHeight, srcWidth); diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h index d783e34222f..87eb63ea374 100644 --- a/src/mesa/main/dd.h +++ b/src/mesa/main/dd.h @@ -232,11 +232,13 @@ struct dd_function_table { /** - * Called by glGetTexImage(). + * Called by glGetTexImage(), glGetTextureSubImage(). */ - void (*GetTexImage)( struct gl_context *ctx, - GLenum format, GLenum type, GLvoid *pixels, - struct gl_texture_image *texImage ); + void (*GetTexSubImage)(struct gl_context *ctx, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLsizei depth, + GLenum format, GLenum type, GLvoid *pixels, + struct gl_texture_image *texImage); /** * Called by glClearTex[Sub]Image @@ -326,16 +328,19 @@ struct dd_function_table { void (*CompressedTexSubImage)(struct gl_context *ctx, GLuint dims, struct gl_texture_image *texImage, GLint xoffset, GLint yoffset, GLint zoffset, - GLsizei width, GLint height, GLint depth, + GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const GLvoid *data); /** * Called by glGetCompressedTexImage. 
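
A client-side usage sketch, not part of this change, of the sub-image readback that the reworked GetTexSubImage/GetCompressedTexSubImage hooks serve (GL 4.5 / ARB_get_texture_sub_image). The texture is assumed to be a non-compressed RGBA8 2D texture; the helper name is illustrative.

#include <stdlib.h>
#include <epoxy/gl.h>   /* any loader exposing the GL 4.5 prototypes will do */

/* Illustrative only: read back just a w x h rectangle of level 0 rather than
 * the whole image. */
static void *
read_back_subrect(GLuint tex, GLint x, GLint y, GLsizei w, GLsizei h)
{
   GLsizei buf_size = w * h * 4;             /* tightly packed RGBA8 */
   void *pixels = malloc(buf_size);

   glGetTextureSubImage(tex, 0,              /* texture, level            */
                        x, y, 0,             /* xoffset, yoffset, zoffset */
                        w, h, 1,             /* width, height, depth      */
                        GL_RGBA, GL_UNSIGNED_BYTE,
                        buf_size, pixels);
   return pixels;
}
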
*/ - void (*GetCompressedTexImage)(struct gl_context *ctx, - struct gl_texture_image *texImage, - GLvoid *data); + void (*GetCompressedTexSubImage)(struct gl_context *ctx, + struct gl_texture_image *texImage, + GLint xoffset, GLint yoffset, + GLint zoffset, GLsizei width, + GLsizei height, GLsizei depth, + GLvoid *data); /*@}*/ /** diff --git a/src/mesa/main/debug.c b/src/mesa/main/debug.c index c93e84a04d0..5ca7d5ce500 100644 --- a/src/mesa/main/debug.c +++ b/src/mesa/main/debug.c @@ -272,7 +272,9 @@ write_texture_image(struct gl_texture_object *texObj, store = ctx->Pack; /* save */ ctx->Pack = ctx->DefaultPacking; - ctx->Driver.GetTexImage(ctx, GL_RGBA, GL_UNSIGNED_BYTE, buffer, img); + ctx->Driver.GetTexSubImage(ctx, + 0, 0, 0, img->Width, img->Height, img->Depth, + GL_RGBA, GL_UNSIGNED_BYTE, buffer, img); /* make filename */ _mesa_snprintf(s, sizeof(s), "/tmp/tex%u.l%u.f%u.ppm", texObj->Name, level, face); @@ -411,7 +413,7 @@ dump_renderbuffer(const struct gl_renderbuffer *rb, GLboolean writeImage) { printf("Renderbuffer %u: %u x %u IntFormat = %s\n", rb->Name, rb->Width, rb->Height, - _mesa_lookup_enum_by_nr(rb->InternalFormat)); + _mesa_enum_to_string(rb->InternalFormat)); if (writeImage) { _mesa_write_renderbuffer_image(rb); } diff --git a/src/mesa/main/depth.c b/src/mesa/main/depth.c index bb4591cf152..c3534407599 100644 --- a/src/mesa/main/depth.c +++ b/src/mesa/main/depth.c @@ -63,7 +63,7 @@ _mesa_DepthFunc( GLenum func ) GET_CURRENT_CONTEXT(ctx); if (MESA_VERBOSE & VERBOSE_API) - _mesa_debug(ctx, "glDepthFunc %s\n", _mesa_lookup_enum_by_nr(func)); + _mesa_debug(ctx, "glDepthFunc %s\n", _mesa_enum_to_string(func)); if (ctx->Depth.Func == func) return; diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c index aafe486fb60..5554738d1a3 100644 --- a/src/mesa/main/dlist.c +++ b/src/mesa/main/dlist.c @@ -9000,7 +9000,7 @@ _mesa_NewList(GLuint name, GLenum mode) if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glNewList %u %s\n", name, - _mesa_lookup_enum_by_nr(mode)); + _mesa_enum_to_string(mode)); if (name == 0) { _mesa_error(ctx, GL_INVALID_VALUE, "glNewList"); @@ -9688,7 +9688,7 @@ _mesa_initialize_save_table(const struct gl_context *ctx) static const char * enum_string(GLenum k) { - return _mesa_lookup_enum_by_nr(k); + return _mesa_enum_to_string(k); } @@ -9827,19 +9827,19 @@ print_list(struct gl_context *ctx, GLuint list, const char *fname) break; case OPCODE_BIND_TEXTURE: fprintf(f, "BindTexture %s %d\n", - _mesa_lookup_enum_by_nr(n[1].ui), n[2].ui); + _mesa_enum_to_string(n[1].ui), n[2].ui); break; case OPCODE_SHADE_MODEL: - fprintf(f, "ShadeModel %s\n", _mesa_lookup_enum_by_nr(n[1].ui)); + fprintf(f, "ShadeModel %s\n", _mesa_enum_to_string(n[1].ui)); break; case OPCODE_MAP1: fprintf(f, "Map1 %s %.3f %.3f %d %d\n", - _mesa_lookup_enum_by_nr(n[1].ui), + _mesa_enum_to_string(n[1].ui), n[2].f, n[3].f, n[4].i, n[5].i); break; case OPCODE_MAP2: fprintf(f, "Map2 %s %.3f %.3f %.3f %.3f %d %d %d %d\n", - _mesa_lookup_enum_by_nr(n[1].ui), + _mesa_enum_to_string(n[1].ui), n[2].f, n[3].f, n[4].f, n[5].f, n[6].i, n[7].i, n[8].i, n[9].i); break; @@ -9918,7 +9918,7 @@ print_list(struct gl_context *ctx, GLuint list, const char *fname) case OPCODE_PROVOKING_VERTEX: fprintf(f, "ProvokingVertex %s\n", - _mesa_lookup_enum_by_nr(n[1].ui)); + _mesa_enum_to_string(n[1].ui)); break; /* diff --git a/src/mesa/main/drawpix.c b/src/mesa/main/drawpix.c index 55035f214b3..720a082ce6d 100644 --- a/src/mesa/main/drawpix.c +++ b/src/mesa/main/drawpix.c @@ -53,10 +53,10 @@ _mesa_DrawPixels( GLsizei 
width, GLsizei height, if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glDrawPixels(%d, %d, %s, %s, %p) // to %s at %d, %d\n", width, height, - _mesa_lookup_enum_by_nr(format), - _mesa_lookup_enum_by_nr(type), + _mesa_enum_to_string(format), + _mesa_enum_to_string(type), pixels, - _mesa_lookup_enum_by_nr(ctx->DrawBuffer->ColorDrawBuffer[0]), + _mesa_enum_to_string(ctx->DrawBuffer->ColorDrawBuffer[0]), IROUND(ctx->Current.RasterPos[0]), IROUND(ctx->Current.RasterPos[1])); @@ -96,8 +96,8 @@ _mesa_DrawPixels( GLsizei width, GLsizei height, err = _mesa_error_check_format_and_type(ctx, format, type); if (err != GL_NO_ERROR) { _mesa_error(ctx, err, "glDrawPixels(invalid format %s and/or type %s)", - _mesa_lookup_enum_by_nr(format), - _mesa_lookup_enum_by_nr(type)); + _mesa_enum_to_string(format), + _mesa_enum_to_string(type)); goto end; } @@ -198,9 +198,9 @@ _mesa_CopyPixels( GLint srcx, GLint srcy, GLsizei width, GLsizei height, _mesa_debug(ctx, "glCopyPixels(%d, %d, %d, %d, %s) // from %s to %s at %d, %d\n", srcx, srcy, width, height, - _mesa_lookup_enum_by_nr(type), - _mesa_lookup_enum_by_nr(ctx->ReadBuffer->ColorReadBuffer), - _mesa_lookup_enum_by_nr(ctx->DrawBuffer->ColorDrawBuffer[0]), + _mesa_enum_to_string(type), + _mesa_enum_to_string(ctx->ReadBuffer->ColorReadBuffer), + _mesa_enum_to_string(ctx->DrawBuffer->ColorDrawBuffer[0]), IROUND(ctx->Current.RasterPos[0]), IROUND(ctx->Current.RasterPos[1])); @@ -218,7 +218,7 @@ _mesa_CopyPixels( GLint srcx, GLint srcy, GLsizei width, GLsizei height, type != GL_STENCIL && type != GL_DEPTH_STENCIL) { _mesa_error(ctx, GL_INVALID_ENUM, "glCopyPixels(type=%s)", - _mesa_lookup_enum_by_nr(type)); + _mesa_enum_to_string(type)); return; } diff --git a/src/mesa/main/enable.c b/src/mesa/main/enable.c index 9008a386343..42f67990784 100644 --- a/src/mesa/main/enable.c +++ b/src/mesa/main/enable.c @@ -146,7 +146,7 @@ client_state(struct gl_context *ctx, GLenum cap, GLboolean state) invalid_enum_error: _mesa_error(ctx, GL_INVALID_ENUM, "gl%sClientState(%s)", - state ? "Enable" : "Disable", _mesa_lookup_enum_by_nr(cap)); + state ? "Enable" : "Disable", _mesa_enum_to_string(cap)); } @@ -283,7 +283,7 @@ _mesa_set_enable(struct gl_context *ctx, GLenum cap, GLboolean state) if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "%s %s (newstate is %x)\n", state ? "glEnable" : "glDisable", - _mesa_lookup_enum_by_nr(cap), + _mesa_enum_to_string(cap), ctx->NewState); switch (cap) { @@ -1001,7 +1001,7 @@ _mesa_set_enable(struct gl_context *ctx, GLenum cap, GLboolean state) /* ARB_texture_multisample */ case GL_SAMPLE_MASK: - if (!_mesa_is_desktop_gl(ctx)) + if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles31(ctx)) goto invalid_enum_error; CHECK_EXTENSION(ARB_texture_multisample, cap); if (ctx->Multisample.SampleMask == state) @@ -1022,7 +1022,7 @@ _mesa_set_enable(struct gl_context *ctx, GLenum cap, GLboolean state) invalid_enum_error: _mesa_error(ctx, GL_INVALID_ENUM, "gl%s(%s)", - state ? "Enable" : "Disable", _mesa_lookup_enum_by_nr(cap)); + state ? "Enable" : "Disable", _mesa_enum_to_string(cap)); } @@ -1101,7 +1101,7 @@ _mesa_set_enablei(struct gl_context *ctx, GLenum cap, invalid_enum_error: _mesa_error(ctx, GL_INVALID_ENUM, "%s(cap=%s)", state ? 
"glEnablei" : "glDisablei", - _mesa_lookup_enum_by_nr(cap)); + _mesa_enum_to_string(cap)); } @@ -1143,7 +1143,7 @@ _mesa_IsEnabledi( GLenum cap, GLuint index ) return (ctx->Scissor.EnableFlags >> index) & 1; default: _mesa_error(ctx, GL_INVALID_ENUM, "glIsEnabledIndexed(cap=%s)", - _mesa_lookup_enum_by_nr(cap)); + _mesa_enum_to_string(cap)); return GL_FALSE; } } @@ -1603,7 +1603,7 @@ _mesa_IsEnabled( GLenum cap ) /* ARB_texture_multisample */ case GL_SAMPLE_MASK: - if (!_mesa_is_desktop_gl(ctx)) + if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles31(ctx)) goto invalid_enum_error; CHECK_EXTENSION(ARB_texture_multisample); return ctx->Multisample.SampleMask; @@ -1623,6 +1623,6 @@ _mesa_IsEnabled( GLenum cap ) invalid_enum_error: _mesa_error(ctx, GL_INVALID_ENUM, "glIsEnabled(%s)", - _mesa_lookup_enum_by_nr(cap)); + _mesa_enum_to_string(cap)); return GL_FALSE; } diff --git a/src/mesa/main/enums.h b/src/mesa/main/enums.h index 66bdd53bbab..0e18cd407e9 100644 --- a/src/mesa/main/enums.h +++ b/src/mesa/main/enums.h @@ -42,7 +42,7 @@ extern "C" { #endif -extern const char *_mesa_lookup_enum_by_nr( int nr ); +extern const char *_mesa_enum_to_string( int nr ); /* Get the name of an enum given that it is a primitive type. Avoids * GL_FALSE/GL_POINTS ambiguity and others. diff --git a/src/mesa/main/errors.c b/src/mesa/main/errors.c index b3406665d94..f720de316e4 100644 --- a/src/mesa/main/errors.c +++ b/src/mesa/main/errors.c @@ -1314,7 +1314,7 @@ flush_delayed_errors( struct gl_context *ctx ) if (ctx->ErrorDebugCount) { _mesa_snprintf(s, MAX_DEBUG_MESSAGE_LENGTH, "%d similar %s errors", ctx->ErrorDebugCount, - _mesa_lookup_enum_by_nr(ctx->ErrorValue)); + _mesa_enum_to_string(ctx->ErrorValue)); output_if_debug("Mesa", s, GL_TRUE); @@ -1503,7 +1503,7 @@ _mesa_error( struct gl_context *ctx, GLenum error, const char *fmtString, ... ) } len = _mesa_snprintf(s2, MAX_DEBUG_MESSAGE_LENGTH, "%s in %s", - _mesa_lookup_enum_by_nr(error), s); + _mesa_enum_to_string(error), s); if (len >= MAX_DEBUG_MESSAGE_LENGTH) { /* Same as above. 
*/ assert(0); diff --git a/src/mesa/main/errors.h b/src/mesa/main/errors.h index 24f234f7f10..81e47a8b8c1 100644 --- a/src/mesa/main/errors.h +++ b/src/mesa/main/errors.h @@ -37,6 +37,7 @@ #include <stdio.h> +#include <stdarg.h> #include "compiler.h" #include "glheader.h" #include "mtypes.h" diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c index 4176a69ed7c..d934d19c3e7 100644 --- a/src/mesa/main/extensions.c +++ b/src/mesa/main/extensions.c @@ -121,6 +121,7 @@ static const struct extension extension_table[] = { { "GL_ARB_framebuffer_object", o(ARB_framebuffer_object), GL, 2005 }, { "GL_ARB_framebuffer_sRGB", o(EXT_framebuffer_sRGB), GL, 1998 }, { "GL_ARB_get_program_binary", o(dummy_true), GL, 2010 }, + { "GL_ARB_get_texture_sub_image", o(dummy_true), GL, 2014 }, { "GL_ARB_gpu_shader5", o(ARB_gpu_shader5), GLC, 2010 }, { "GL_ARB_gpu_shader_fp64", o(ARB_gpu_shader_fp64), GLC, 2010 }, { "GL_ARB_half_float_pixel", o(dummy_true), GL, 2003 }, @@ -154,6 +155,8 @@ static const struct extension extension_table[] = { { "GL_ARB_shader_objects", o(dummy_true), GL, 2002 }, { "GL_ARB_shader_precision", o(ARB_shader_precision), GL, 2010 }, { "GL_ARB_shader_stencil_export", o(ARB_shader_stencil_export), GL, 2009 }, + { "GL_ARB_shader_storage_buffer_object", o(ARB_shader_storage_buffer_object), GL, 2012 }, + { "GL_ARB_shader_subroutine", o(ARB_shader_subroutine), GLC, 2010 }, { "GL_ARB_shader_texture_lod", o(ARB_shader_texture_lod), GL, 2009 }, { "GL_ARB_shading_language_100", o(dummy_true), GLL, 2003 }, { "GL_ARB_shading_language_packing", o(ARB_shading_language_packing), GL, 2011 }, @@ -382,6 +385,9 @@ static const struct extension extension_table[] = { { "GL_NV_point_sprite", o(NV_point_sprite), GL, 2001 }, { "GL_NV_primitive_restart", o(NV_primitive_restart), GLL, 2002 }, { "GL_NV_read_buffer", o(dummy_true), ES2, 2011 }, + { "GL_NV_read_depth", o(dummy_true), ES2, 2011 }, + { "GL_NV_read_depth_stencil", o(dummy_true), ES2, 2011 }, + { "GL_NV_read_stencil", o(dummy_true), ES2, 2011 }, { "GL_NV_texgen_reflection", o(dummy_true), GLL, 1999 }, { "GL_NV_texture_barrier", o(NV_texture_barrier), GL, 2009 }, { "GL_NV_texture_env_combine4", o(NV_texture_env_combine4), GLL, 1999 }, diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c index f8dcf122d99..841834030df 100644 --- a/src/mesa/main/fbobject.c +++ b/src/mesa/main/fbobject.c @@ -2007,7 +2007,7 @@ renderbuffer_storage(struct gl_context *ctx, struct gl_renderbuffer *rb, baseFormat = _mesa_base_fbo_format(ctx, internalFormat); if (baseFormat == 0) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(internalFormat=%s)", - func, _mesa_lookup_enum_by_nr(internalFormat)); + func, _mesa_enum_to_string(internalFormat)); return; } @@ -2095,12 +2095,12 @@ renderbuffer_storage_named(GLuint renderbuffer, GLenum internalFormat, if (samples == NO_SAMPLES) _mesa_debug(ctx, "%s(%u, %s, %d, %d)\n", func, renderbuffer, - _mesa_lookup_enum_by_nr(internalFormat), + _mesa_enum_to_string(internalFormat), width, height); else _mesa_debug(ctx, "%s(%u, %s, %d, %d, %d)\n", func, renderbuffer, - _mesa_lookup_enum_by_nr(internalFormat), + _mesa_enum_to_string(internalFormat), width, height, samples); } @@ -2131,14 +2131,14 @@ renderbuffer_storage_target(GLenum target, GLenum internalFormat, if (samples == NO_SAMPLES) _mesa_debug(ctx, "%s(%s, %s, %d, %d)\n", func, - _mesa_lookup_enum_by_nr(target), - _mesa_lookup_enum_by_nr(internalFormat), + _mesa_enum_to_string(target), + _mesa_enum_to_string(internalFormat), width, height); else _mesa_debug(ctx, "%s(%s, %s, 
%d, %d, %d)\n", func, - _mesa_lookup_enum_by_nr(target), - _mesa_lookup_enum_by_nr(internalFormat), + _mesa_enum_to_string(target), + _mesa_enum_to_string(internalFormat), width, height, samples); } @@ -2311,7 +2311,7 @@ get_render_buffer_parameteriv(struct gl_context *ctx, /* fallthrough */ default: _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid pname=%s)", func, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); return; } } @@ -2694,13 +2694,13 @@ _mesa_CheckFramebufferStatus(GLenum target) if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glCheckFramebufferStatus(%s)\n", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); fb = get_framebuffer_target(ctx, target); if (!fb) { _mesa_error(ctx, GL_INVALID_ENUM, "glCheckFramebufferStatus(invalid target %s)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return 0; } @@ -2732,7 +2732,7 @@ _mesa_CheckNamedFramebufferStatus(GLuint framebuffer, GLenum target) default: _mesa_error(ctx, GL_INVALID_ENUM, "glCheckNamedFramebufferStatus(invalid target %s)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return 0; } @@ -2851,7 +2851,7 @@ check_layered_texture_target(struct gl_context *ctx, GLenum target, _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid texture target %s)", caller, - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return false; } @@ -2893,7 +2893,7 @@ check_texture_target(struct gl_context *ctx, GLenum target, _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid texture target %s)", caller, - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return false; } @@ -2944,8 +2944,9 @@ check_textarget(struct gl_context *ctx, int dims, GLenum target, break; case GL_TEXTURE_2D_MULTISAMPLE: case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: - err = _mesa_is_gles(ctx) - || !ctx->Extensions.ARB_texture_multisample; + err = (_mesa_is_gles(ctx) || + !ctx->Extensions.ARB_texture_multisample) && + !_mesa_is_gles31(ctx); break; default: err = true; @@ -2962,7 +2963,7 @@ check_textarget(struct gl_context *ctx, int dims, GLenum target, if (err) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid textarget %s)", - caller, _mesa_lookup_enum_by_nr(textarget)); + caller, _mesa_enum_to_string(textarget)); return false; } @@ -3074,7 +3075,7 @@ _mesa_framebuffer_texture(struct gl_context *ctx, struct gl_framebuffer *fb, att = get_attachment(ctx, fb, attachment); if (att == NULL) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid attachment %s)", caller, - _mesa_lookup_enum_by_nr(attachment)); + _mesa_enum_to_string(attachment)); return; } @@ -3157,7 +3158,7 @@ framebuffer_texture_with_dims(int dims, GLenum target, fb = get_framebuffer_target(ctx, target); if (!fb) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", caller, - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return; } @@ -3225,7 +3226,7 @@ _mesa_FramebufferTextureLayer(GLenum target, GLenum attachment, if (!fb) { _mesa_error(ctx, GL_INVALID_ENUM, "glFramebufferTextureLayer(invalid target %s)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return; } @@ -3304,7 +3305,7 @@ _mesa_FramebufferTexture(GLenum target, GLenum attachment, GET_CURRENT_CONTEXT(ctx); struct gl_framebuffer *fb; struct gl_texture_object *texObj; - GLboolean layered; + GLboolean layered = GL_FALSE; const char *func = "FramebufferTexture"; @@ -3319,7 +3320,7 @@ _mesa_FramebufferTexture(GLenum target, GLenum attachment, if (!fb) { _mesa_error(ctx, GL_INVALID_ENUM, 
"glFramebufferTexture(invalid target %s)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return; } @@ -3347,7 +3348,7 @@ _mesa_NamedFramebufferTexture(GLuint framebuffer, GLenum attachment, GET_CURRENT_CONTEXT(ctx); struct gl_framebuffer *fb; struct gl_texture_object *texObj; - GLboolean layered; + GLboolean layered = GL_FALSE; const char *func = "glNamedFramebufferTexture"; @@ -3400,7 +3401,7 @@ _mesa_framebuffer_renderbuffer(struct gl_context *ctx, if (att == NULL) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid attachment %s)", func, - _mesa_lookup_enum_by_nr(attachment)); + _mesa_enum_to_string(attachment)); return; } @@ -3440,7 +3441,7 @@ _mesa_FramebufferRenderbuffer(GLenum target, GLenum attachment, if (!fb) { _mesa_error(ctx, GL_INVALID_ENUM, "glFramebufferRenderbuffer(invalid target %s)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return; } @@ -3539,7 +3540,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx, attachment != GL_DEPTH && attachment != GL_STENCIL) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid attachment %s)", caller, - _mesa_lookup_enum_by_nr(attachment)); + _mesa_enum_to_string(attachment)); return; } /* the default / window-system FBO */ @@ -3552,7 +3553,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx, if (att == NULL) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid attachment %s)", caller, - _mesa_lookup_enum_by_nr(attachment)); + _mesa_enum_to_string(attachment)); return; } @@ -3609,7 +3610,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx, } else if (att->Type == GL_NONE) { _mesa_error(ctx, err, "%s(invalid pname %s)", caller, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); } else { goto invalid_pname_enum; @@ -3626,7 +3627,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx, } else if (att->Type == GL_NONE) { _mesa_error(ctx, err, "%s(invalid pname %s)", caller, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); } else { goto invalid_pname_enum; @@ -3637,7 +3638,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx, goto invalid_pname_enum; } else if (att->Type == GL_NONE) { _mesa_error(ctx, err, "%s(invalid pname %s)", caller, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); } else if (att->Type == GL_TEXTURE) { if (att->Texture && (att->Texture->Target == GL_TEXTURE_3D || att->Texture->Target == GL_TEXTURE_2D_ARRAY)) { @@ -3659,7 +3660,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx, } else if (att->Type == GL_NONE) { _mesa_error(ctx, err, "%s(invalid pname %s)", caller, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); } else { if (ctx->Extensions.EXT_framebuffer_sRGB) { @@ -3682,7 +3683,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx, } else if (att->Type == GL_NONE) { _mesa_error(ctx, err, "%s(invalid pname %s)", caller, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); } else { mesa_format format = att->Renderbuffer->Format; @@ -3734,7 +3735,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx, } else if (att->Type == GL_NONE) { _mesa_error(ctx, err, "%s(invalid pname %s)", caller, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); } else if (att->Texture) { const struct gl_texture_image *texImage = @@ -3763,7 +3764,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx, *params = att->Layered; } else if (att->Type == GL_NONE) { 
_mesa_error(ctx, err, "%s(invalid pname %s)", caller, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); } else { goto invalid_pname_enum; } @@ -3776,7 +3777,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx, invalid_pname_enum: _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid pname %s)", caller, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); return; } @@ -3792,7 +3793,7 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment, if (!buffer) { _mesa_error(ctx, GL_INVALID_ENUM, "glGetFramebufferAttachmentParameteriv(invalid target %s)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return; } @@ -4009,7 +4010,7 @@ invalidate_framebuffer_storage(struct gl_context *ctx, invalid_enum: _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid attachment %s)", name, - _mesa_lookup_enum_by_nr(attachments[i])); + _mesa_enum_to_string(attachments[i])); return; } @@ -4026,7 +4027,7 @@ _mesa_InvalidateSubFramebuffer(GLenum target, GLsizei numAttachments, if (!fb) { _mesa_error(ctx, GL_INVALID_ENUM, "glInvalidateSubFramebuffer(invalid target %s)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return; } @@ -4076,7 +4077,7 @@ _mesa_InvalidateFramebuffer(GLenum target, GLsizei numAttachments, if (!fb) { _mesa_error(ctx, GL_INVALID_ENUM, "glInvalidateFramebuffer(invalid target %s)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return; } @@ -4152,7 +4153,7 @@ _mesa_DiscardFramebufferEXT(GLenum target, GLsizei numAttachments, if (!fb) { _mesa_error(ctx, GL_INVALID_ENUM, "glDiscardFramebufferEXT(target %s)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return; } @@ -4189,5 +4190,5 @@ _mesa_DiscardFramebufferEXT(GLenum target, GLsizei numAttachments, invalid_enum: _mesa_error(ctx, GL_INVALID_ENUM, "glDiscardFramebufferEXT(attachment %s)", - _mesa_lookup_enum_by_nr(attachments[i])); + _mesa_enum_to_string(attachments[i])); } diff --git a/src/mesa/main/feedback.c b/src/mesa/main/feedback.c index 6bc4294f9c7..699e2a855a3 100644 --- a/src/mesa/main/feedback.c +++ b/src/mesa/main/feedback.c @@ -415,7 +415,7 @@ _mesa_RenderMode( GLenum mode ) ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, 0); if (MESA_VERBOSE & VERBOSE_API) - _mesa_debug(ctx, "glRenderMode %s\n", _mesa_lookup_enum_by_nr(mode)); + _mesa_debug(ctx, "glRenderMode %s\n", _mesa_enum_to_string(mode)); FLUSH_VERTICES(ctx, _NEW_RENDERMODE); diff --git a/src/mesa/main/ffvertex_prog.c b/src/mesa/main/ffvertex_prog.c index 70adaf88551..95b428dca3e 100644 --- a/src/mesa/main/ffvertex_prog.c +++ b/src/mesa/main/ffvertex_prog.c @@ -189,15 +189,15 @@ static void make_state_key( struct gl_context *ctx, struct state_key *key ) if (light->Enabled) { key->unit[i].light_enabled = 1; - if (light->EyePosition[3] == 0.0) + if (light->EyePosition[3] == 0.0F) key->unit[i].light_eyepos3_is_zero = 1; - if (light->SpotCutoff == 180.0) + if (light->SpotCutoff == 180.0F) key->unit[i].light_spotcutoff_is_180 = 1; - if (light->ConstantAttenuation != 1.0 || - light->LinearAttenuation != 0.0 || - light->QuadraticAttenuation != 0.0) + if (light->ConstantAttenuation != 1.0F || + light->LinearAttenuation != 0.0F || + light->QuadraticAttenuation != 0.0F) key->unit[i].light_attenuated = 1; } } diff --git a/src/mesa/main/fog.c b/src/mesa/main/fog.c index 3bce289e785..45f343d61c8 100644 --- a/src/mesa/main/fog.c +++ b/src/mesa/main/fog.c @@ -115,7 +115,7 @@ _mesa_Fogfv( GLenum pname, const GLfloat *params ) ctx->Fog.Mode = m; break; case 
GL_FOG_DENSITY: - if (*params<0.0) { + if (*params<0.0F) { _mesa_error( ctx, GL_INVALID_VALUE, "glFog" ); return; } diff --git a/src/mesa/main/format_parser.py b/src/mesa/main/format_parser.py index 11184f78e2c..799b14f0b1c 100755 --- a/src/mesa/main/format_parser.py +++ b/src/mesa/main/format_parser.py @@ -40,9 +40,6 @@ SRGB = 'srgb' YUV = 'yuv' ZS = 'zs' -def is_power_of_two(x): - return not bool(x & (x - 1)) - VERY_LARGE = 99999999999999999999999 class Channel: @@ -100,10 +97,6 @@ class Channel: else: return 1 - def is_power_of_two(self): - """Returns true if the size of this channel is a power of two.""" - return is_power_of_two(self.size) - def datatype(self): """Returns the datatype corresponding to a channel type and size""" return _get_datatype(self.type, self.size) diff --git a/src/mesa/main/format_utils.h b/src/mesa/main/format_utils.h index 7f500ec78da..618f43d0aaa 100644 --- a/src/mesa/main/format_utils.h +++ b/src/mesa/main/format_utils.h @@ -33,6 +33,7 @@ #include "imports.h" #include "macros.h" +#include "util/rounding.h" extern const mesa_array_format RGBA32_FLOAT; extern const mesa_array_format RGBA8_UBYTE; @@ -84,7 +85,7 @@ _mesa_float_to_unorm(float x, unsigned dst_bits) else if (x > 1.0f) return MAX_UINT(dst_bits); else - return F_TO_I(x * MAX_UINT(dst_bits)); + return _mesa_lroundevenf(x * MAX_UINT(dst_bits)); } static inline unsigned @@ -98,7 +99,7 @@ _mesa_unorm_to_unorm(unsigned x, unsigned src_bits, unsigned dst_bits) { if (src_bits < dst_bits) { return EXTEND_NORMALIZED_INT(x, src_bits, dst_bits); - } else { + } else if (src_bits > dst_bits) { unsigned src_half = (1 << (src_bits - 1)) - 1; if (src_bits + dst_bits > sizeof(x) * 8) { @@ -108,6 +109,8 @@ _mesa_unorm_to_unorm(unsigned x, unsigned src_bits, unsigned dst_bits) } else { return (x * MAX_UINT(dst_bits) + src_half) / MAX_UINT(src_bits); } + } else { + return x; } } @@ -128,7 +131,7 @@ _mesa_float_to_snorm(float x, unsigned dst_bits) else if (x > 1.0f) return MAX_INT(dst_bits); else - return F_TO_I(x * MAX_INT(dst_bits)); + return _mesa_lroundevenf(x * MAX_INT(dst_bits)); } static inline int diff --git a/src/mesa/main/formatquery.c b/src/mesa/main/formatquery.c index 7741cabada1..85f7b6b5664 100644 --- a/src/mesa/main/formatquery.c +++ b/src/mesa/main/formatquery.c @@ -74,13 +74,15 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname, case GL_TEXTURE_2D_MULTISAMPLE: case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: /* These enums are only valid if ARB_texture_multisample is supported */ - if (_mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_multisample) + if ((_mesa_is_desktop_gl(ctx) && + ctx->Extensions.ARB_texture_multisample) || + _mesa_is_gles31(ctx)) break; default: _mesa_error(ctx, GL_INVALID_ENUM, "glGetInternalformativ(target=%s)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return; } @@ -107,7 +109,7 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname, _mesa_base_fbo_format(ctx, internalformat) == 0) { _mesa_error(ctx, GL_INVALID_ENUM, "glGetInternalformativ(internalformat=%s)", - _mesa_lookup_enum_by_nr(internalformat)); + _mesa_enum_to_string(internalformat)); return; } @@ -119,7 +121,7 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname, if (bufSize < 0) { _mesa_error(ctx, GL_INVALID_VALUE, "glGetInternalformativ(target=%s)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return; } @@ -168,7 +170,7 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum 
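/*
 * Editorial note: the format_utils.h hunk above makes the src_bits == dst_bits
 * case of _mesa_unorm_to_unorm() an explicit fall-through and moves the float
 * paths onto round-to-nearest-even.  A minimal standalone sketch of the
 * widen / narrow / identity behaviour follows; the rescale used here is only
 * illustrative (Mesa widens via bit replication through EXTEND_NORMALIZED_INT,
 * which is not reproduced), and max_uint() is a stand-in for MAX_UINT().
 */
#include <stdio.h>

static unsigned max_uint(unsigned bits)          /* 2^bits - 1, bits < 32 */
{
   return (1u << bits) - 1;
}

static unsigned unorm_to_unorm(unsigned x, unsigned src_bits, unsigned dst_bits)
{
   if (src_bits == dst_bits)
      return x;                                  /* the case the patch makes explicit */
   /* rescale with rounding to the nearest representable dst value */
   return (x * max_uint(dst_bits) + max_uint(src_bits) / 2) / max_uint(src_bits);
}

int main(void)
{
   printf("%u %u %u\n",
          unorm_to_unorm(31, 5, 8),              /* widen:   31/31  -> 255/255 */
          unorm_to_unorm(128, 8, 5),             /* narrow: 128/255 ->  16/31  */
          unorm_to_unorm(200, 8, 8));            /* identity: 200 stays 200    */
   return 0;
}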
pname, default: _mesa_error(ctx, GL_INVALID_ENUM, "glGetInternalformativ(pname=%s)", - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); return; } diff --git a/src/mesa/main/formats.c b/src/mesa/main/formats.c index baeb1bfe5de..d7b2bae59e7 100644 --- a/src/mesa/main/formats.c +++ b/src/mesa/main/formats.c @@ -354,14 +354,22 @@ _mesa_array_format_flip_channels(mesa_array_format format) return format; if (num_channels == 2) { - _mesa_array_format_set_swizzle(&format, swizzle[1], swizzle[0], - swizzle[2], swizzle[3]); + /* Assert that the swizzle makes sense for 2 channels */ + for (unsigned i = 0; i < 4; i++) + assert(swizzle[i] != 2 && swizzle[i] != 3); + + static const uint8_t flip_xy[6] = { 1, 0, 2, 3, 4, 5 }; + _mesa_array_format_set_swizzle(&format, + flip_xy[swizzle[0]], flip_xy[swizzle[1]], + flip_xy[swizzle[2]], flip_xy[swizzle[3]]); return format; } if (num_channels == 4) { - _mesa_array_format_set_swizzle(&format, swizzle[3], swizzle[2], - swizzle[1], swizzle[0]); + static const uint8_t flip[6] = { 3, 2, 1, 0, 4, 5 }; + _mesa_array_format_set_swizzle(&format, + flip[swizzle[0]], flip[swizzle[1]], + flip[swizzle[2]], flip[swizzle[3]]); return format; } @@ -372,10 +380,11 @@ uint32_t _mesa_format_to_array_format(mesa_format format) { const struct gl_format_info *info = _mesa_get_format_info(format); - if (_mesa_little_endian()) - return info->ArrayFormat; - else + if (info->ArrayFormat && !_mesa_little_endian() && + info->Layout == MESA_FORMAT_LAYOUT_PACKED) return _mesa_array_format_flip_channels(info->ArrayFormat); + else + return info->ArrayFormat; } static struct hash_table *format_array_format_table; diff --git a/src/mesa/main/formats.h b/src/mesa/main/formats.h index 7e451caf0ff..d938e6ad513 100644 --- a/src/mesa/main/formats.h +++ b/src/mesa/main/formats.h @@ -191,6 +191,11 @@ static inline void _mesa_array_format_set_swizzle(mesa_array_format *f, int32_t x, int32_t y, int32_t z, int32_t w) { + *f &= ~(MESA_ARRAY_FORMAT_SWIZZLE_X_MASK | + MESA_ARRAY_FORMAT_SWIZZLE_Y_MASK | + MESA_ARRAY_FORMAT_SWIZZLE_Z_MASK | + MESA_ARRAY_FORMAT_SWIZZLE_W_MASK); + *f |= ((x << 8 ) & MESA_ARRAY_FORMAT_SWIZZLE_X_MASK) | ((y << 11) & MESA_ARRAY_FORMAT_SWIZZLE_Y_MASK) | ((z << 14) & MESA_ARRAY_FORMAT_SWIZZLE_Z_MASK) | diff --git a/src/mesa/main/framebuffer.c b/src/mesa/main/framebuffer.c index 77c04b8dab8..37e2c29c89c 100644 --- a/src/mesa/main/framebuffer.c +++ b/src/mesa/main/framebuffer.c @@ -938,7 +938,7 @@ _mesa_print_framebuffer(const struct gl_framebuffer *fb) fprintf(stderr, "Mesa Framebuffer %u at %p\n", fb->Name, (void *) fb); fprintf(stderr, " Size: %u x %u Status: %s\n", fb->Width, fb->Height, - _mesa_lookup_enum_by_nr(fb->_Status)); + _mesa_enum_to_string(fb->_Status)); fprintf(stderr, " Attachments:\n"); for (i = 0; i < BUFFER_COUNT; i++) { diff --git a/src/mesa/main/genmipmap.c b/src/mesa/main/genmipmap.c index 9aef090194e..c18f9d5223f 100644 --- a/src/mesa/main/genmipmap.c +++ b/src/mesa/main/genmipmap.c @@ -83,7 +83,7 @@ _mesa_generate_texture_mipmap(struct gl_context *ctx, if (error) { _mesa_error(ctx, GL_INVALID_ENUM, "glGenerate%sMipmap(target=%s)", - suffix, _mesa_lookup_enum_by_nr(target)); + suffix, _mesa_enum_to_string(target)); return; } diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c index 3d6d63916b3..307a5ffbd1c 100644 --- a/src/mesa/main/get.c +++ b/src/mesa/main/get.c @@ -149,6 +149,8 @@ enum value_extra { EXTRA_EXT_UBO_GS4, EXTRA_EXT_ATOMICS_GS4, EXTRA_EXT_SHADER_IMAGE_GS4, + EXTRA_EXT_ATOMICS_TESS, + EXTRA_EXT_SHADER_IMAGE_TESS, }; #define 
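/*
 * Editorial note: the formats.h hunk above fixes _mesa_array_format_set_swizzle()
 * to clear the previous swizzle bits before OR-ing in the new ones, which the
 * flip_channels() rework in formats.c relies on.  A toy 4x3-bit swizzle field
 * showing why the mask-then-set matters; the field layout below is invented
 * for illustration and is not Mesa's MESA_ARRAY_FORMAT layout.
 */
#include <assert.h>
#include <stdint.h>

#define SWZ_MASK(i)  (0x7u << (3 * (i)))

static void set_swizzle(uint32_t *f, unsigned x, unsigned y, unsigned z, unsigned w)
{
   /* without this clear, bits of any previously stored swizzle would bleed
    * through the OR below */
   *f &= ~(SWZ_MASK(0) | SWZ_MASK(1) | SWZ_MASK(2) | SWZ_MASK(3));
   *f |= (x << 0) | (y << 3) | (z << 6) | (w << 9);
}

int main(void)
{
   uint32_t f = 0;
   set_swizzle(&f, 3, 2, 1, 0);                  /* a BGRA-style swizzle */
   set_swizzle(&f, 0, 1, 2, 3);                  /* overwrite with identity */
   assert(f == ((1u << 3) | (2u << 6) | (3u << 9)));
   return 0;
}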
NO_EXTRA NULL @@ -349,12 +351,58 @@ static const int extra_ARB_shader_image_load_store_and_geometry_shader[] = { EXTRA_END }; +static const int extra_ARB_shader_atomic_counters_and_tessellation[] = { + EXTRA_EXT_ATOMICS_TESS, + EXTRA_END +}; + +static const int extra_ARB_shader_image_load_store_and_tessellation[] = { + EXTRA_EXT_SHADER_IMAGE_TESS, + EXTRA_END +}; + static const int extra_ARB_draw_indirect_es31[] = { EXT(ARB_draw_indirect), EXTRA_API_ES31, EXTRA_END }; +static const int extra_ARB_shader_image_load_store_es31[] = { + EXT(ARB_shader_image_load_store), + EXTRA_API_ES31, + EXTRA_END +}; + +static const int extra_ARB_shader_atomic_counters_es31[] = { + EXT(ARB_shader_atomic_counters), + EXTRA_API_ES31, + EXTRA_END +}; + +static const int extra_ARB_texture_multisample_es31[] = { + EXT(ARB_texture_multisample), + EXTRA_API_ES31, + EXTRA_END +}; + +static const int extra_ARB_texture_gather_es31[] = { + EXT(ARB_texture_gather), + EXTRA_API_ES31, + EXTRA_END +}; + +static const int extra_ARB_compute_shader_es31[] = { + EXT(ARB_compute_shader), + EXTRA_API_ES31, + EXTRA_END +}; + +static const int extra_ARB_explicit_uniform_location_es31[] = { + EXT(ARB_explicit_uniform_location), + EXTRA_API_ES31, + EXTRA_END +}; + EXTRA_EXT(ARB_texture_cube_map); EXTRA_EXT(EXT_texture_array); EXTRA_EXT(NV_fog_distance); @@ -401,6 +449,8 @@ EXTRA_EXT(ARB_explicit_uniform_location); EXTRA_EXT(ARB_clip_control); EXTRA_EXT(EXT_polygon_offset_clamp); EXTRA_EXT(ARB_framebuffer_no_attachments); +EXTRA_EXT(ARB_tessellation_shader); +EXTRA_EXT(ARB_shader_subroutine); static const int extra_ARB_color_buffer_float_or_glcore[] = { @@ -626,7 +676,7 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu break; case GL_EDGE_FLAG: - v->value_bool = ctx->Current.Attrib[VERT_ATTRIB_EDGEFLAG][0] == 1.0; + v->value_bool = ctx->Current.Attrib[VERT_ATTRIB_EDGEFLAG][0] == 1.0F; break; case GL_READ_BUFFER: @@ -1149,6 +1199,16 @@ check_extra(struct gl_context *ctx, const char *func, const struct value_desc *d api_found = (ctx->Extensions.ARB_shader_image_load_store && _mesa_has_geometry_shaders(ctx)); break; + case EXTRA_EXT_ATOMICS_TESS: + api_check = GL_TRUE; + api_found = ctx->Extensions.ARB_shader_atomic_counters && + _mesa_has_tessellation(ctx); + break; + case EXTRA_EXT_SHADER_IMAGE_TESS: + api_check = GL_TRUE; + api_found = ctx->Extensions.ARB_shader_image_load_store && + _mesa_has_tessellation(ctx); + break; case EXTRA_END: break; default: /* *e is a offset into the extension struct */ @@ -1161,7 +1221,7 @@ check_extra(struct gl_context *ctx, const char *func, const struct value_desc *d if (api_check && !api_found) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=%s)", func, - _mesa_lookup_enum_by_nr(d->pname)); + _mesa_enum_to_string(d->pname)); return GL_FALSE; } @@ -1208,10 +1268,13 @@ find_value(const char *func, GLenum pname, void **p, union value *v) * value since it's compatible with GLES2 its entry in table_set[] is at the * end. */ - STATIC_ASSERT(ARRAY_SIZE(table_set) == API_OPENGL_LAST + 2); + STATIC_ASSERT(ARRAY_SIZE(table_set) == API_OPENGL_LAST + 3); if (_mesa_is_gles3(ctx)) { api = API_OPENGL_LAST + 1; } + if (_mesa_is_gles31(ctx)) { + api = API_OPENGL_LAST + 2; + } mask = ARRAY_SIZE(table(api)) - 1; hash = (pname * prime_factor); while (1) { @@ -1222,7 +1285,7 @@ find_value(const char *func, GLenum pname, void **p, union value *v) * any valid enum. 
*/ if (unlikely(idx == 0)) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=%s)", func, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); return &error_value; } @@ -2004,11 +2067,11 @@ find_value_indexed(const char *func, GLenum pname, GLuint index, union value *v) invalid_enum: _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=%s)", func, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); return TYPE_INVALID; invalid_value: _mesa_error(ctx, GL_INVALID_VALUE, "%s(pname=%s)", func, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); return TYPE_INVALID; } diff --git a/src/mesa/main/get_hash_generator.py b/src/mesa/main/get_hash_generator.py index b200d197341..c777b782442 100644 --- a/src/mesa/main/get_hash_generator.py +++ b/src/mesa/main/get_hash_generator.py @@ -44,7 +44,7 @@ prime_factor = 89 prime_step = 281 hash_table_size = 1024 -gl_apis=set(["GL", "GL_CORE", "GLES", "GLES2", "GLES3"]) +gl_apis=set(["GL", "GL_CORE", "GLES", "GLES2", "GLES3", "GLES31"]) def print_header(): print "typedef const unsigned short table_t[%d];\n" % (hash_table_size) @@ -68,6 +68,7 @@ api_enum = [ 'GLES2', 'GL_CORE', 'GLES3', # Not in gl_api enum in mtypes.h + 'GLES31', # Not in gl_api enum in mtypes.h ] def api_index(api): @@ -167,10 +168,13 @@ def generate_hash_tables(enum_list, enabled_apis, param_descriptors): for api in valid_apis: add_to_hash_table(tables[api], hash_val, len(params)) - # Also add GLES2 items to the GLES3 hash table + # Also add GLES2 items to the GLES3 and GLES31 hash table if api == "GLES2": add_to_hash_table(tables["GLES3"], hash_val, len(params)) - + add_to_hash_table(tables["GLES31"], hash_val, len(params)) + # Also add GLES3 items to the GLES31 hash table + if api == "GLES3": + add_to_hash_table(tables["GLES31"], hash_val, len(params)) params.append(["GL_" + enum_name, param[1]]) sorted_tables={} @@ -206,7 +210,7 @@ if __name__ == '__main__': die("missing descriptor file (-f)\n") # generate the code for all APIs - enabled_apis = set(["GLES", "GLES2", "GLES3", "GL", "GL_CORE"]) + enabled_apis = set(["GLES", "GLES2", "GLES3", "GLES31", "GL", "GL_CORE"]) try: api_desc = gl_XML.parse_GL_API(api_desc_file) diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py index 74ff3ba6619..7dc92f10100 100644 --- a/src/mesa/main/get_hash_params.py +++ b/src/mesa/main/get_hash_params.py @@ -351,6 +351,9 @@ descriptor=[ # GL_ARB_framebuffer_object [ "MAX_SAMPLES", "CONTEXT_INT(Const.MaxSamples), extra_ARB_framebuffer_object_EXT_framebuffer_multisample" ], +# GL_ARB_sampler_objects / GL 3.3 / GLES 3.0 + [ "SAMPLER_BINDING", "LOC_CUSTOM, TYPE_INT, GL_SAMPLER_BINDING, NO_EXTRA" ], + # GL_ARB_sync [ "MAX_SERVER_WAIT_TIMEOUT", "CONTEXT_INT64(Const.MaxServerWaitTimeout), extra_ARB_sync" ], @@ -404,9 +407,49 @@ descriptor=[ [ "TEXTURE_EXTERNAL_OES", "LOC_CUSTOM, TYPE_BOOLEAN, 0, extra_OES_EGL_image_external" ], ]}, -{ "apis": ["GL", "GL_CORE", "GLES3"], "params": [ -# GL_ARB_sampler_objects / GL 3.3 / GLES 3.0 - [ "SAMPLER_BINDING", "LOC_CUSTOM, TYPE_INT, GL_SAMPLER_BINDING, NO_EXTRA" ], +# Enums in OpenGL and ES 3.1 +{ "apis": ["GL", "GL_CORE", "GLES31"], "params": [ +# GL_ARB_shader_image_load_store / GLES 3.1 + [ "MAX_IMAGE_UNITS", "CONTEXT_INT(Const.MaxImageUnits), extra_ARB_shader_image_load_store_es31" ], + [ "MAX_VERTEX_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxImageUniforms), extra_ARB_shader_image_load_store_es31" ], + [ "MAX_FRAGMENT_IMAGE_UNIFORMS", 
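/*
 * Editorial note: the get.c / get_hash_generator.py hunks above add a GLES31
 * lookup table alongside the existing per-API tables.  find_value() walks
 * those tables with a multiply-by-prime hash and a prime probe step over a
 * power-of-two table.  The sketch below shows the same probing scheme in
 * isolation; storing the pname directly per slot is a simplification (Mesa
 * stores an index into the value-descriptor array instead), and the empty
 * tables here are placeholders, not the generated ones.
 */
#include <stdio.h>

#define TABLE_SIZE   1024                        /* must stay a power of two */
#define PRIME_FACTOR 89
#define PRIME_STEP   281

static unsigned       table_pname[TABLE_SIZE];   /* 0 means "empty slot" */
static unsigned short table_value[TABLE_SIZE];

static int find_slot(unsigned pname)
{
   const unsigned mask = TABLE_SIZE - 1;
   unsigned hash = pname * PRIME_FACTOR;

   while (1) {
      const unsigned slot = hash & mask;
      if (table_pname[slot] == 0)
         return -1;                              /* unknown pname -> GL_INVALID_ENUM in Mesa */
      if (table_pname[slot] == pname)
         return table_value[slot];               /* index of the matching descriptor */
      hash += PRIME_STEP;                        /* collision: keep probing */
   }
}

int main(void)
{
   /* with empty tables every query misses, as an unrecognized enum would */
   printf("%d\n", find_slot(0x0B21 /* GL_LINE_WIDTH */));
   return 0;
}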
"CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxImageUniforms), extra_ARB_shader_image_load_store_es31" ], + [ "MAX_COMBINED_IMAGE_UNIFORMS", "CONTEXT_INT(Const.MaxCombinedImageUniforms), extra_ARB_shader_image_load_store_es31" ], + +# GL_ARB_shader_atomic_counters / GLES 3.1 + [ "ATOMIC_COUNTER_BUFFER_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_shader_atomic_counters_es31" ], + [ "MAX_ATOMIC_COUNTER_BUFFER_BINDINGS", "CONTEXT_INT(Const.MaxAtomicBufferBindings), extra_ARB_shader_atomic_counters_es31" ], + [ "MAX_ATOMIC_COUNTER_BUFFER_SIZE", "CONTEXT_INT(Const.MaxAtomicBufferSize), extra_ARB_shader_atomic_counters_es31" ], + [ "MAX_VERTEX_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxAtomicBuffers), extra_ARB_shader_atomic_counters_es31" ], + [ "MAX_VERTEX_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxAtomicCounters), extra_ARB_shader_atomic_counters_es31" ], + [ "MAX_FRAGMENT_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicBuffers), extra_ARB_shader_atomic_counters_es31" ], + [ "MAX_FRAGMENT_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicCounters), extra_ARB_shader_atomic_counters_es31" ], + [ "MAX_COMBINED_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.MaxCombinedAtomicBuffers), extra_ARB_shader_atomic_counters_es31" ], + [ "MAX_COMBINED_ATOMIC_COUNTERS", "CONTEXT_INT(Const.MaxCombinedAtomicCounters), extra_ARB_shader_atomic_counters_es31" ], + +# GL_ARB_texture_multisample / GLES 3.1 + [ "TEXTURE_BINDING_2D_MULTISAMPLE", "LOC_CUSTOM, TYPE_INT, TEXTURE_2D_MULTISAMPLE_INDEX, extra_ARB_texture_multisample_es31" ], + [ "MAX_COLOR_TEXTURE_SAMPLES", "CONTEXT_INT(Const.MaxColorTextureSamples), extra_ARB_texture_multisample_es31" ], + [ "MAX_DEPTH_TEXTURE_SAMPLES", "CONTEXT_INT(Const.MaxDepthTextureSamples), extra_ARB_texture_multisample_es31" ], + [ "MAX_INTEGER_SAMPLES", "CONTEXT_INT(Const.MaxIntegerSamples), extra_ARB_texture_multisample_es31" ], + [ "SAMPLE_MASK", "CONTEXT_BOOL(Multisample.SampleMask), extra_ARB_texture_multisample_es31" ], + [ "MAX_SAMPLE_MASK_WORDS", "CONST(1), extra_ARB_texture_multisample_es31" ], + +# GL_ARB_texture_gather / GLES 3.1 + [ "MIN_PROGRAM_TEXTURE_GATHER_OFFSET", "CONTEXT_INT(Const.MinProgramTextureGatherOffset), extra_ARB_texture_gather_es31"], + [ "MAX_PROGRAM_TEXTURE_GATHER_OFFSET", "CONTEXT_INT(Const.MaxProgramTextureGatherOffset), extra_ARB_texture_gather_es31"], + +# GL_ARB_compute_shader / GLES 3.1 + [ "MAX_COMPUTE_WORK_GROUP_INVOCATIONS", "CONTEXT_INT(Const.MaxComputeWorkGroupInvocations), extra_ARB_compute_shader_es31" ], + [ "MAX_COMPUTE_UNIFORM_BLOCKS", "CONST(MAX_COMPUTE_UNIFORM_BLOCKS), extra_ARB_compute_shader_es31" ], + [ "MAX_COMPUTE_TEXTURE_IMAGE_UNITS", "CONST(MAX_COMPUTE_TEXTURE_IMAGE_UNITS), extra_ARB_compute_shader_es31" ], + [ "MAX_COMPUTE_ATOMIC_COUNTER_BUFFERS", "CONST(MAX_COMPUTE_ATOMIC_COUNTER_BUFFERS), extra_ARB_compute_shader_es31" ], + [ "MAX_COMPUTE_ATOMIC_COUNTERS", "CONST(MAX_COMPUTE_ATOMIC_COUNTERS), extra_ARB_compute_shader_es31" ], + [ "MAX_COMPUTE_SHARED_MEMORY_SIZE", "CONST(MAX_COMPUTE_SHARED_MEMORY_SIZE), extra_ARB_compute_shader_es31" ], + [ "MAX_COMPUTE_UNIFORM_COMPONENTS", "CONST(MAX_COMPUTE_UNIFORM_COMPONENTS), extra_ARB_compute_shader_es31" ], + [ "MAX_COMPUTE_IMAGE_UNIFORMS", "CONST(MAX_COMPUTE_IMAGE_UNIFORMS), extra_ARB_compute_shader_es31" ], + +# GL_ARB_explicit_uniform_location / GLES 3.1 + [ "MAX_UNIFORM_LOCATIONS", "CONTEXT_INT(Const.MaxUserAssignableUniformLocations), 
extra_ARB_explicit_uniform_location_es31" ], ]}, # Enums in OpenGL Core profile and ES 3.1 @@ -498,7 +541,6 @@ descriptor=[ [ "MAX_LIST_NESTING", "CONST(MAX_LIST_NESTING), NO_EXTRA" ], [ "MAX_NAME_STACK_DEPTH", "CONST(MAX_NAME_STACK_DEPTH), NO_EXTRA" ], [ "MAX_PIXEL_MAP_TABLE", "CONST(MAX_PIXEL_MAP_TABLE), NO_EXTRA" ], - [ "MAX_UNIFORM_LOCATIONS", "CONTEXT_INT(Const.MaxUserAssignableUniformLocations), extra_ARB_explicit_uniform_location" ], [ "NAME_STACK_DEPTH", "CONTEXT_INT(Select.NameStackDepth), NO_EXTRA" ], [ "PACK_LSB_FIRST", "CONTEXT_BOOL(Pack.LsbFirst), NO_EXTRA" ], [ "PACK_SWAP_BYTES", "CONTEXT_BOOL(Pack.SwapBytes), NO_EXTRA" ], @@ -699,13 +741,7 @@ descriptor=[ [ "TEXTURE_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ], # GL_ARB_texture_multisample / GL 3.2 - [ "TEXTURE_BINDING_2D_MULTISAMPLE", "LOC_CUSTOM, TYPE_INT, TEXTURE_2D_MULTISAMPLE_INDEX, extra_ARB_texture_multisample" ], [ "TEXTURE_BINDING_2D_MULTISAMPLE_ARRAY", "LOC_CUSTOM, TYPE_INT, TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX, extra_ARB_texture_multisample" ], - [ "MAX_COLOR_TEXTURE_SAMPLES", "CONTEXT_INT(Const.MaxColorTextureSamples), extra_ARB_texture_multisample" ], - [ "MAX_DEPTH_TEXTURE_SAMPLES", "CONTEXT_INT(Const.MaxDepthTextureSamples), extra_ARB_texture_multisample" ], - [ "MAX_INTEGER_SAMPLES", "CONTEXT_INT(Const.MaxIntegerSamples), extra_ARB_texture_multisample" ], - [ "SAMPLE_MASK", "CONTEXT_BOOL(Multisample.SampleMask), extra_ARB_texture_multisample" ], - [ "MAX_SAMPLE_MASK_WORDS", "CONST(1), extra_ARB_texture_multisample" ], # GL 3.0 [ "CONTEXT_FLAGS", "CONTEXT_INT(Const.ContextFlags), extra_version_30" ], @@ -756,48 +792,23 @@ descriptor=[ [ "TEXTURE_BINDING_CUBE_MAP_ARRAY_ARB", "LOC_CUSTOM, TYPE_INT, TEXTURE_CUBE_ARRAY_INDEX, extra_ARB_texture_cube_map_array" ], # GL_ARB_texture_gather - [ "MIN_PROGRAM_TEXTURE_GATHER_OFFSET", "CONTEXT_INT(Const.MinProgramTextureGatherOffset), extra_ARB_texture_gather"], - [ "MAX_PROGRAM_TEXTURE_GATHER_OFFSET", "CONTEXT_INT(Const.MaxProgramTextureGatherOffset), extra_ARB_texture_gather"], [ "MAX_PROGRAM_TEXTURE_GATHER_COMPONENTS_ARB", "CONTEXT_INT(Const.MaxProgramTextureGatherComponents), extra_ARB_texture_gather"], # GL_ARB_separate_shader_objects [ "PROGRAM_PIPELINE_BINDING", "LOC_CUSTOM, TYPE_INT, GL_PROGRAM_PIPELINE_BINDING, NO_EXTRA" ], # GL_ARB_shader_atomic_counters - [ "ATOMIC_COUNTER_BUFFER_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_shader_atomic_counters" ], - [ "MAX_ATOMIC_COUNTER_BUFFER_BINDINGS", "CONTEXT_INT(Const.MaxAtomicBufferBindings), extra_ARB_shader_atomic_counters" ], - [ "MAX_ATOMIC_COUNTER_BUFFER_SIZE", "CONTEXT_INT(Const.MaxAtomicBufferSize), extra_ARB_shader_atomic_counters" ], - [ "MAX_VERTEX_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxAtomicBuffers), extra_ARB_shader_atomic_counters" ], - [ "MAX_VERTEX_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxAtomicCounters), extra_ARB_shader_atomic_counters" ], - [ "MAX_FRAGMENT_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicBuffers), extra_ARB_shader_atomic_counters" ], - [ "MAX_FRAGMENT_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicCounters), extra_ARB_shader_atomic_counters" ], [ "MAX_GEOMETRY_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicBuffers), extra_ARB_shader_atomic_counters_and_geometry_shader" ], [ "MAX_GEOMETRY_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicCounters), 
extra_ARB_shader_atomic_counters_and_geometry_shader" ], - [ "MAX_COMBINED_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.MaxCombinedAtomicBuffers), extra_ARB_shader_atomic_counters" ], - [ "MAX_COMBINED_ATOMIC_COUNTERS", "CONTEXT_INT(Const.MaxCombinedAtomicCounters), extra_ARB_shader_atomic_counters" ], # GL_ARB_vertex_attrib_binding [ "MAX_VERTEX_ATTRIB_RELATIVE_OFFSET", "CONTEXT_ENUM(Const.MaxVertexAttribRelativeOffset), NO_EXTRA" ], [ "MAX_VERTEX_ATTRIB_BINDINGS", "CONTEXT_ENUM(Const.MaxVertexAttribBindings), NO_EXTRA" ], # GL_ARB_shader_image_load_store - [ "MAX_IMAGE_UNITS", "CONTEXT_INT(Const.MaxImageUnits), extra_ARB_shader_image_load_store"], - [ "MAX_COMBINED_IMAGE_UNITS_AND_FRAGMENT_OUTPUTS", "CONTEXT_INT(Const.MaxCombinedImageUnitsAndFragmentOutputs), extra_ARB_shader_image_load_store"], - [ "MAX_IMAGE_SAMPLES", "CONTEXT_INT(Const.MaxImageSamples), extra_ARB_shader_image_load_store"], - [ "MAX_VERTEX_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxImageUniforms), extra_ARB_shader_image_load_store"], + [ "MAX_COMBINED_IMAGE_UNITS_AND_FRAGMENT_OUTPUTS", "CONTEXT_INT(Const.MaxCombinedImageUnitsAndFragmentOutputs), extra_ARB_shader_image_load_store" ], + [ "MAX_IMAGE_SAMPLES", "CONTEXT_INT(Const.MaxImageSamples), extra_ARB_shader_image_load_store" ], [ "MAX_GEOMETRY_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxImageUniforms), extra_ARB_shader_image_load_store_and_geometry_shader"], - [ "MAX_FRAGMENT_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxImageUniforms), extra_ARB_shader_image_load_store"], - [ "MAX_COMBINED_IMAGE_UNIFORMS", "CONTEXT_INT(Const.MaxCombinedImageUniforms), extra_ARB_shader_image_load_store"], - -# GL_ARB_compute_shader - [ "MAX_COMPUTE_WORK_GROUP_INVOCATIONS", "CONTEXT_INT(Const.MaxComputeWorkGroupInvocations), extra_ARB_compute_shader" ], - [ "MAX_COMPUTE_UNIFORM_BLOCKS", "CONST(MAX_COMPUTE_UNIFORM_BLOCKS), extra_ARB_compute_shader" ], - [ "MAX_COMPUTE_TEXTURE_IMAGE_UNITS", "CONST(MAX_COMPUTE_TEXTURE_IMAGE_UNITS), extra_ARB_compute_shader" ], - [ "MAX_COMPUTE_ATOMIC_COUNTER_BUFFERS", "CONST(MAX_COMPUTE_ATOMIC_COUNTER_BUFFERS), extra_ARB_compute_shader" ], - [ "MAX_COMPUTE_ATOMIC_COUNTERS", "CONST(MAX_COMPUTE_ATOMIC_COUNTERS), extra_ARB_compute_shader" ], - [ "MAX_COMPUTE_SHARED_MEMORY_SIZE", "CONST(MAX_COMPUTE_SHARED_MEMORY_SIZE), extra_ARB_compute_shader" ], - [ "MAX_COMPUTE_UNIFORM_COMPONENTS", "CONST(MAX_COMPUTE_UNIFORM_COMPONENTS), extra_ARB_compute_shader" ], - [ "MAX_COMPUTE_IMAGE_UNIFORMS", "CONST(MAX_COMPUTE_IMAGE_UNIFORMS), extra_ARB_compute_shader" ], # GL_ARB_framebuffer_no_attachments ["MAX_FRAMEBUFFER_WIDTH", "CONTEXT_INT(Const.MaxFramebufferWidth), extra_ARB_framebuffer_no_attachments"], @@ -826,6 +837,38 @@ descriptor=[ [ "MIN_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MinFragmentInterpolationOffset), extra_ARB_gpu_shader5" ], [ "MAX_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MaxFragmentInterpolationOffset), extra_ARB_gpu_shader5" ], [ "FRAGMENT_INTERPOLATION_OFFSET_BITS", "CONST(FRAGMENT_INTERPOLATION_OFFSET_BITS), extra_ARB_gpu_shader5" ], + +# GL_ARB_tessellation_shader + [ "PATCH_VERTICES", "CONTEXT_INT(TessCtrlProgram.patch_vertices), extra_ARB_tessellation_shader" ], + [ "PATCH_DEFAULT_OUTER_LEVEL", "CONTEXT_FLOAT4(TessCtrlProgram.patch_default_outer_level), extra_ARB_tessellation_shader" ], + [ "PATCH_DEFAULT_INNER_LEVEL", "CONTEXT_FLOAT2(TessCtrlProgram.patch_default_inner_level), extra_ARB_tessellation_shader" ], + [ "MAX_TESS_GEN_LEVEL", 
"CONTEXT_INT(Const.MaxTessGenLevel), extra_ARB_tessellation_shader" ], + [ "MAX_PATCH_VERTICES", "CONTEXT_INT(Const.MaxPatchVertices), extra_ARB_tessellation_shader" ], + [ "MAX_TESS_CONTROL_UNIFORM_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxUniformComponents), extra_ARB_tessellation_shader" ], + [ "MAX_TESS_EVALUATION_UNIFORM_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxUniformComponents), extra_ARB_tessellation_shader" ], + [ "MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits), extra_ARB_tessellation_shader" ], + [ "MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits), extra_ARB_tessellation_shader" ], + [ "MAX_TESS_CONTROL_OUTPUT_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxOutputComponents), extra_ARB_tessellation_shader" ], + [ "MAX_TESS_PATCH_COMPONENTS", "CONTEXT_INT(Const.MaxTessPatchComponents), extra_ARB_tessellation_shader" ], + [ "MAX_TESS_CONTROL_TOTAL_OUTPUT_COMPONENTS", "CONTEXT_INT(Const.MaxTessControlTotalOutputComponents), extra_ARB_tessellation_shader" ], + [ "MAX_TESS_EVALUATION_OUTPUT_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxOutputComponents), extra_ARB_tessellation_shader" ], + [ "MAX_TESS_CONTROL_INPUT_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxInputComponents), extra_ARB_tessellation_shader" ], + [ "MAX_TESS_EVALUATION_INPUT_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxInputComponents), extra_ARB_tessellation_shader" ], + [ "MAX_TESS_CONTROL_UNIFORM_BLOCKS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxUniformBlocks), extra_ARB_tessellation_shader" ], + [ "MAX_TESS_EVALUATION_UNIFORM_BLOCKS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxUniformBlocks), extra_ARB_tessellation_shader" ], + [ "MAX_COMBINED_TESS_CONTROL_UNIFORM_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxCombinedUniformComponents), extra_ARB_tessellation_shader" ], + [ "MAX_COMBINED_TESS_EVALUATION_UNIFORM_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxCombinedUniformComponents), extra_ARB_tessellation_shader" ], +# Dependencies on GL_ARB_tessellation_shader + [ "MAX_TESS_CONTROL_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxAtomicBuffers), extra_ARB_shader_atomic_counters_and_tessellation" ], + [ "MAX_TESS_CONTROL_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxAtomicCounters), extra_ARB_shader_atomic_counters_and_tessellation" ], + [ "MAX_TESS_EVALUATION_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxAtomicBuffers), extra_ARB_shader_atomic_counters_and_tessellation" ], + [ "MAX_TESS_EVALUATION_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxAtomicCounters), extra_ARB_shader_atomic_counters_and_tessellation" ], + [ "MAX_TESS_CONTROL_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxImageUniforms), extra_ARB_shader_image_load_store_and_tessellation"], + [ "MAX_TESS_EVALUATION_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxImageUniforms), extra_ARB_shader_image_load_store_and_tessellation"], + +# GL_ARB_shader_subroutine + [ "MAX_SUBROUTINES", "CONST(MAX_SUBROUTINES), extra_ARB_shader_subroutine" ], + [ "MAX_SUBROUTINE_UNIFORM_LOCATIONS", "CONST(MAX_SUBROUTINE_UNIFORM_LOCATIONS), extra_ARB_shader_subroutine" ], ]} ] diff --git 
a/src/mesa/main/getstring.c b/src/mesa/main/getstring.c index 72d99ca4e22..9873fdbf1a4 100644 --- a/src/mesa/main/getstring.c +++ b/src/mesa/main/getstring.c @@ -208,7 +208,7 @@ _mesa_GetPointerv( GLenum pname, GLvoid **params ) return; if (MESA_VERBOSE & VERBOSE_API) - _mesa_debug(ctx, "glGetPointerv %s\n", _mesa_lookup_enum_by_nr(pname)); + _mesa_debug(ctx, "glGetPointerv %s\n", _mesa_enum_to_string(pname)); switch (pname) { case GL_VERTEX_ARRAY_POINTER: @@ -299,7 +299,7 @@ _mesa_GetError( void ) ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, 0); if (MESA_VERBOSE & VERBOSE_API) - _mesa_debug(ctx, "glGetError <-- %s\n", _mesa_lookup_enum_by_nr(e)); + _mesa_debug(ctx, "glGetError <-- %s\n", _mesa_enum_to_string(e)); ctx->ErrorValue = (GLenum) GL_NO_ERROR; ctx->ErrorDebugCount = 0; diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c index ac69fabccaa..3eb66dab7f8 100644 --- a/src/mesa/main/glformats.c +++ b/src/mesa/main/glformats.c @@ -186,7 +186,7 @@ get_map_idx(GLenum value) return IDX_RG; default: _mesa_problem(NULL, "Unexpected inFormat %s", - _mesa_lookup_enum_by_nr(value)); + _mesa_enum_to_string(value)); return 0; } } @@ -216,8 +216,8 @@ _mesa_compute_component_mapping(GLenum inFormat, GLenum outFormat, GLubyte *map) #if 0 printf("from %x/%s to %x/%s map %d %d %d %d %d %d\n", - inFormat, _mesa_lookup_enum_by_nr(inFormat), - outFormat, _mesa_lookup_enum_by_nr(outFormat), + inFormat, _mesa_enum_to_string(inFormat), + outFormat, _mesa_enum_to_string(outFormat), map[0], map[1], map[2], @@ -1278,9 +1278,53 @@ _mesa_is_compressed_format(const struct gl_context *ctx, GLenum format) } } +/** + * Convert various unpack formats to the corresponding base format. + */ +GLenum +_mesa_unpack_format_to_base_format(GLenum format) +{ + switch(format) { + case GL_RED_INTEGER: + return GL_RED; + case GL_GREEN_INTEGER: + return GL_GREEN; + case GL_BLUE_INTEGER: + return GL_BLUE; + case GL_ALPHA_INTEGER: + return GL_ALPHA; + case GL_RG_INTEGER: + return GL_RG; + case GL_RGB_INTEGER: + return GL_RGB; + case GL_RGBA_INTEGER: + return GL_RGBA; + case GL_BGR_INTEGER: + return GL_BGR; + case GL_BGRA_INTEGER: + return GL_BGRA; + case GL_LUMINANCE_INTEGER_EXT: + return GL_LUMINANCE; + case GL_LUMINANCE_ALPHA_INTEGER_EXT: + return GL_LUMINANCE_ALPHA; + case GL_RED: + case GL_GREEN: + case GL_BLUE: + case GL_RG: + case GL_RGB: + case GL_RGBA: + case GL_BGR: + case GL_BGRA: + case GL_ALPHA: + case GL_LUMINANCE: + case GL_LUMINANCE_ALPHA: + default: + return format; + } +} /** - * Convert various base formats to the cooresponding integer format. + * Convert various base formats to the corresponding integer format. 
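/*
 * Editorial note: the glformats.c hunk above introduces
 * _mesa_unpack_format_to_base_format(), which strips the *_INTEGER variant
 * off a client unpack format.  A condensed sketch of how a caller might use
 * such a mapping when checking that an upload's channel layout matches an
 * image; it assumes the GL headers for the enum tokens, covers only a few
 * formats, and is not Mesa's actual validation path.
 */
#include <stdbool.h>
#include <GL/gl.h>
#include <GL/glext.h>

static GLenum unpack_to_base(GLenum format)
{
   switch (format) {
   case GL_RED_INTEGER:  return GL_RED;
   case GL_RG_INTEGER:   return GL_RG;
   case GL_RGB_INTEGER:  return GL_RGB;
   case GL_RGBA_INTEGER: return GL_RGBA;
   default:              return format;          /* already a base format */
   }
}

/* e.g. both GL_RGBA and GL_RGBA_INTEGER uploads feed an RGBA-shaped image */
static bool channels_match(GLenum unpack_format, GLenum base_internal_format)
{
   return unpack_to_base(unpack_format) == base_internal_format;
}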
*/ GLenum _mesa_base_format_to_integer_format(GLenum format) @@ -2605,8 +2649,6 @@ get_swizzle_from_gl_format(GLenum format, uint8_t *swizzle) uint32_t _mesa_format_from_format_and_type(GLenum format, GLenum type) { - mesa_array_format array_format; - bool is_array_format = true; uint8_t swizzle[4]; bool normalized = false, is_float = false, is_signed = false; @@ -2662,15 +2704,9 @@ _mesa_format_from_format_and_type(GLenum format, GLenum type) normalized = !_mesa_is_enum_format_integer(format); num_channels = _mesa_components_in_format(format); - array_format = - MESA_ARRAY_FORMAT(type_size, is_signed, is_float, - normalized, num_channels, - swizzle[0], swizzle[1], swizzle[2], swizzle[3]); - - if (!_mesa_little_endian()) - array_format = _mesa_array_format_flip_channels(array_format); - - return array_format; + return MESA_ARRAY_FORMAT(type_size, is_signed, is_float, + normalized, num_channels, + swizzle[0], swizzle[1], swizzle[2], swizzle[3]); } /* Otherwise this is not an array format, so return the mesa_format diff --git a/src/mesa/main/glformats.h b/src/mesa/main/glformats.h index 8881cb7d86b..419955a6033 100644 --- a/src/mesa/main/glformats.h +++ b/src/mesa/main/glformats.h @@ -101,6 +101,9 @@ _mesa_is_compressed_format(const struct gl_context *ctx, GLenum format); extern GLenum _mesa_base_format_to_integer_format(GLenum format); +extern GLenum +_mesa_unpack_format_to_base_format(GLenum format); + extern GLboolean _mesa_base_format_has_channel(GLenum base_format, GLenum pname); diff --git a/src/mesa/main/hint.c b/src/mesa/main/hint.c index 3e056ebaf13..984239a7276 100644 --- a/src/mesa/main/hint.c +++ b/src/mesa/main/hint.c @@ -40,8 +40,8 @@ _mesa_Hint( GLenum target, GLenum mode ) if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glHint %s %s\n", - _mesa_lookup_enum_by_nr(target), - _mesa_lookup_enum_by_nr(mode)); + _mesa_enum_to_string(target), + _mesa_enum_to_string(mode)); if (mode != GL_NICEST && mode != GL_FASTEST && mode != GL_DONT_CARE) { _mesa_error(ctx, GL_INVALID_ENUM, "glHint(mode)"); diff --git a/src/mesa/main/imports.c b/src/mesa/main/imports.c index 68c7316575c..350e6752c8b 100644 --- a/src/mesa/main/imports.c +++ b/src/mesa/main/imports.c @@ -369,7 +369,7 @@ _mesa_float_to_half(float val) * or normal. */ e = 0; - m = (int) _mesa_roundevenf((1 << 24) * fabsf(fi.f)); + m = _mesa_lroundevenf((1 << 24) * fabsf(fi.f)); } else if (new_exp > 15) { /* map this value to infinity */ @@ -383,7 +383,7 @@ _mesa_float_to_half(float val) * either normal or infinite. */ e = new_exp + 15; - m = (int) _mesa_roundevenf(flt_m / (float) (1 << 13)); + m = _mesa_lroundevenf(flt_m / (float) (1 << 13)); } } diff --git a/src/mesa/main/imports.h b/src/mesa/main/imports.h index 9ffe3decd0f..d61279ac4e5 100644 --- a/src/mesa/main/imports.h +++ b/src/mesa/main/imports.h @@ -170,34 +170,6 @@ static inline int IROUND_POS(float f) return (int) (f + 0.5F); } -#ifdef __x86_64__ -# include <xmmintrin.h> -#endif - -/** - * Convert float to int using a fast method. The rounding mode may vary. 
- */ -static inline int F_TO_I(float f) -{ -#if defined(USE_X86_ASM) && defined(__GNUC__) && defined(__i386__) - int r; - __asm__ ("fistpl %0" : "=m" (r) : "t" (f) : "st"); - return r; -#elif defined(USE_X86_ASM) && defined(_MSC_VER) - int r; - _asm { - fld f - fistp r - } - return r; -#elif defined(__x86_64__) - return _mm_cvt_ss2si(_mm_load_ss(&f)); -#else - return IROUND(f); -#endif -} - - /** Return (as an integer) floor of float */ static inline int IFLOOR(float f) { diff --git a/src/mesa/main/light.c b/src/mesa/main/light.c index 4021dbef922..14b4b04162b 100644 --- a/src/mesa/main/light.c +++ b/src/mesa/main/light.c @@ -42,16 +42,16 @@ _mesa_ShadeModel( GLenum mode ) GET_CURRENT_CONTEXT(ctx); if (MESA_VERBOSE & VERBOSE_API) - _mesa_debug(ctx, "glShadeModel %s\n", _mesa_lookup_enum_by_nr(mode)); + _mesa_debug(ctx, "glShadeModel %s\n", _mesa_enum_to_string(mode)); + + if (ctx->Light.ShadeModel == mode) + return; if (mode != GL_FLAT && mode != GL_SMOOTH) { _mesa_error(ctx, GL_INVALID_ENUM, "glShadeModel"); return; } - if (ctx->Light.ShadeModel == mode) - return; - FLUSH_VERTICES(ctx, _NEW_LIGHT); ctx->Light.ShadeModel = mode; @@ -143,7 +143,7 @@ _mesa_light(struct gl_context *ctx, GLuint lnum, GLenum pname, const GLfloat *pa COPY_3V(light->SpotDirection, params); break; case GL_SPOT_EXPONENT: - assert(params[0] >= 0.0); + assert(params[0] >= 0.0F); assert(params[0] <= ctx->Const.MaxSpotExponent); if (light->SpotExponent == params[0]) return; @@ -151,12 +151,12 @@ _mesa_light(struct gl_context *ctx, GLuint lnum, GLenum pname, const GLfloat *pa light->SpotExponent = params[0]; break; case GL_SPOT_CUTOFF: - assert(params[0] == 180.0 || (params[0] >= 0.0 && params[0] <= 90.0)); + assert(params[0] == 180.0F || (params[0] >= 0.0F && params[0] <= 90.0F)); if (light->SpotCutoff == params[0]) return; FLUSH_VERTICES(ctx, _NEW_LIGHT); light->SpotCutoff = params[0]; - light->_CosCutoff = (GLfloat) (cos(light->SpotCutoff * M_PI / 180.0)); + light->_CosCutoff = (cosf(light->SpotCutoff * M_PI / 180.0)); if (light->_CosCutoff < 0) light->_CosCutoff = 0; if (light->SpotCutoff != 180.0F) @@ -165,21 +165,21 @@ _mesa_light(struct gl_context *ctx, GLuint lnum, GLenum pname, const GLfloat *pa light->_Flags &= ~LIGHT_SPOT; break; case GL_CONSTANT_ATTENUATION: - assert(params[0] >= 0.0); + assert(params[0] >= 0.0F); if (light->ConstantAttenuation == params[0]) return; FLUSH_VERTICES(ctx, _NEW_LIGHT); light->ConstantAttenuation = params[0]; break; case GL_LINEAR_ATTENUATION: - assert(params[0] >= 0.0); + assert(params[0] >= 0.0F); if (light->LinearAttenuation == params[0]) return; FLUSH_VERTICES(ctx, _NEW_LIGHT); light->LinearAttenuation = params[0]; break; case GL_QUADRATIC_ATTENUATION: - assert(params[0] >= 0.0); + assert(params[0] >= 0.0F); if (light->QuadraticAttenuation == params[0]) return; FLUSH_VERTICES(ctx, _NEW_LIGHT); @@ -238,31 +238,31 @@ _mesa_Lightfv( GLenum light, GLenum pname, const GLfloat *params ) params = temp; break; case GL_SPOT_EXPONENT: - if (params[0] < 0.0 || params[0] > ctx->Const.MaxSpotExponent) { + if (params[0] < 0.0F || params[0] > ctx->Const.MaxSpotExponent) { _mesa_error(ctx, GL_INVALID_VALUE, "glLight"); return; } break; case GL_SPOT_CUTOFF: - if ((params[0] < 0.0 || params[0] > 90.0) && params[0] != 180.0) { + if ((params[0] < 0.0F || params[0] > 90.0F) && params[0] != 180.0F) { _mesa_error(ctx, GL_INVALID_VALUE, "glLight"); return; } break; case GL_CONSTANT_ATTENUATION: - if (params[0] < 0.0) { + if (params[0] < 0.0F) { _mesa_error(ctx, GL_INVALID_VALUE, "glLight"); return; 
} break; case GL_LINEAR_ATTENUATION: - if (params[0] < 0.0) { + if (params[0] < 0.0F) { _mesa_error(ctx, GL_INVALID_VALUE, "glLight"); return; } break; case GL_QUADRATIC_ATTENUATION: - if (params[0] < 0.0) { + if (params[0] < 0.0F) { _mesa_error(ctx, GL_INVALID_VALUE, "glLight"); return; } @@ -463,14 +463,14 @@ _mesa_LightModelfv( GLenum pname, const GLfloat *params ) case GL_LIGHT_MODEL_LOCAL_VIEWER: if (ctx->API != API_OPENGL_COMPAT) goto invalid_pname; - newbool = (params[0]!=0.0); + newbool = (params[0] != 0.0F); if (ctx->Light.Model.LocalViewer == newbool) return; FLUSH_VERTICES(ctx, _NEW_LIGHT); ctx->Light.Model.LocalViewer = newbool; break; case GL_LIGHT_MODEL_TWO_SIDE: - newbool = (params[0]!=0.0); + newbool = (params[0] != 0.0F); if (ctx->Light.Model.TwoSide == newbool) return; FLUSH_VERTICES(ctx, _NEW_LIGHT); @@ -723,8 +723,8 @@ _mesa_ColorMaterial( GLenum face, GLenum mode ) if (MESA_VERBOSE&VERBOSE_API) _mesa_debug(ctx, "glColorMaterial %s %s\n", - _mesa_lookup_enum_by_nr(face), - _mesa_lookup_enum_by_nr(mode)); + _mesa_enum_to_string(face), + _mesa_enum_to_string(mode)); bitmask = _mesa_material_bitmask(ctx, face, mode, legal, "glColorMaterial"); if (bitmask == 0) @@ -975,7 +975,7 @@ compute_light_positions( struct gl_context *ctx ) } else { /* positional light w/ homogeneous coordinate, divide by W */ - GLfloat wInv = (GLfloat)1.0 / light->_Position[3]; + GLfloat wInv = 1.0F / light->_Position[3]; light->_Position[0] *= wInv; light->_Position[1] *= wInv; light->_Position[2] *= wInv; @@ -1024,7 +1024,7 @@ update_modelview_scale( struct gl_context *ctx ) if (!_math_matrix_is_length_preserving(ctx->ModelviewMatrixStack.Top)) { const GLfloat *m = ctx->ModelviewMatrixStack.Top->inv; GLfloat f = m[2] * m[2] + m[6] * m[6] + m[10] * m[10]; - if (f < 1e-12) f = 1.0; + if (f < 1e-12f) f = 1.0f; if (ctx->_NeedEyeCoords) ctx->_ModelViewInvScale = 1.0f / sqrtf(f); else diff --git a/src/mesa/main/lines.c b/src/mesa/main/lines.c index 3c08ed2e713..c020fb3eb9e 100644 --- a/src/mesa/main/lines.c +++ b/src/mesa/main/lines.c @@ -45,7 +45,7 @@ _mesa_LineWidth( GLfloat width ) if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glLineWidth %f\n", width); - if (width<=0.0) { + if (width <= 0.0F) { _mesa_error( ctx, GL_INVALID_VALUE, "glLineWidth" ); return; } @@ -63,7 +63,7 @@ _mesa_LineWidth( GLfloat width ) if (ctx->API == API_OPENGL_CORE && ((ctx->Const.ContextFlags & GL_CONTEXT_FLAG_FORWARD_COMPATIBLE_BIT) != 0) - && width > 1.0) { + && width > 1.0F) { _mesa_error( ctx, GL_INVALID_VALUE, "glLineWidth" ); return; } diff --git a/src/mesa/main/macros.h b/src/mesa/main/macros.h index 0608650aeb4..54df50c9cfe 100644 --- a/src/mesa/main/macros.h +++ b/src/mesa/main/macros.h @@ -33,6 +33,7 @@ #include "util/macros.h" #include "util/u_math.h" +#include "util/rounding.h" #include "imports.h" @@ -131,12 +132,12 @@ extern GLfloat _mesa_ubyte_to_float_color_tab[256]; #define INT_TO_USHORT(i) ((i) < 0 ? 0 : ((GLushort) ((i) >> 15))) #define UINT_TO_USHORT(i) ((i) < 0 ? 
0 : ((GLushort) ((i) >> 16))) #define UNCLAMPED_FLOAT_TO_USHORT(us, f) \ - us = ( (GLushort) F_TO_I( CLAMP((f), 0.0F, 1.0F) * 65535.0F) ) + us = ( (GLushort) _mesa_lroundevenf( CLAMP((f), 0.0F, 1.0F) * 65535.0F) ) #define CLAMPED_FLOAT_TO_USHORT(us, f) \ - us = ( (GLushort) F_TO_I( (f) * 65535.0F) ) + us = ( (GLushort) _mesa_lroundevenf( (f) * 65535.0F) ) #define UNCLAMPED_FLOAT_TO_SHORT(s, f) \ - s = ( (GLshort) F_TO_I( CLAMP((f), -1.0F, 1.0F) * 32767.0F) ) + s = ( (GLshort) _mesa_lroundevenf( CLAMP((f), -1.0F, 1.0F) * 32767.0F) ) /*** *** UNCLAMPED_FLOAT_TO_UBYTE: clamp float to [0,1] and map to ubyte in [0,255] @@ -167,9 +168,9 @@ extern GLfloat _mesa_ubyte_to_float_color_tab[256]; } while (0) #else #define UNCLAMPED_FLOAT_TO_UBYTE(ub, f) \ - ub = ((GLubyte) F_TO_I(CLAMP((f), 0.0F, 1.0F) * 255.0F)) + ub = ((GLubyte) _mesa_lroundevenf(CLAMP((f), 0.0F, 1.0F) * 255.0F)) #define CLAMPED_FLOAT_TO_UBYTE(ub, f) \ - ub = ((GLubyte) F_TO_I((f) * 255.0F)) + ub = ((GLubyte) _mesa_lroundevenf((f) * 255.0F)) #endif static fi_type UINT_AS_UNION(GLuint u) @@ -679,17 +680,6 @@ minify(unsigned value, unsigned levels) } /** - * Return true if the given value is a power of two. - * - * Note that this considers 0 a power of two. - */ -static inline bool -is_power_of_two(unsigned value) -{ - return (value & (value - 1)) == 0; -} - -/** * Align a value up to an alignment value * * If \c value is not already aligned to the requested alignment value, it diff --git a/src/mesa/main/matrix.c b/src/mesa/main/matrix.c index 80c8a248ce4..2b8016a4a72 100644 --- a/src/mesa/main/matrix.c +++ b/src/mesa/main/matrix.c @@ -229,7 +229,7 @@ _mesa_PushMatrix( void ) if (MESA_VERBOSE&VERBOSE_API) _mesa_debug(ctx, "glPushMatrix %s\n", - _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode)); + _mesa_enum_to_string(ctx->Transform.MatrixMode)); if (stack->Depth + 1 >= stack->MaxDepth) { if (ctx->Transform.MatrixMode == GL_TEXTURE) { @@ -239,7 +239,7 @@ _mesa_PushMatrix( void ) } else { _mesa_error(ctx, GL_STACK_OVERFLOW, "glPushMatrix(mode=%s)", - _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode)); + _mesa_enum_to_string(ctx->Transform.MatrixMode)); } return; } @@ -270,7 +270,7 @@ _mesa_PopMatrix( void ) if (MESA_VERBOSE&VERBOSE_API) _mesa_debug(ctx, "glPopMatrix %s\n", - _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode)); + _mesa_enum_to_string(ctx->Transform.MatrixMode)); if (stack->Depth == 0) { if (ctx->Transform.MatrixMode == GL_TEXTURE) { @@ -280,7 +280,7 @@ _mesa_PopMatrix( void ) } else { _mesa_error(ctx, GL_STACK_UNDERFLOW, "glPopMatrix(mode=%s)", - _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode)); + _mesa_enum_to_string(ctx->Transform.MatrixMode)); } return; } diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c index 7732d09b2ec..1e22f930092 100644 --- a/src/mesa/main/mipmap.c +++ b/src/mesa/main/mipmap.c @@ -2077,9 +2077,12 @@ generate_mipmap_compressed(struct gl_context *ctx, GLenum target, /* Get the uncompressed image */ assert(srcImage->Level == texObj->BaseLevel); - ctx->Driver.GetTexImage(ctx, - temp_base_format, temp_datatype, - temp_src, srcImage); + ctx->Driver.GetTexSubImage(ctx, + 0, 0, 0, + srcImage->Width, srcImage->Height, + srcImage->Depth, + temp_base_format, temp_datatype, + temp_src, srcImage); /* restore packing mode */ ctx->Pack = save; } diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 2d285b87a78..83f3717754d 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -90,7 +90,7 @@ struct vbo_context; /** Extra draw modes beyond GL_POINTS, GL_TRIANGLE_FAN, 
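/*
 * Editorial note: the macros.h hunk above retires F_TO_I(), whose rounding
 * depended on which x87/SSE path was compiled in, and moves the float-to-norm
 * macros onto _mesa_lroundevenf(), i.e. round-half-to-even.  Under the default
 * FE_TONEAREST mode the standard lrintf() rounds the same way, which this
 * sketch uses to show what UNCLAMPED_FLOAT_TO_UBYTE now computes.
 */
#include <math.h>
#include <stdio.h>

static unsigned char float_to_ubyte(float f)
{
   if (f < 0.0f) f = 0.0f;
   if (f > 1.0f) f = 1.0f;
   return (unsigned char) lrintf(f * 255.0f);    /* ties round to even */
}

int main(void)
{
   /* 0.5f * 255 = 127.5: ties-to-even picks 128 (even), whereas plain
    * truncation would have produced 127 */
   printf("%u %u %u\n",
          float_to_ubyte(0.5f), float_to_ubyte(1.0f), float_to_ubyte(-0.25f));
   return 0;
}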
etc */ -#define PRIM_MAX GL_TRIANGLE_STRIP_ADJACENCY +#define PRIM_MAX GL_PATCHES #define PRIM_OUTSIDE_BEGIN_END (PRIM_MAX + 1) #define PRIM_UNKNOWN (PRIM_MAX + 2) @@ -109,6 +109,8 @@ _mesa_varying_slot_in_fs(gl_varying_slot slot) case VARYING_SLOT_EDGE: case VARYING_SLOT_CLIP_VERTEX: case VARYING_SLOT_LAYER: + case VARYING_SLOT_TESS_LEVEL_OUTER: + case VARYING_SLOT_TESS_LEVEL_INNER: return GL_FALSE; default: return GL_TRUE; @@ -1254,6 +1256,7 @@ typedef enum { USAGE_UNIFORM_BUFFER = 0x1, USAGE_TEXTURE_BUFFER = 0x2, USAGE_ATOMIC_COUNTER_BUFFER = 0x4, + USAGE_SHADER_STORAGE_BUFFER = 0x8, } gl_buffer_usage; @@ -1654,6 +1657,11 @@ struct gl_transform_feedback_info * multiple transform feedback outputs in the same buffer. */ unsigned BufferStride[MAX_FEEDBACK_BUFFERS]; + + /** + * Which transform feedback stream this buffer binding is associated with. + */ + unsigned BufferStream[MAX_FEEDBACK_BUFFERS]; }; @@ -1891,6 +1899,8 @@ struct gl_program GLbitfield64 InputsRead; /**< Bitmask of which input regs are read */ GLbitfield64 DoubleInputsRead; /**< Bitmask of which input regs are read and are doubles */ GLbitfield64 OutputsWritten; /**< Bitmask of which output regs are written */ + GLbitfield PatchInputsRead; /**< VAR[0..31] usage for patch inputs (user-defined only) */ + GLbitfield PatchOutputsWritten; /**< VAR[0..31] usage for patch outputs (user-defined only) */ GLbitfield SystemValuesRead; /**< Bitmask of SYSTEM_VALUE_x inputs used */ GLbitfield TexturesUsed[MAX_COMBINED_TEXTURE_IMAGE_UNITS]; /**< TEXTURE_x_BIT bitmask */ GLbitfield SamplersUsed; /**< Bitfield of which samplers are used */ @@ -1958,6 +1968,29 @@ struct gl_vertex_program }; +/** Tessellation control program object */ +struct gl_tess_ctrl_program +{ + struct gl_program Base; /**< base class */ + + /* output layout */ + GLint VerticesOut; +}; + + +/** Tessellation evaluation program object */ +struct gl_tess_eval_program +{ + struct gl_program Base; /**< base class */ + + /* input layout */ + GLenum PrimitiveMode; /* GL_TRIANGLES, GL_QUADS or GL_ISOLINES */ + GLenum Spacing; /* GL_EQUAL, GL_FRACTIONAL_EVEN, GL_FRACTIONAL_ODD */ + GLenum VertexOrder; /* GL_CW or GL_CCW */ + bool PointMode; +}; + + /** Geometry program object */ struct gl_geometry_program { @@ -2060,6 +2093,27 @@ struct gl_vertex_program_state GLboolean _Overriden; }; +/** + * Context state for tessellation control programs. + */ +struct gl_tess_ctrl_program_state +{ + /** Currently bound and valid shader. */ + struct gl_tess_ctrl_program *_Current; + + GLint patch_vertices; + GLfloat patch_default_outer_level[4]; + GLfloat patch_default_inner_level[2]; +}; + +/** + * Context state for tessellation evaluation programs. + */ +struct gl_tess_eval_program_state +{ + /** Currently bound and valid shader. */ + struct gl_tess_eval_program *_Current; +}; /** * Context state for geometry programs. @@ -2154,13 +2208,23 @@ struct gl_ati_fragment_shader_state struct ati_fragment_shader *Current; }; +/** + * Shader subroutine function definition + */ +struct gl_subroutine_function +{ + char *name; + int num_compat_types; + const struct glsl_type **types; +}; /** * A GLSL vertex or fragment shader object. */ struct gl_shader { - /** GL_FRAGMENT_SHADER || GL_VERTEX_SHADER || GL_GEOMETRY_SHADER_ARB. + /** GL_FRAGMENT_SHADER || GL_VERTEX_SHADER || GL_GEOMETRY_SHADER_ARB || + * GL_TESS_CONTROL_SHADER || GL_TESS_EVALUATION_SHADER. * Must be the first field. 
*/ GLenum Type; @@ -2240,6 +2304,41 @@ struct gl_shader bool pixel_center_integer; /** + * Tessellation Control shader state from layout qualifiers. + */ + struct { + /** + * 0 - vertices not declared in shader, or + * 1 .. GL_MAX_PATCH_VERTICES + */ + GLint VerticesOut; + } TessCtrl; + + /** + * Tessellation Evaluation shader state from layout qualifiers. + */ + struct { + /** + * GL_TRIANGLES, GL_QUADS, GL_ISOLINES or PRIM_UNKNOWN if it's not set + * in this shader. + */ + GLenum PrimitiveMode; + /** + * GL_EQUAL, GL_FRACTIONAL_ODD, GL_FRACTIONAL_EVEN, or 0 if it's not set + * in this shader. + */ + GLenum Spacing; + /** + * GL_CW, GL_CCW, or 0 if it's not set in this shader. + */ + GLenum VertexOrder; + /** + * 1, 0, or -1 if it's not set in this shader. + */ + int PointMode; + } TessEval; + + /** * Geometry shader state from GLSL 1.50 layout qualifiers. */ struct { @@ -2304,6 +2403,25 @@ struct gl_shader */ unsigned LocalSize[3]; } Comp; + + /** + * Number of types for subroutine uniforms. + */ + GLuint NumSubroutineUniformTypes; + + /** + * Subroutine uniform remap table + * based on the program level uniform remap table. + */ + GLuint NumSubroutineUniformRemapTable; + struct gl_uniform_storage **SubroutineUniformRemapTable; + + /** + * Num of subroutine functions for this stage + * and storage for them. + */ + GLuint NumSubroutineFunctions; + struct gl_subroutine_function *SubroutineFunctions; }; @@ -2365,6 +2483,11 @@ struct gl_uniform_block GLuint UniformBufferSize; /** + * Is this actually an interface block for a shader storage buffer? + */ + bool IsShaderStorage; + + /** * Layout specified in the shader * * This isn't accessible through the API, but it is used while @@ -2468,6 +2591,37 @@ struct gl_shader_program enum gl_frag_depth_layout FragDepthLayout; /** + * Tessellation Control shader state from layout qualifiers. + */ + struct { + /** + * 0 - vertices not declared in shader, or + * 1 .. GL_MAX_PATCH_VERTICES + */ + GLint VerticesOut; + } TessCtrl; + + /** + * Tessellation Evaluation shader state from layout qualifiers. + */ + struct { + /** GL_TRIANGLES, GL_QUADS or GL_ISOLINES */ + GLenum PrimitiveMode; + /** GL_EQUAL, GL_FRACTIONAL_ODD or GL_FRACTIONAL_EVEN */ + GLenum Spacing; + /** GL_CW or GL_CCW */ + GLenum VertexOrder; + bool PointMode; + /** + * True if gl_ClipDistance is written to. Copied into + * gl_tess_eval_program by _mesa_copy_linked_program_data(). + */ + GLboolean UsesClipDistance; + GLuint ClipDistanceArraySize; /**< Size of the gl_ClipDistance array, or + 0 if not present. */ + } TessEval; + + /** * Geometry shader state - copied into gl_geometry_program by * _mesa_copy_linked_program_data(). 
*/ @@ -2681,6 +2835,7 @@ struct gl_shader_compiler_options GLboolean EmitNoIndirectOutput; /**< No indirect addressing of outputs */ GLboolean EmitNoIndirectTemp; /**< No indirect addressing of temps */ GLboolean EmitNoIndirectUniform; /**< No indirect addressing of constants */ + GLboolean EmitNoIndirectSampler; /**< No indirect addressing of samplers */ /*@}*/ GLuint MaxIfDepth; /**< Maximum nested IF blocks */ @@ -3100,6 +3255,9 @@ struct gl_program_constants /* GL_ARB_shader_image_load_store */ GLuint MaxImageUniforms; + + /* GL_ARB_shader_storage_buffer_object */ + GLuint MaxShaderStorageBlocks; }; @@ -3197,6 +3355,15 @@ struct gl_constants GLuint UniformBufferOffsetAlignment; /** @} */ + /** @{ + * GL_ARB_shader_storage_buffer_object + */ + GLuint MaxCombinedShaderStorageBlocks; + GLuint MaxShaderStorageBufferBindings; + GLuint MaxShaderStorageBlockSize; + GLuint ShaderStorageBufferOffsetAlignment; + /** @} */ + /** * GL_ARB_explicit_uniform_location */ @@ -3423,6 +3590,13 @@ struct gl_constants GLenum ContextReleaseBehavior; struct gl_shader_compiler_options ShaderCompilerOptions[MESA_SHADER_STAGES]; + + /** GL_ARB_tessellation_shader */ + GLuint MaxPatchVertices; + GLuint MaxTessGenLevel; + GLuint MaxTessPatchComponents; + GLuint MaxTessControlTotalOutputComponents; + bool LowerTessLevel; /**< Lower gl_TessLevel* from float[n] to vecn? */ }; @@ -3484,6 +3658,8 @@ struct gl_extensions GLboolean ARB_shader_image_load_store; GLboolean ARB_shader_precision; GLboolean ARB_shader_stencil_export; + GLboolean ARB_shader_storage_buffer_object; + GLboolean ARB_shader_subroutine; GLboolean ARB_shader_texture_lod; GLboolean ARB_shading_language_packing; GLboolean ARB_shading_language_420pack; @@ -3815,6 +3991,12 @@ struct gl_driver_flags */ uint64_t NewUniformBuffer; + /** + * gl_context::ShaderStorageBufferBindings + * gl_shader_program::ShaderStorageBlocks + */ + uint64_t NewShaderStorageBuffer; + uint64_t NewTextureBuffer; /** @@ -3826,6 +4008,11 @@ struct gl_driver_flags * gl_context::ImageUnits */ uint64_t NewImageUnits; + + /** + * gl_context::TessCtrlProgram::patch_default_* + */ + uint64_t NewDefaultTessLevels; }; struct gl_uniform_buffer_binding @@ -3842,6 +4029,20 @@ struct gl_uniform_buffer_binding GLboolean AutomaticSize; }; +struct gl_shader_storage_buffer_binding +{ + struct gl_buffer_object *BufferObject; + /** Start of shader storage block data in the buffer */ + GLintptr Offset; + /** Size of data allowed to be referenced from the buffer (in bytes) */ + GLsizeiptr Size; + /** + * glBindBufferBase() indicates that the Size should be ignored and only + * limited by the current size of the BufferObject. + */ + GLboolean AutomaticSize; +}; + /** * ARB_shader_image_load_store image unit. */ @@ -4047,6 +4248,8 @@ struct gl_context struct gl_fragment_program_state FragmentProgram; struct gl_geometry_program_state GeometryProgram; struct gl_compute_program_state ComputeProgram; + struct gl_tess_ctrl_program_state TessCtrlProgram; + struct gl_tess_eval_program_state TessEvalProgram; struct gl_ati_fragment_shader_state ATIFragmentShader; struct gl_pipeline_shader_state Pipeline; /**< GLSL pipeline shader object state */ @@ -4089,6 +4292,12 @@ struct gl_context struct gl_buffer_object *UniformBuffer; /** + * Current GL_ARB_shader_storage_buffer_object binding referenced by + * GL_SHADER_STORAGE_BUFFER target for glBufferData, glMapBuffer, etc. + */ + struct gl_buffer_object *ShaderStorageBuffer; + + /** * Array of uniform buffers for GL_ARB_uniform_buffer_object and GL 3.1. 
* This is set up using glBindBufferRange() or glBindBufferBase(). They are * associated with uniform blocks by glUniformBlockBinding()'s state in the @@ -4098,6 +4307,15 @@ struct gl_context UniformBufferBindings[MAX_COMBINED_UNIFORM_BUFFERS]; /** + * Array of shader storage buffers for ARB_shader_storage_buffer_object + * and GL 4.3. This is set up using glBindBufferRange() or + * glBindBufferBase(). They are associated with shader storage blocks by + * glShaderStorageBlockBinding()'s state in the shader program. + */ + struct gl_shader_storage_buffer_binding + ShaderStorageBufferBindings[MAX_COMBINED_SHADER_STORAGE_BUFFERS]; + + /** * Object currently associated with the GL_ATOMIC_COUNTER_BUFFER * target. */ diff --git a/src/mesa/main/multisample.c b/src/mesa/main/multisample.c index 816837b95bd..09e6154f7ec 100644 --- a/src/mesa/main/multisample.c +++ b/src/mesa/main/multisample.c @@ -43,7 +43,7 @@ _mesa_SampleCoverage(GLclampf value, GLboolean invert) FLUSH_VERTICES(ctx, 0); - ctx->Multisample.SampleCoverageValue = (GLfloat) CLAMP(value, 0.0, 1.0); + ctx->Multisample.SampleCoverageValue = CLAMP(value, 0.0f, 1.0f); ctx->Multisample.SampleCoverageInvert = invert; ctx->NewState |= _NEW_MULTISAMPLE; } @@ -134,7 +134,7 @@ _mesa_MinSampleShading(GLclampf value) FLUSH_VERTICES(ctx, 0); - ctx->Multisample.MinSampleShadingValue = CLAMP(value, 0.0, 1.0); + ctx->Multisample.MinSampleShadingValue = CLAMP(value, 0.0f, 1.0f); ctx->NewState |= _NEW_MULTISAMPLE; } @@ -164,8 +164,11 @@ _mesa_check_sample_count(struct gl_context *ctx, GLenum target, * * "If internalformat is a signed or unsigned integer format and samples * is greater than zero, then the error INVALID_OPERATION is generated." + * + * This restriction is relaxed for OpenGL ES 3.1. */ - if (_mesa_is_gles3(ctx) && _mesa_is_enum_format_integer(internalFormat) + if ((ctx->API == API_OPENGLES2 && ctx->Version == 30) && + _mesa_is_enum_format_integer(internalFormat) && samples > 0) { return GL_INVALID_OPERATION; } diff --git a/src/mesa/main/objectlabel.c b/src/mesa/main/objectlabel.c index 5626054687b..1019f893ba8 100644 --- a/src/mesa/main/objectlabel.c +++ b/src/mesa/main/objectlabel.c @@ -234,7 +234,7 @@ get_label_pointer(struct gl_context *ctx, GLenum identifier, GLuint name, invalid_enum: _mesa_error(ctx, GL_INVALID_ENUM, "%s(identifier = %s)", - caller, _mesa_lookup_enum_by_nr(identifier)); + caller, _mesa_enum_to_string(identifier)); return NULL; } diff --git a/src/mesa/main/pack.c b/src/mesa/main/pack.c index f72360817e9..7147fd6e4fe 100644 --- a/src/mesa/main/pack.c +++ b/src/mesa/main/pack.c @@ -470,7 +470,7 @@ extract_uint_indexes(GLuint n, GLuint indexes[], static inline GLuint clamp_float_to_uint(GLfloat f) { - return f < 0.0F ? 0 : F_TO_I(f); + return f < 0.0F ? 0 : _mesa_lroundevenf(f); } @@ -478,7 +478,7 @@ static inline GLuint clamp_half_to_uint(GLhalfARB h) { GLfloat f = _mesa_half_to_float(h); - return f < 0.0F ? 0 : F_TO_I(f); + return f < 0.0F ? 0 : _mesa_lroundevenf(f); } @@ -796,7 +796,7 @@ _mesa_unpack_depth_span( struct gl_context *ctx, GLuint n, * back to an int type can introduce errors that will show up as * artifacts in things like depth peeling which uses glCopyTexImage. 
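As a hedged sketch of the client calls that populate the ShaderStorageBuffer binding point and the ShaderStorageBufferBindings array described above (the matching GLSL block is assumed to declare layout(binding = 0), so no further API-side association is needed):

#include <epoxy/gl.h>   /* assumed loader header */

static GLuint
make_storage_buffer(void)
{
   GLuint ssbo;

   glGenBuffers(1, &ssbo);
   glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);   /* sets ctx->ShaderStorageBuffer */
   glBufferData(GL_SHADER_STORAGE_BUFFER, 4096, NULL, GL_DYNAMIC_COPY);

   /* Fills ShaderStorageBufferBindings[0]; AutomaticSize is set because the
    * whole buffer is bound. A sub-range could be bound instead with
    * glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ssbo, 0, 1024). */
   glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssbo);

   return ssbo;
}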
*/ - if (ctx->Pixel.DepthScale == 1.0 && ctx->Pixel.DepthBias == 0.0) { + if (ctx->Pixel.DepthScale == 1.0F && ctx->Pixel.DepthBias == 0.0F) { if (srcType == GL_UNSIGNED_INT && dstType == GL_UNSIGNED_SHORT) { const GLuint *src = (const GLuint *) source; GLushort *dst = (GLushort *) dest; @@ -874,8 +874,8 @@ _mesa_unpack_depth_span( struct gl_context *ctx, GLuint n, case GL_UNSIGNED_INT_24_8_EXT: /* GL_EXT_packed_depth_stencil */ if (dstType == GL_UNSIGNED_INT_24_8_EXT && depthMax == 0xffffff && - ctx->Pixel.DepthScale == 1.0 && - ctx->Pixel.DepthBias == 0.0) { + ctx->Pixel.DepthScale == 1.0F && + ctx->Pixel.DepthBias == 0.0F) { const GLuint *src = (const GLuint *) source; GLuint *zValues = (GLuint *) dest; GLuint i; @@ -945,7 +945,7 @@ _mesa_unpack_depth_span( struct gl_context *ctx, GLuint n, { const GLfloat scale = ctx->Pixel.DepthScale; const GLfloat bias = ctx->Pixel.DepthBias; - if (scale != 1.0 || bias != 0.0) { + if (scale != 1.0F || bias != 0.0F) { GLuint i; for (i = 0; i < n; i++) { depthValues[i] = depthValues[i] * scale + bias; @@ -958,7 +958,7 @@ _mesa_unpack_depth_span( struct gl_context *ctx, GLuint n, if (needClamp) { GLuint i; for (i = 0; i < n; i++) { - depthValues[i] = (GLfloat)CLAMP(depthValues[i], 0.0, 1.0); + depthValues[i] = CLAMP(depthValues[i], 0.0F, 1.0F); } } @@ -1025,7 +1025,7 @@ _mesa_pack_depth_span( struct gl_context *ctx, GLuint n, GLvoid *dest, return; } - if (ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0) { + if (ctx->Pixel.DepthScale != 1.0F || ctx->Pixel.DepthBias != 0.0F) { memcpy(depthCopy, depthSpan, n * sizeof(GLfloat)); _mesa_scale_and_bias_depth(ctx, n, depthCopy); depthSpan = depthCopy; @@ -1153,7 +1153,7 @@ _mesa_pack_depth_stencil_span(struct gl_context *ctx,GLuint n, return; } - if (ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0) { + if (ctx->Pixel.DepthScale != 1.0F || ctx->Pixel.DepthBias != 0.0F) { memcpy(depthCopy, depthVals, n * sizeof(GLfloat)); _mesa_scale_and_bias_depth(ctx, n, depthCopy); depthVals = depthCopy; diff --git a/src/mesa/main/pipelineobj.c b/src/mesa/main/pipelineobj.c index 279ae2078fe..07acbf10c1d 100644 --- a/src/mesa/main/pipelineobj.c +++ b/src/mesa/main/pipelineobj.c @@ -244,14 +244,13 @@ _mesa_UseProgramStages(GLuint pipeline, GLbitfield stages, GLuint program) * * "If stages is not the special value ALL_SHADER_BITS, and has a bit * set that is not recognized, the error INVALID_VALUE is generated." - * - * NOT YET SUPPORTED: - * GL_TESS_CONTROL_SHADER_BIT - * GL_TESS_EVALUATION_SHADER_BIT */ any_valid_stages = GL_VERTEX_SHADER_BIT | GL_FRAGMENT_SHADER_BIT; if (_mesa_has_geometry_shaders(ctx)) any_valid_stages |= GL_GEOMETRY_SHADER_BIT; + if (_mesa_has_tessellation(ctx)) + any_valid_stages |= GL_TESS_CONTROL_SHADER_BIT | + GL_TESS_EVALUATION_SHADER_BIT; if (stages != GL_ALL_SHADER_BITS && (stages & ~any_valid_stages) != 0) { _mesa_error(ctx, GL_INVALID_VALUE, "glUseProgramStages(Stages)"); @@ -327,6 +326,12 @@ _mesa_UseProgramStages(GLuint pipeline, GLbitfield stages, GLuint program) if ((stages & GL_GEOMETRY_SHADER_BIT) != 0) _mesa_use_shader_program(ctx, GL_GEOMETRY_SHADER, shProg, pipe); + + if ((stages & GL_TESS_CONTROL_SHADER_BIT) != 0) + _mesa_use_shader_program(ctx, GL_TESS_CONTROL_SHADER, shProg, pipe); + + if ((stages & GL_TESS_EVALUATION_SHADER_BIT) != 0) + _mesa_use_shader_program(ctx, GL_TESS_EVALUATION_SHADER, shProg, pipe); } /** @@ -588,6 +593,7 @@ _mesa_GetProgramPipelineiv(GLuint pipeline, GLenum pname, GLint *params) /* Are geometry shaders available in this context? 
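With glUseProgramStages now accepting the tessellation stage bits, a separate-shader-objects client can route those stages through a pipeline object. A minimal sketch, assuming 'prog' is a linked program with GL_PROGRAM_SEPARABLE set that contains TCS and TES stages:

#include <epoxy/gl.h>   /* assumed loader header */

static GLuint
attach_tess_stages(GLuint prog)
{
   GLuint pipe;

   glGenProgramPipelines(1, &pipe);
   glUseProgramStages(pipe,
                      GL_TESS_CONTROL_SHADER_BIT | GL_TESS_EVALUATION_SHADER_BIT,
                      prog);
   glBindProgramPipeline(pipe);
   return pipe;
}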
*/ const bool has_gs = _mesa_has_geometry_shaders(ctx); + const bool has_tess = _mesa_has_tessellation(ctx);; if (!pipe) { _mesa_error(ctx, GL_INVALID_OPERATION, @@ -615,11 +621,17 @@ _mesa_GetProgramPipelineiv(GLuint pipeline, GLenum pname, GLint *params) ? pipe->CurrentProgram[MESA_SHADER_VERTEX]->Name : 0; return; case GL_TESS_EVALUATION_SHADER: - /* NOT YET SUPPORTED */ - break; + if (!has_tess) + break; + *params = pipe->CurrentProgram[MESA_SHADER_TESS_EVAL] + ? pipe->CurrentProgram[MESA_SHADER_TESS_EVAL]->Name : 0; + return; case GL_TESS_CONTROL_SHADER: - /* NOT YET SUPPORTED */ - break; + if (!has_tess) + break; + *params = pipe->CurrentProgram[MESA_SHADER_TESS_CTRL] + ? pipe->CurrentProgram[MESA_SHADER_TESS_CTRL]->Name : 0; + return; case GL_GEOMETRY_SHADER: if (!has_gs) break; @@ -635,7 +647,7 @@ _mesa_GetProgramPipelineiv(GLuint pipeline, GLenum pname, GLint *params) } _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramPipelineiv(pname=%s)", - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); } /** @@ -777,7 +789,9 @@ _mesa_validate_program_pipeline(struct gl_context* ctx, * executable vertex shader." */ if (!pipe->CurrentProgram[MESA_SHADER_VERTEX] - && pipe->CurrentProgram[MESA_SHADER_GEOMETRY]) { + && (pipe->CurrentProgram[MESA_SHADER_GEOMETRY] || + pipe->CurrentProgram[MESA_SHADER_TESS_CTRL] || + pipe->CurrentProgram[MESA_SHADER_TESS_EVAL])) { pipe->InfoLog = ralloc_strdup(pipe, "Program lacks a vertex shader"); goto err; } diff --git a/src/mesa/main/pixel.c b/src/mesa/main/pixel.c index ecda2694fc8..608a5454702 100644 --- a/src/mesa/main/pixel.c +++ b/src/mesa/main/pixel.c @@ -455,12 +455,12 @@ _mesa_GetnPixelMapusvARB( GLenum map, GLsizei bufSize, GLushort *values ) /* special cases */ case GL_PIXEL_MAP_I_TO_I: for (i = 0; i < mapsize; i++) { - values[i] = (GLushort) CLAMP(ctx->PixelMaps.ItoI.Map[i], 0.0, 65535.); + values[i] = (GLushort) CLAMP(ctx->PixelMaps.ItoI.Map[i], 0.0F, 65535.0F); } break; case GL_PIXEL_MAP_S_TO_S: for (i = 0; i < mapsize; i++) { - values[i] = (GLushort) CLAMP(ctx->PixelMaps.StoS.Map[i], 0.0, 65535.); + values[i] = (GLushort) CLAMP(ctx->PixelMaps.StoS.Map[i], 0.0F, 65535.0F); } break; default: diff --git a/src/mesa/main/pixeltransfer.c b/src/mesa/main/pixeltransfer.c index 94464ea6709..22eac00a7df 100644 --- a/src/mesa/main/pixeltransfer.c +++ b/src/mesa/main/pixeltransfer.c @@ -35,6 +35,7 @@ #include "pixeltransfer.h" #include "imports.h" #include "mtypes.h" +#include "util/rounding.h" /* @@ -47,25 +48,25 @@ _mesa_scale_and_bias_rgba(GLuint n, GLfloat rgba[][4], GLfloat rBias, GLfloat gBias, GLfloat bBias, GLfloat aBias) { - if (rScale != 1.0 || rBias != 0.0) { + if (rScale != 1.0F || rBias != 0.0F) { GLuint i; for (i = 0; i < n; i++) { rgba[i][RCOMP] = rgba[i][RCOMP] * rScale + rBias; } } - if (gScale != 1.0 || gBias != 0.0) { + if (gScale != 1.0F || gBias != 0.0F) { GLuint i; for (i = 0; i < n; i++) { rgba[i][GCOMP] = rgba[i][GCOMP] * gScale + gBias; } } - if (bScale != 1.0 || bBias != 0.0) { + if (bScale != 1.0F || bBias != 0.0F) { GLuint i; for (i = 0; i < n; i++) { rgba[i][BCOMP] = rgba[i][BCOMP] * bScale + bBias; } } - if (aScale != 1.0 || aBias != 0.0) { + if (aScale != 1.0F || aBias != 0.0F) { GLuint i; for (i = 0; i < n; i++) { rgba[i][ACOMP] = rgba[i][ACOMP] * aScale + aBias; @@ -94,10 +95,10 @@ _mesa_map_rgba( const struct gl_context *ctx, GLuint n, GLfloat rgba[][4] ) GLfloat g = CLAMP(rgba[i][GCOMP], 0.0F, 1.0F); GLfloat b = CLAMP(rgba[i][BCOMP], 0.0F, 1.0F); GLfloat a = CLAMP(rgba[i][ACOMP], 0.0F, 1.0F); - rgba[i][RCOMP] = 
rMap[F_TO_I(r * rscale)]; - rgba[i][GCOMP] = gMap[F_TO_I(g * gscale)]; - rgba[i][BCOMP] = bMap[F_TO_I(b * bscale)]; - rgba[i][ACOMP] = aMap[F_TO_I(a * ascale)]; + rgba[i][RCOMP] = rMap[(int)_mesa_lroundevenf(r * rscale)]; + rgba[i][GCOMP] = gMap[(int)_mesa_lroundevenf(g * gscale)]; + rgba[i][BCOMP] = bMap[(int)_mesa_lroundevenf(b * bscale)]; + rgba[i][ACOMP] = aMap[(int)_mesa_lroundevenf(a * ascale)]; } } @@ -236,7 +237,7 @@ _mesa_apply_ci_transfer_ops(const struct gl_context *ctx, GLuint i; for (i = 0; i < n; i++) { const GLuint j = indexes[i] & mask; - indexes[i] = F_TO_I(ctx->PixelMaps.ItoI.Map[j]); + indexes[i] = _mesa_lroundevenf(ctx->PixelMaps.ItoI.Map[j]); } } } diff --git a/src/mesa/main/points.c b/src/mesa/main/points.c index 5ad1f38f366..863e3c1af32 100644 --- a/src/mesa/main/points.c +++ b/src/mesa/main/points.c @@ -45,7 +45,7 @@ _mesa_PointSize( GLfloat size ) { GET_CURRENT_CONTEXT(ctx); - if (size <= 0.0) { + if (size <= 0.0F) { _mesa_error( ctx, GL_INVALID_VALUE, "glPointSize" ); return; } @@ -119,9 +119,9 @@ _mesa_PointParameterfv( GLenum pname, const GLfloat *params) return; FLUSH_VERTICES(ctx, _NEW_POINT); COPY_3V(ctx->Point.Params, params); - ctx->Point._Attenuated = (ctx->Point.Params[0] != 1.0 || - ctx->Point.Params[1] != 0.0 || - ctx->Point.Params[2] != 0.0); + ctx->Point._Attenuated = (ctx->Point.Params[0] != 1.0F || + ctx->Point.Params[1] != 0.0F || + ctx->Point.Params[2] != 0.0F); break; case GL_POINT_SIZE_MIN_EXT: if (params[0] < 0.0F) { diff --git a/src/mesa/main/polygon.c b/src/mesa/main/polygon.c index a1f0aa02da1..60af88f9857 100644 --- a/src/mesa/main/polygon.c +++ b/src/mesa/main/polygon.c @@ -56,7 +56,7 @@ _mesa_CullFace( GLenum mode ) GET_CURRENT_CONTEXT(ctx); if (MESA_VERBOSE&VERBOSE_API) - _mesa_debug(ctx, "glCullFace %s\n", _mesa_lookup_enum_by_nr(mode)); + _mesa_debug(ctx, "glCullFace %s\n", _mesa_enum_to_string(mode)); if (mode!=GL_FRONT && mode!=GL_BACK && mode!=GL_FRONT_AND_BACK) { _mesa_error( ctx, GL_INVALID_ENUM, "glCullFace" ); @@ -91,16 +91,16 @@ _mesa_FrontFace( GLenum mode ) GET_CURRENT_CONTEXT(ctx); if (MESA_VERBOSE&VERBOSE_API) - _mesa_debug(ctx, "glFrontFace %s\n", _mesa_lookup_enum_by_nr(mode)); + _mesa_debug(ctx, "glFrontFace %s\n", _mesa_enum_to_string(mode)); + + if (ctx->Polygon.FrontFace == mode) + return; if (mode!=GL_CW && mode!=GL_CCW) { _mesa_error( ctx, GL_INVALID_ENUM, "glFrontFace" ); return; } - if (ctx->Polygon.FrontFace == mode) - return; - FLUSH_VERTICES(ctx, _NEW_POLYGON); ctx->Polygon.FrontFace = mode; @@ -128,8 +128,8 @@ _mesa_PolygonMode( GLenum face, GLenum mode ) if (MESA_VERBOSE&VERBOSE_API) _mesa_debug(ctx, "glPolygonMode %s %s\n", - _mesa_lookup_enum_by_nr(face), - _mesa_lookup_enum_by_nr(mode)); + _mesa_enum_to_string(face), + _mesa_enum_to_string(mode)); if (mode!=GL_POINT && mode!=GL_LINE && mode!=GL_FILL) { _mesa_error( ctx, GL_INVALID_ENUM, "glPolygonMode(mode)" ); diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c index d857b84e60d..23d2b4d2da0 100644 --- a/src/mesa/main/program_resource.c +++ b/src/mesa/main/program_resource.c @@ -28,10 +28,11 @@ #include "main/mtypes.h" #include "main/shaderapi.h" #include "main/shaderobj.h" +#include "main/context.h" #include "program_resource.h" - +#include "ir_uniform.h" static bool -supported_interface_enum(GLenum iface) +supported_interface_enum(struct gl_context *ctx, GLenum iface) { switch (iface) { case GL_UNIFORM: @@ -42,17 +43,21 @@ supported_interface_enum(GLenum iface) case GL_ATOMIC_COUNTER_BUFFER: return true; case 
GL_VERTEX_SUBROUTINE: - case GL_TESS_CONTROL_SUBROUTINE: - case GL_TESS_EVALUATION_SUBROUTINE: - case GL_GEOMETRY_SUBROUTINE: case GL_FRAGMENT_SUBROUTINE: - case GL_COMPUTE_SUBROUTINE: case GL_VERTEX_SUBROUTINE_UNIFORM: - case GL_TESS_CONTROL_SUBROUTINE_UNIFORM: - case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: - case GL_GEOMETRY_SUBROUTINE_UNIFORM: case GL_FRAGMENT_SUBROUTINE_UNIFORM: + return _mesa_has_shader_subroutine(ctx); + case GL_GEOMETRY_SUBROUTINE: + case GL_GEOMETRY_SUBROUTINE_UNIFORM: + return _mesa_has_geometry_shaders(ctx) && _mesa_has_shader_subroutine(ctx); + case GL_COMPUTE_SUBROUTINE: case GL_COMPUTE_SUBROUTINE_UNIFORM: + return _mesa_has_compute_shaders(ctx) && _mesa_has_shader_subroutine(ctx); + case GL_TESS_CONTROL_SUBROUTINE: + case GL_TESS_EVALUATION_SUBROUTINE: + case GL_TESS_CONTROL_SUBROUTINE_UNIFORM: + case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: + return _mesa_has_tessellation(ctx) && _mesa_has_shader_subroutine(ctx); case GL_BUFFER_VARIABLE: case GL_SHADER_STORAGE_BLOCK: default: @@ -79,9 +84,9 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface, } /* Validate interface. */ - if (!supported_interface_enum(programInterface)) { + if (!supported_interface_enum(ctx, programInterface)) { _mesa_error(ctx, GL_INVALID_OPERATION, "glGetProgramInterfaceiv(%s)", - _mesa_lookup_enum_by_nr(programInterface)); + _mesa_enum_to_string(programInterface)); return; } @@ -96,8 +101,8 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface, if (programInterface == GL_ATOMIC_COUNTER_BUFFER) { _mesa_error(ctx, GL_INVALID_OPERATION, "glGetProgramInterfaceiv(%s pname %s)", - _mesa_lookup_enum_by_nr(programInterface), - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(programInterface), + _mesa_enum_to_string(pname)); return; } /* Name length consists of base name, 3 additional chars '[0]' if @@ -138,15 +143,40 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface, default: _mesa_error(ctx, GL_INVALID_OPERATION, "glGetProgramInterfaceiv(%s pname %s)", - _mesa_lookup_enum_by_nr(programInterface), - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(programInterface), + _mesa_enum_to_string(pname)); }; break; case GL_MAX_NUM_COMPATIBLE_SUBROUTINES: + switch (programInterface) { + case GL_VERTEX_SUBROUTINE_UNIFORM: + case GL_FRAGMENT_SUBROUTINE_UNIFORM: + case GL_GEOMETRY_SUBROUTINE_UNIFORM: + case GL_COMPUTE_SUBROUTINE_UNIFORM: + case GL_TESS_CONTROL_SUBROUTINE_UNIFORM: + case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: { + for (i = 0, *params = 0; i < shProg->NumProgramResourceList; i++) { + if (shProg->ProgramResourceList[i].Type == programInterface) { + struct gl_uniform_storage *uni = + (struct gl_uniform_storage *) + shProg->ProgramResourceList[i].Data; + *params = MAX2(*params, uni->num_compatible_subroutines); + } + } + break; + } + + default: + _mesa_error(ctx, GL_INVALID_OPERATION, + "glGetProgramInterfaceiv(%s pname %s)", + _mesa_enum_to_string(programInterface), + _mesa_enum_to_string(pname)); + } + break; default: _mesa_error(ctx, GL_INVALID_OPERATION, "glGetProgramInterfaceiv(pname %s)", - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); } } @@ -173,32 +203,12 @@ is_xfb_marker(const char *str) return false; } -/** - * Checks if given name index is legal for GetProgramResourceIndex, - * check is written to be compatible with GL_ARB_array_of_arrays. 
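With the subroutine interfaces accepted by supported_interface_enum and glGetProgramInterfaceiv above, an application can enumerate subroutine uniforms through the program interface query path. An illustrative sketch; "shade" is an assumed uniform name, not anything defined by this change:

#include <epoxy/gl.h>   /* assumed loader header */

static void
inspect_subroutine_uniforms(GLuint prog, GLint *active, GLint *max_compat,
                            GLuint *shade_index)
{
   glGetProgramInterfaceiv(prog, GL_FRAGMENT_SUBROUTINE_UNIFORM,
                           GL_ACTIVE_RESOURCES, active);
   glGetProgramInterfaceiv(prog, GL_FRAGMENT_SUBROUTINE_UNIFORM,
                           GL_MAX_NUM_COMPATIBLE_SUBROUTINES, max_compat);

   /* "shade" is an assumed subroutine uniform name in the fragment shader. */
   *shade_index = glGetProgramResourceIndex(prog, GL_FRAGMENT_SUBROUTINE_UNIFORM,
                                            "shade");
}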
- */ -static bool -valid_program_resource_index_name(const GLchar *name) -{ - const char *array = strstr(name, "["); - const char *close = strrchr(name, ']'); - - /* Not array, no need for the check. */ - if (!array) - return true; - - /* Last array index has to be zero. */ - if (!close || *--close != '0') - return false; - - return true; -} - GLuint GLAPIENTRY _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface, const GLchar *name) { GET_CURRENT_CONTEXT(ctx); + unsigned array_index = 0; struct gl_program_resource *res; struct gl_shader_program *shProg = _mesa_lookup_shader_program_err(ctx, program, @@ -206,6 +216,11 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface, if (!shProg || !name) return GL_INVALID_INDEX; + if (!supported_interface_enum(ctx, programInterface)) { + _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceIndex(%s)", + _mesa_enum_to_string(programInterface)); + return GL_INVALID_INDEX; + } /* * For the interface TRANSFORM_FEEDBACK_VARYING, the value INVALID_INDEX * should be returned when querying the index assigned to the special names @@ -217,24 +232,33 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface, return GL_INVALID_INDEX; switch (programInterface) { + case GL_TESS_CONTROL_SUBROUTINE: + case GL_TESS_CONTROL_SUBROUTINE_UNIFORM: + case GL_TESS_EVALUATION_SUBROUTINE: + case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: + case GL_COMPUTE_SUBROUTINE: + case GL_COMPUTE_SUBROUTINE_UNIFORM: + case GL_GEOMETRY_SUBROUTINE: + case GL_GEOMETRY_SUBROUTINE_UNIFORM: + case GL_VERTEX_SUBROUTINE: + case GL_FRAGMENT_SUBROUTINE: + case GL_VERTEX_SUBROUTINE_UNIFORM: + case GL_FRAGMENT_SUBROUTINE_UNIFORM: case GL_PROGRAM_INPUT: case GL_PROGRAM_OUTPUT: case GL_UNIFORM: case GL_TRANSFORM_FEEDBACK_VARYING: - /* Validate name syntax for array variables */ - if (!valid_program_resource_index_name(name)) - return GL_INVALID_INDEX; - /* fall-through */ case GL_UNIFORM_BLOCK: - res = _mesa_program_resource_find_name(shProg, programInterface, name); - if (!res) + res = _mesa_program_resource_find_name(shProg, programInterface, name, + &array_index); + if (!res || array_index > 0) return GL_INVALID_INDEX; return _mesa_program_resource_index(shProg, res); case GL_ATOMIC_COUNTER_BUFFER: default: _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceIndex(%s)", - _mesa_lookup_enum_by_nr(programInterface)); + _mesa_enum_to_string(programInterface)); } return GL_INVALID_INDEX; @@ -250,19 +274,13 @@ _mesa_GetProgramResourceName(GLuint program, GLenum programInterface, _mesa_lookup_shader_program_err(ctx, program, "glGetProgramResourceName"); - /* Set user friendly return values in case of errors. 
*/ - if (name) - *name = '\0'; - if (length) - *length = 0; - if (!shProg || !name) return; if (programInterface == GL_ATOMIC_COUNTER_BUFFER || - !supported_interface_enum(programInterface)) { + !supported_interface_enum(ctx, programInterface)) { _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceName(%s)", - _mesa_lookup_enum_by_nr(programInterface)); + _mesa_enum_to_string(programInterface)); return; } @@ -300,36 +318,6 @@ _mesa_GetProgramResourceiv(GLuint program, GLenum programInterface, propCount, props, bufSize, length, params); } -/** - * Function verifies syntax of given name for GetProgramResourceLocation - * and GetProgramResourceLocationIndex for the following cases: - * - * "array element portion of a string passed to GetProgramResourceLocation - * or GetProgramResourceLocationIndex must not have, a "+" sign, extra - * leading zeroes, or whitespace". - * - * Check is written to be compatible with GL_ARB_array_of_arrays. - */ -static bool -invalid_array_element_syntax(const GLchar *name) -{ - char *first = strchr(name, '['); - char *last = strrchr(name, '['); - - if (!first) - return false; - - /* No '+' or ' ' allowed anywhere. */ - if (strchr(first, '+') || strchr(first, ' ')) - return true; - - /* Check that last array index is 0. */ - if (last[1] == '0' && last[2] != ']') - return true; - - return false; -} - static struct gl_shader_program * lookup_linked_program(GLuint program, const char *caller) { @@ -356,7 +344,7 @@ _mesa_GetProgramResourceLocation(GLuint program, GLenum programInterface, struct gl_shader_program *shProg = lookup_linked_program(program, "glGetProgramResourceLocation"); - if (!shProg || !name || invalid_array_element_syntax(name)) + if (!shProg || !name) return -1; /* Validate programInterface. */ @@ -366,24 +354,33 @@ _mesa_GetProgramResourceLocation(GLuint program, GLenum programInterface, case GL_PROGRAM_OUTPUT: break; - /* For reference valid cases requiring additional extension support: - * GL_ARB_shader_subroutine - * GL_ARB_tessellation_shader - * GL_ARB_compute_shader - */ case GL_VERTEX_SUBROUTINE_UNIFORM: - case GL_TESS_CONTROL_SUBROUTINE_UNIFORM: - case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: - case GL_GEOMETRY_SUBROUTINE_UNIFORM: case GL_FRAGMENT_SUBROUTINE_UNIFORM: + if (!_mesa_has_shader_subroutine(ctx)) + goto fail; + break; + case GL_GEOMETRY_SUBROUTINE_UNIFORM: + if (!_mesa_has_geometry_shaders(ctx) || !_mesa_has_shader_subroutine(ctx)) + goto fail; + break; case GL_COMPUTE_SUBROUTINE_UNIFORM: - + if (!_mesa_has_compute_shaders(ctx) || !_mesa_has_shader_subroutine(ctx)) + goto fail; + break; + case GL_TESS_CONTROL_SUBROUTINE_UNIFORM: + case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: + if (!_mesa_has_tessellation(ctx) || !_mesa_has_shader_subroutine(ctx)) + goto fail; + break; default: - _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceLocation(%s %s)", - _mesa_lookup_enum_by_nr(programInterface), name); + goto fail; } return _mesa_program_resource_location(shProg, programInterface, name); +fail: + _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceLocation(%s %s)", + _mesa_enum_to_string(programInterface), name); + return -1; } /** @@ -397,7 +394,7 @@ _mesa_GetProgramResourceLocationIndex(GLuint program, GLenum programInterface, struct gl_shader_program *shProg = lookup_linked_program(program, "glGetProgramResourceLocationIndex"); - if (!shProg || !name || invalid_array_element_syntax(name)) + if (!shProg || !name) return -1; /* From the GL_ARB_program_interface_query spec: @@ -408,7 +405,7 @@ 
_mesa_GetProgramResourceLocationIndex(GLuint program, GLenum programInterface, if (programInterface != GL_PROGRAM_OUTPUT) { _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceLocationIndex(%s)", - _mesa_lookup_enum_by_nr(programInterface)); + _mesa_enum_to_string(programInterface)); return -1; } diff --git a/src/mesa/main/queryobj.c b/src/mesa/main/queryobj.c index 5ff1b953231..98366857f62 100644 --- a/src/mesa/main/queryobj.c +++ b/src/mesa/main/queryobj.c @@ -217,7 +217,7 @@ get_query_binding_point(struct gl_context *ctx, GLenum target, GLuint index) case GL_TESS_CONTROL_SHADER_PATCHES_ARB: case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB: - if (ctx->Extensions.ARB_tessellation_shader) + if (_mesa_has_tessellation(ctx)) return get_pipe_stats_binding_point(ctx, target); else return NULL; @@ -295,7 +295,7 @@ _mesa_CreateQueries(GLenum target, GLsizei n, GLuint *ids) break; default: _mesa_error(ctx, GL_INVALID_ENUM, "glCreateQueries(invalid target = %s)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return; } @@ -390,7 +390,7 @@ _mesa_BeginQueryIndexed(GLenum target, GLuint index, GLuint id) if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glBeginQueryIndexed(%s, %u, %u)\n", - _mesa_lookup_enum_by_nr(target), index, id); + _mesa_enum_to_string(target), index, id); if (!query_error_check_index(ctx, target, index)) return; @@ -412,7 +412,7 @@ _mesa_BeginQueryIndexed(GLenum target, GLuint index, GLuint id) if (*bindpt) { _mesa_error(ctx, GL_INVALID_OPERATION, "glBeginQuery{Indexed}(target=%s is active)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return; } @@ -496,7 +496,7 @@ _mesa_EndQueryIndexed(GLenum target, GLuint index) if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glEndQueryIndexed(%s, %u)\n", - _mesa_lookup_enum_by_nr(target), index); + _mesa_enum_to_string(target), index); if (!query_error_check_index(ctx, target, index)) return; @@ -516,8 +516,8 @@ _mesa_EndQueryIndexed(GLenum target, GLuint index) if (q && q->Target != target) { _mesa_error(ctx, GL_INVALID_OPERATION, "glEndQuery(target=%s with active query of target %s)", - _mesa_lookup_enum_by_nr(target), - _mesa_lookup_enum_by_nr(q->Target)); + _mesa_enum_to_string(target), + _mesa_enum_to_string(q->Target)); return; } @@ -553,7 +553,7 @@ _mesa_QueryCounter(GLuint id, GLenum target) if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glQueryCounter(%u, %s)\n", id, - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); /* error checking */ if (target != GL_TIMESTAMP) { @@ -628,9 +628,9 @@ _mesa_GetQueryIndexediv(GLenum target, GLuint index, GLenum pname, if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glGetQueryIndexediv(%s, %u, %s)\n", - _mesa_lookup_enum_by_nr(target), + _mesa_enum_to_string(target), index, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); if (!query_error_check_index(ctx, target, index)) return; @@ -712,7 +712,7 @@ _mesa_GetQueryIndexediv(GLenum target, GLuint index, GLenum pname, default: _mesa_problem(ctx, "Unknown target in glGetQueryIndexediv(target = %s)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); *params = 0; break; } @@ -740,7 +740,7 @@ _mesa_GetQueryObjectiv(GLuint id, GLenum pname, GLint *params) if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glGetQueryObjectiv(%u, %s)\n", id, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); if (id) q = _mesa_lookup_query_object(ctx, id); @@ -794,7 +794,7 @@ _mesa_GetQueryObjectuiv(GLuint id, GLenum pname, GLuint *params) if 
(MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glGetQueryObjectuiv(%u, %s)\n", id, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); if (id) q = _mesa_lookup_query_object(ctx, id); @@ -851,7 +851,7 @@ _mesa_GetQueryObjecti64v(GLuint id, GLenum pname, GLint64EXT *params) if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glGetQueryObjecti64v(%u, %s)\n", id, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); if (id) q = _mesa_lookup_query_object(ctx, id); @@ -894,7 +894,7 @@ _mesa_GetQueryObjectui64v(GLuint id, GLenum pname, GLuint64EXT *params) if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glGetQueryObjectui64v(%u, %s)\n", id, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); if (id) q = _mesa_lookup_query_object(ctx, id); diff --git a/src/mesa/main/readpix.c b/src/mesa/main/readpix.c index a3357cd6419..d826ecfc3d5 100644 --- a/src/mesa/main/readpix.c +++ b/src/mesa/main/readpix.c @@ -47,28 +47,47 @@ * Return true if the conversion L=R+G+B is needed. */ GLboolean -_mesa_need_rgb_to_luminance_conversion(mesa_format texFormat, GLenum format) +_mesa_need_rgb_to_luminance_conversion(GLenum srcBaseFormat, + GLenum dstBaseFormat) { - GLenum baseTexFormat = _mesa_get_format_base_format(texFormat); - - return (baseTexFormat == GL_RG || - baseTexFormat == GL_RGB || - baseTexFormat == GL_RGBA) && - (format == GL_LUMINANCE || - format == GL_LUMINANCE_ALPHA || - format == GL_LUMINANCE_INTEGER_EXT || - format == GL_LUMINANCE_ALPHA_INTEGER_EXT); + return (srcBaseFormat == GL_RG || + srcBaseFormat == GL_RGB || + srcBaseFormat == GL_RGBA) && + (dstBaseFormat == GL_LUMINANCE || + dstBaseFormat == GL_LUMINANCE_ALPHA); } +/** + * Return true if the conversion L,I to RGB conversion is needed. + */ +GLboolean +_mesa_need_luminance_to_rgb_conversion(GLenum srcBaseFormat, + GLenum dstBaseFormat) +{ + return (srcBaseFormat == GL_LUMINANCE || + srcBaseFormat == GL_LUMINANCE_ALPHA || + srcBaseFormat == GL_INTENSITY) && + (dstBaseFormat == GL_GREEN || + dstBaseFormat == GL_BLUE || + dstBaseFormat == GL_RG || + dstBaseFormat == GL_RGB || + dstBaseFormat == GL_BGR || + dstBaseFormat == GL_RGBA || + dstBaseFormat == GL_BGRA); +} /** * Return transfer op flags for this ReadPixels operation. */ -static GLbitfield -get_readpixels_transfer_ops(const struct gl_context *ctx, mesa_format texFormat, - GLenum format, GLenum type, GLboolean uses_blit) +GLbitfield +_mesa_get_readpixels_transfer_ops(const struct gl_context *ctx, + mesa_format texFormat, + GLenum format, GLenum type, + GLboolean uses_blit) { GLbitfield transferOps = ctx->_ImageTransferState; + GLenum srcBaseFormat = _mesa_get_format_base_format(texFormat); + GLenum dstBaseFormat = _mesa_unpack_format_to_base_format(format); if (format == GL_DEPTH_COMPONENT || format == GL_DEPTH_STENCIL || @@ -105,7 +124,7 @@ get_readpixels_transfer_ops(const struct gl_context *ctx, mesa_format texFormat, * have any effect anyway. 
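One case that exercises the luminance conversion check above is a readback into GL_LUMINANCE from an RGBA color buffer, which forces the slow path because L = R + G + B has to be computed per pixel. A small sketch, assuming a desktop compatibility profile (GL_LUMINANCE is not a valid glReadPixels format in core profile) and an illustrative 64x64 read:

#include <epoxy/gl.h>   /* assumed loader header */

static void
read_back_luminance(GLubyte out[64 * 64])
{
   glReadPixels(0, 0, 64, 64, GL_LUMINANCE, GL_UNSIGNED_BYTE, out);
}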
*/ if (_mesa_get_format_datatype(texFormat) == GL_UNSIGNED_NORMALIZED && - !_mesa_need_rgb_to_luminance_conversion(texFormat, format)) { + !_mesa_need_rgb_to_luminance_conversion(srcBaseFormat, dstBaseFormat)) { transferOps &= ~IMAGE_CLAMP_BIT; } @@ -128,7 +147,7 @@ _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format, { struct gl_renderbuffer *rb = _mesa_get_read_renderbuffer_for_format(ctx, format); - GLenum srcType; + GLenum dstBaseFormat = _mesa_unpack_format_to_base_format(format); assert(rb); @@ -149,28 +168,14 @@ _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format, default: /* Color formats. */ - if (_mesa_need_rgb_to_luminance_conversion(rb->Format, format)) { - return GL_TRUE; - } - - /* Conversion between signed and unsigned integers needs masking - * (it isn't just memcpy). */ - srcType = _mesa_get_format_datatype(rb->Format); - - if ((srcType == GL_INT && - (type == GL_UNSIGNED_INT || - type == GL_UNSIGNED_SHORT || - type == GL_UNSIGNED_BYTE)) || - (srcType == GL_UNSIGNED_INT && - (type == GL_INT || - type == GL_SHORT || - type == GL_BYTE))) { + if (_mesa_need_rgb_to_luminance_conversion(rb->_BaseFormat, + dstBaseFormat)) { return GL_TRUE; } /* And finally, see if there are any transfer ops. */ - return get_readpixels_transfer_ops(ctx, rb->Format, format, type, - uses_blit) != 0; + return _mesa_get_readpixels_transfer_ops(ctx, rb->Format, format, type, + uses_blit) != 0; } return GL_FALSE; } @@ -263,7 +268,7 @@ read_uint_depth_pixels( struct gl_context *ctx, GLubyte *map, *dst; int stride, dstStride, j; - if (ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0) + if (ctx->Pixel.DepthScale != 1.0F || ctx->Pixel.DepthBias != 0.0F) return GL_FALSE; if (packing->SwapBytes) @@ -432,18 +437,19 @@ read_rgba_pixels( struct gl_context *ctx, uint8_t rebase_swizzle[4]; struct gl_framebuffer *fb = ctx->ReadBuffer; struct gl_renderbuffer *rb = fb->_ColorReadBuffer; + GLenum dstBaseFormat = _mesa_unpack_format_to_base_format(format); if (!rb) return; - transferOps = get_readpixels_transfer_ops(ctx, rb->Format, format, type, - GL_FALSE); + transferOps = _mesa_get_readpixels_transfer_ops(ctx, rb->Format, format, + type, GL_FALSE); /* Describe the dst format */ dst_is_integer = _mesa_is_enum_format_integer(format); dst_stride = _mesa_image_row_stride(packing, width, format, type); dst_format = _mesa_format_from_format_and_type(format, type); convert_rgb_to_lum = - _mesa_need_rgb_to_luminance_conversion(rb->Format, format); + _mesa_need_rgb_to_luminance_conversion(rb->_BaseFormat, dstBaseFormat); dst = (GLubyte *) _mesa_image_address2d(packing, pixels, width, height, format, type, 0, 0); @@ -815,7 +821,7 @@ read_depth_stencil_pixels(struct gl_context *ctx, const struct gl_pixelstore_attrib *packing ) { const GLboolean scaleOrBias - = ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0; + = ctx->Pixel.DepthScale != 1.0F || ctx->Pixel.DepthBias != 0.0F; const GLboolean stencilTransfer = ctx->Pixel.IndexShift || ctx->Pixel.IndexOffset || ctx->Pixel.MapStencilFlag; GLubyte *dst; @@ -910,10 +916,8 @@ read_pixels_es3_error_check(GLenum format, GLenum type, const GLenum data_type = _mesa_get_format_datatype(rb->Format); GLboolean is_unsigned_int = GL_FALSE; GLboolean is_signed_int = GL_FALSE; - - if (!_mesa_is_color_format(internalFormat)) { - return GL_INVALID_OPERATION; - } + GLboolean is_float_depth = (internalFormat == GL_DEPTH_COMPONENT32F) || + (internalFormat == GL_DEPTH32F_STENCIL8); is_unsigned_int = 
_mesa_is_enum_format_unsigned_int(internalFormat); if (!is_unsigned_int) { @@ -944,6 +948,43 @@ read_pixels_es3_error_check(GLenum format, GLenum type, (is_unsigned_int && type == GL_UNSIGNED_INT)) return GL_NO_ERROR; break; + case GL_DEPTH_STENCIL: + switch (type) { + case GL_FLOAT_32_UNSIGNED_INT_24_8_REV: + if (is_float_depth) + return GL_NO_ERROR; + break; + case GL_UNSIGNED_INT_24_8: + if (!is_float_depth) + return GL_NO_ERROR; + break; + default: + return GL_INVALID_ENUM; + } + break; + case GL_DEPTH_COMPONENT: + switch (type) { + case GL_FLOAT: + if (is_float_depth) + return GL_NO_ERROR; + break; + case GL_UNSIGNED_SHORT: + case GL_UNSIGNED_INT_24_8: + if (!is_float_depth) + return GL_NO_ERROR; + break; + default: + return GL_INVALID_ENUM; + } + break; + case GL_STENCIL_INDEX: + switch (type) { + case GL_UNSIGNED_BYTE: + return GL_NO_ERROR; + default: + return GL_INVALID_ENUM; + } + break; } return GL_INVALID_OPERATION; @@ -966,8 +1007,8 @@ _mesa_ReadnPixelsARB( GLint x, GLint y, GLsizei width, GLsizei height, if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glReadPixels(%d, %d, %s, %s, %p)\n", width, height, - _mesa_lookup_enum_by_nr(format), - _mesa_lookup_enum_by_nr(type), + _mesa_enum_to_string(format), + _mesa_enum_to_string(type), pixels); if (width < 0 || height < 0) { @@ -1017,15 +1058,10 @@ _mesa_ReadnPixelsARB( GLint x, GLint y, GLsizei width, GLsizei height, err = read_pixels_es3_error_check(format, type, rb); } - if (err == GL_NO_ERROR && (format == GL_DEPTH_COMPONENT - || format == GL_DEPTH_STENCIL)) { - err = GL_INVALID_ENUM; - } - if (err != GL_NO_ERROR) { _mesa_error(ctx, err, "glReadPixels(invalid format %s and/or type %s)", - _mesa_lookup_enum_by_nr(format), - _mesa_lookup_enum_by_nr(type)); + _mesa_enum_to_string(format), + _mesa_enum_to_string(type)); return; } } @@ -1033,8 +1069,8 @@ _mesa_ReadnPixelsARB( GLint x, GLint y, GLsizei width, GLsizei height, err = _mesa_error_check_format_and_type(ctx, format, type); if (err != GL_NO_ERROR) { _mesa_error(ctx, err, "glReadPixels(invalid format %s and/or type %s)", - _mesa_lookup_enum_by_nr(format), - _mesa_lookup_enum_by_nr(type)); + _mesa_enum_to_string(format), + _mesa_enum_to_string(type)); return; } diff --git a/src/mesa/main/readpix.h b/src/mesa/main/readpix.h index 1636dd9ce3e..481ad9d9c37 100644 --- a/src/mesa/main/readpix.h +++ b/src/mesa/main/readpix.h @@ -38,7 +38,18 @@ _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format, GLenum type, GLboolean uses_blit); extern GLboolean -_mesa_need_rgb_to_luminance_conversion(mesa_format texFormat, GLenum format); +_mesa_need_rgb_to_luminance_conversion(GLenum srcBaseFormat, + GLenum dstBaseFormat); + +extern GLboolean +_mesa_need_luminance_to_rgb_conversion(GLenum srcBaseFormat, + GLenum dstBaseFormat); + +extern GLbitfield +_mesa_get_readpixels_transfer_ops(const struct gl_context *ctx, + mesa_format texFormat, + GLenum format, GLenum type, + GLboolean uses_blit); extern void _mesa_readpixels(struct gl_context *ctx, diff --git a/src/mesa/main/samplerobj.c b/src/mesa/main/samplerobj.c index a3aacc66aa3..32180fb1ba2 100644 --- a/src/mesa/main/samplerobj.c +++ b/src/mesa/main/samplerobj.c @@ -689,7 +689,7 @@ set_sampler_max_anisotropy(struct gl_context *ctx, if (samp->MaxAnisotropy == param) return GL_FALSE; - if (param < 1.0) + if (param < 1.0F) return INVALID_VALUE; flush(ctx); @@ -813,7 +813,7 @@ _mesa_SamplerParameteri(GLuint sampler, GLenum pname, GLint param) break; case INVALID_PNAME: _mesa_error(ctx, GL_INVALID_ENUM, 
"glSamplerParameteri(pname=%s)\n", - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); break; case INVALID_PARAM: _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameteri(param=%d)\n", @@ -906,7 +906,7 @@ _mesa_SamplerParameterf(GLuint sampler, GLenum pname, GLfloat param) break; case INVALID_PNAME: _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterf(pname=%s)\n", - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); break; case INVALID_PARAM: _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterf(param=%f)\n", @@ -1006,7 +1006,7 @@ _mesa_SamplerParameteriv(GLuint sampler, GLenum pname, const GLint *params) break; case INVALID_PNAME: _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameteriv(pname=%s)\n", - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); break; case INVALID_PARAM: _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameteriv(param=%d)\n", @@ -1099,7 +1099,7 @@ _mesa_SamplerParameterfv(GLuint sampler, GLenum pname, const GLfloat *params) break; case INVALID_PNAME: _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterfv(pname=%s)\n", - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); break; case INVALID_PARAM: _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterfv(param=%f)\n", @@ -1184,7 +1184,7 @@ _mesa_SamplerParameterIiv(GLuint sampler, GLenum pname, const GLint *params) break; case INVALID_PNAME: _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterIiv(pname=%s)\n", - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); break; case INVALID_PARAM: _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterIiv(param=%d)\n", @@ -1270,7 +1270,7 @@ _mesa_SamplerParameterIuiv(GLuint sampler, GLenum pname, const GLuint *params) break; case INVALID_PNAME: _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterIuiv(pname=%s)\n", - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); break; case INVALID_PARAM: _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterIuiv(param=%u)\n", @@ -1380,7 +1380,7 @@ _mesa_GetSamplerParameteriv(GLuint sampler, GLenum pname, GLint *params) invalid_pname: _mesa_error(ctx, GL_INVALID_ENUM, "glGetSamplerParameteriv(pname=%s)", - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); } @@ -1466,7 +1466,7 @@ _mesa_GetSamplerParameterfv(GLuint sampler, GLenum pname, GLfloat *params) invalid_pname: _mesa_error(ctx, GL_INVALID_ENUM, "glGetSamplerParameterfv(pname=%s)", - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); } @@ -1545,7 +1545,7 @@ _mesa_GetSamplerParameterIiv(GLuint sampler, GLenum pname, GLint *params) invalid_pname: _mesa_error(ctx, GL_INVALID_ENUM, "glGetSamplerParameterIiv(pname=%s)", - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); } @@ -1624,7 +1624,7 @@ _mesa_GetSamplerParameterIuiv(GLuint sampler, GLenum pname, GLuint *params) invalid_pname: _mesa_error(ctx, GL_INVALID_ENUM, "glGetSamplerParameterIuiv(pname=%s)", - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); } diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp index a6246a39aad..ee7320221e2 100644 --- a/src/mesa/main/shader_query.cpp +++ b/src/mesa/main/shader_query.cpp @@ -44,7 +44,8 @@ extern "C" { static GLint program_resource_location(struct gl_shader_program *shProg, - struct gl_program_resource *res, const char *name); + struct gl_program_resource *res, const char *name, + unsigned array_index); /** * Declare convenience functions to return resource data in a given type. 
@@ -61,6 +62,7 @@ DECL_RESOURCE_FUNC(UBO, gl_uniform_block); DECL_RESOURCE_FUNC(UNI, gl_uniform_storage); DECL_RESOURCE_FUNC(ATC, gl_active_atomic_buffer); DECL_RESOURCE_FUNC(XFB, gl_transform_feedback_varying_info); +DECL_RESOURCE_FUNC(SUB, gl_subroutine_function); void GLAPIENTRY _mesa_BindAttribLocation(GLhandleARB program, GLuint index, @@ -189,63 +191,6 @@ _mesa_GetActiveAttrib(GLhandleARB program, GLuint desired_index, (GLint *) type, "glGetActiveAttrib"); } -/* Locations associated with shader variables (array or non-array) can be - * queried using its base name or using the base name appended with the - * valid array index. For example, in case of below vertex shader, valid - * queries can be made to know the location of "xyz", "array", "array[0]", - * "array[1]", "array[2]" and "array[3]". In this example index reurned - * will be 0, 0, 0, 1, 2, 3 respectively. - * - * [Vertex Shader] - * layout(location=0) in vec4 xyz; - * layout(location=1) in vec4[4] array; - * void main() - * { } - * - * This requirement came up with the addition of ARB_program_interface_query - * to OpenGL 4.3 specification. See page 101 (page 122 of the PDF) for details. - * - * This utility function is used by: - * _mesa_GetAttribLocation - * _mesa_GetFragDataLocation - * _mesa_GetFragDataIndex - * - * Returns 0: - * if the 'name' string matches var->name. - * Returns 'matched index': - * if the 'name' string matches var->name appended with valid array index. - */ -int static inline -get_matching_index(const ir_variable *const var, const char *name) { - unsigned idx = 0; - const char *const paren = strchr(name, '['); - const unsigned len = (paren != NULL) ? paren - name : strlen(name); - - if (paren != NULL) { - if (!var->type->is_array()) - return -1; - - char *endptr; - idx = (unsigned) strtol(paren + 1, &endptr, 10); - const unsigned idx_len = endptr != (paren + 1) ? endptr - paren - 1 : 0; - - /* Validate the sub string representing index in 'name' string */ - if ((idx > 0 && paren[1] == '0') /* leading zeroes */ - || (idx == 0 && idx_len > 1) /* all zeroes */ - || paren[1] == ' ' /* whitespace */ - || endptr[0] != ']' /* closing brace */ - || endptr[1] != '\0' /* null char */ - || idx_len == 0 /* missing index */ - || idx >= var->type->length) /* exceeding array bound */ - return -1; - } - - if (strncmp(var->name, name, len) == 0 && var->name[len] == '\0') - return idx; - - return -1; -} - GLint GLAPIENTRY _mesa_GetAttribLocation(GLhandleARB program, const GLcharARB * name) { @@ -271,13 +216,15 @@ _mesa_GetAttribLocation(GLhandleARB program, const GLcharARB * name) if (shProg->_LinkedShaders[MESA_SHADER_VERTEX] == NULL) return -1; + unsigned array_index = 0; struct gl_program_resource *res = - _mesa_program_resource_find_name(shProg, GL_PROGRAM_INPUT, name); + _mesa_program_resource_find_name(shProg, GL_PROGRAM_INPUT, name, + &array_index); if (!res) return -1; - GLint loc = program_resource_location(shProg, res, name); + GLint loc = program_resource_location(shProg, res, name, array_index); /* The extra check against against 0 is made because of builtin-attribute * locations that have offset applied. 
Function program_resource_location @@ -455,13 +402,15 @@ _mesa_GetFragDataLocation(GLuint program, const GLchar *name) if (shProg->_LinkedShaders[MESA_SHADER_FRAGMENT] == NULL) return -1; + unsigned array_index = 0; struct gl_program_resource *res = - _mesa_program_resource_find_name(shProg, GL_PROGRAM_OUTPUT, name); + _mesa_program_resource_find_name(shProg, GL_PROGRAM_OUTPUT, name, + &array_index); if (!res) return -1; - GLint loc = program_resource_location(shProg, res, name); + GLint loc = program_resource_location(shProg, res, name, array_index); /* The extra check against against 0 is made because of builtin-attribute * locations that have offset applied. Function program_resource_location @@ -497,6 +446,20 @@ _mesa_program_resource_name(struct gl_program_resource *res) return RESOURCE_VAR(res)->name; case GL_UNIFORM: return RESOURCE_UNI(res)->name; + case GL_VERTEX_SUBROUTINE_UNIFORM: + case GL_GEOMETRY_SUBROUTINE_UNIFORM: + case GL_FRAGMENT_SUBROUTINE_UNIFORM: + case GL_COMPUTE_SUBROUTINE_UNIFORM: + case GL_TESS_CONTROL_SUBROUTINE_UNIFORM: + case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: + return RESOURCE_UNI(res)->name + MESA_SUBROUTINE_PREFIX_LEN; + case GL_VERTEX_SUBROUTINE: + case GL_GEOMETRY_SUBROUTINE: + case GL_FRAGMENT_SUBROUTINE: + case GL_COMPUTE_SUBROUTINE: + case GL_TESS_CONTROL_SUBROUTINE: + case GL_TESS_EVALUATION_SUBROUTINE: + return RESOURCE_SUB(res)->name; default: assert(!"support for resource type not implemented"); } @@ -515,7 +478,19 @@ _mesa_program_resource_array_size(struct gl_program_resource *res) case GL_PROGRAM_OUTPUT: return RESOURCE_VAR(res)->data.max_array_access; case GL_UNIFORM: + case GL_VERTEX_SUBROUTINE_UNIFORM: + case GL_GEOMETRY_SUBROUTINE_UNIFORM: + case GL_FRAGMENT_SUBROUTINE_UNIFORM: + case GL_COMPUTE_SUBROUTINE_UNIFORM: + case GL_TESS_CONTROL_SUBROUTINE_UNIFORM: + case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: return RESOURCE_UNI(res)->array_elements; + case GL_VERTEX_SUBROUTINE: + case GL_GEOMETRY_SUBROUTINE: + case GL_FRAGMENT_SUBROUTINE: + case GL_COMPUTE_SUBROUTINE: + case GL_TESS_CONTROL_SUBROUTINE: + case GL_TESS_EVALUATION_SUBROUTINE: case GL_ATOMIC_COUNTER_BUFFER: case GL_UNIFORM_BLOCK: return 0; @@ -525,39 +500,32 @@ _mesa_program_resource_array_size(struct gl_program_resource *res) return 0; } -static int -array_index_of_resource(struct gl_program_resource *res, - const char *name) +/** + * Checks if array subscript is valid and if so sets array_index. + */ +static bool +valid_array_index(const GLchar *name, unsigned *array_index) { - assert(res->Data); + long idx = 0; + const GLchar *out_base_name_end; - switch (res->Type) { - case GL_PROGRAM_INPUT: - case GL_PROGRAM_OUTPUT: - return get_matching_index(RESOURCE_VAR(res), name); - default: - assert(!"support for resource type not implemented"); - return -1; - } + idx = parse_program_resource_name(name, &out_base_name_end); + if (idx < 0) + return false; + + if (array_index) + *array_index = idx; + + return true; } /* Find a program resource with specific name in given interface. */ struct gl_program_resource * _mesa_program_resource_find_name(struct gl_shader_program *shProg, - GLenum programInterface, const char *name) + GLenum programInterface, const char *name, + unsigned *array_index) { - GET_CURRENT_CONTEXT(ctx); - const char *full_name = name; - - /* When context has 'VertexID_is_zero_based' set, gl_VertexID has been - * lowered to gl_VertexIDMESA. 
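The array-index handling threaded through find_name and program_resource_location above means location queries accept either an array's base name or a name with a subscript. A hedged sketch of the expected behavior, assuming the vertex shader declares "layout(location = 1) in vec4 attrs[4];":

#include <epoxy/gl.h>   /* assumed loader header */

static void
query_array_attrib_locations(GLuint prog)
{
   GLint base = glGetAttribLocation(prog, "attrs");     /* expected: 1 */
   GLint elem = glGetAttribLocation(prog, "attrs[2]");  /* expected: 3 */
   GLint oob  = glGetAttribLocation(prog, "attrs[9]");  /* expected: -1, index out of bounds */

   /* Casts only silence unused-variable warnings in this sketch. */
   (void) base; (void) elem; (void) oob;
}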
- */ - if (name && ctx->Const.VertexID_is_zero_based) { - if (strcmp(name, "gl_VertexID") == 0) - full_name = "gl_VertexIDMESA"; - } - struct gl_program_resource *res = shProg->ProgramResourceList; for (unsigned i = 0; i < shProg->NumProgramResourceList; i++, res++) { if (res->Type != programInterface) @@ -567,26 +535,46 @@ _mesa_program_resource_find_name(struct gl_shader_program *shProg, const char *rname = _mesa_program_resource_name(res); unsigned baselen = strlen(rname); - switch (programInterface) { - case GL_TRANSFORM_FEEDBACK_VARYING: - case GL_UNIFORM_BLOCK: - case GL_UNIFORM: - if (strncmp(rname, name, baselen) == 0) { + if (strncmp(rname, name, baselen) == 0) { + switch (programInterface) { + case GL_UNIFORM_BLOCK: /* Basename match, check if array or struct. */ if (name[baselen] == '\0' || name[baselen] == '[' || name[baselen] == '.') { return res; } + break; + case GL_TRANSFORM_FEEDBACK_VARYING: + case GL_UNIFORM: + case GL_VERTEX_SUBROUTINE_UNIFORM: + case GL_GEOMETRY_SUBROUTINE_UNIFORM: + case GL_FRAGMENT_SUBROUTINE_UNIFORM: + case GL_COMPUTE_SUBROUTINE_UNIFORM: + case GL_TESS_CONTROL_SUBROUTINE_UNIFORM: + case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: + case GL_VERTEX_SUBROUTINE: + case GL_GEOMETRY_SUBROUTINE: + case GL_FRAGMENT_SUBROUTINE: + case GL_COMPUTE_SUBROUTINE: + case GL_TESS_CONTROL_SUBROUTINE: + case GL_TESS_EVALUATION_SUBROUTINE: + if (name[baselen] == '.') { + return res; + } + /* fall-through */ + case GL_PROGRAM_INPUT: + case GL_PROGRAM_OUTPUT: + if (name[baselen] == '\0') { + return res; + } else if (name[baselen] == '[' && + valid_array_index(name, array_index)) { + return res; + } + break; + default: + assert(!"not implemented for given interface"); } - break; - case GL_PROGRAM_INPUT: - case GL_PROGRAM_OUTPUT: - if (array_index_of_resource(res, full_name) >= 0) - return res; - break; - default: - assert(!"not implemented for given interface"); } } return NULL; @@ -651,6 +639,18 @@ _mesa_program_resource_find_index(struct gl_shader_program *shProg, case GL_PROGRAM_INPUT: case GL_PROGRAM_OUTPUT: case GL_UNIFORM: + case GL_VERTEX_SUBROUTINE_UNIFORM: + case GL_GEOMETRY_SUBROUTINE_UNIFORM: + case GL_FRAGMENT_SUBROUTINE_UNIFORM: + case GL_COMPUTE_SUBROUTINE_UNIFORM: + case GL_TESS_CONTROL_SUBROUTINE_UNIFORM: + case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: + case GL_VERTEX_SUBROUTINE: + case GL_GEOMETRY_SUBROUTINE: + case GL_FRAGMENT_SUBROUTINE: + case GL_COMPUTE_SUBROUTINE: + case GL_TESS_CONTROL_SUBROUTINE: + case GL_TESS_EVALUATION_SUBROUTINE: if (++idx == (int) index) return res; break; @@ -719,6 +719,12 @@ _mesa_get_program_resource_name(struct gl_shader_program *shProg, bool add_index = !(((programInterface == GL_PROGRAM_INPUT) && res->StageReferences & (1 << MESA_SHADER_GEOMETRY))); + /* Transform feedback varyings have array index already appended + * in their names. 
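Since transform feedback varying names already carry their array index, enumerating them with the resource-name query returns the full strings directly. An illustrative sketch, assuming 'prog' was linked with glTransformFeedbackVaryings:

#include <stdio.h>
#include <epoxy/gl.h>   /* assumed loader header */

static void
list_xfb_varyings(GLuint prog)
{
   GLint count;
   GLuint i;
   char name[128];

   glGetProgramInterfaceiv(prog, GL_TRANSFORM_FEEDBACK_VARYING,
                           GL_ACTIVE_RESOURCES, &count);
   for (i = 0; i < (GLuint) count; i++) {
      /* Names already include any array index, e.g. "foo[1]". */
      glGetProgramResourceName(prog, GL_TRANSFORM_FEEDBACK_VARYING, i,
                               sizeof(name), NULL, name);
      printf("xfb varying %u: %s\n", i, name);
   }
}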
+ */ + if (programInterface == GL_TRANSFORM_FEEDBACK_VARYING) + add_index = false; + if (add_index && _mesa_program_resource_array_size(res)) { int i; @@ -736,17 +742,9 @@ _mesa_get_program_resource_name(struct gl_shader_program *shProg, static GLint program_resource_location(struct gl_shader_program *shProg, - struct gl_program_resource *res, const char *name) + struct gl_program_resource *res, const char *name, + unsigned array_index) { - unsigned index, offset; - int array_index = -1; - - if (res->Type == GL_PROGRAM_INPUT || res->Type == GL_PROGRAM_OUTPUT) { - array_index = array_index_of_resource(res, name); - if (array_index < 0) - return -1; - } - /* Built-in locations should report GL_INVALID_INDEX. */ if (is_gl_identifier(name)) return GL_INVALID_INDEX; @@ -757,13 +755,22 @@ program_resource_location(struct gl_shader_program *shProg, */ switch (res->Type) { case GL_PROGRAM_INPUT: + /* If the input is an array, fail if the index is out of bounds. */ + if (array_index > 0 + && array_index >= RESOURCE_VAR(res)->type->length) { + return -1; + } return RESOURCE_VAR(res)->data.location + array_index - VERT_ATTRIB_GENERIC0; case GL_PROGRAM_OUTPUT: + /* If the output is an array, fail if the index is out of bounds. */ + if (array_index > 0 + && array_index >= RESOURCE_VAR(res)->type->length) { + return -1; + } return RESOURCE_VAR(res)->data.location + array_index - FRAG_RESULT_DATA0; case GL_UNIFORM: - index = _mesa_get_uniform_location(shProg, name, &offset); - - if (index == GL_INVALID_INDEX) + /* If the uniform is built-in, fail. */ + if (RESOURCE_UNI(res)->builtin) return -1; /* From the GL_ARB_uniform_buffer_object spec: @@ -777,9 +784,21 @@ program_resource_location(struct gl_shader_program *shProg, RESOURCE_UNI(res)->atomic_buffer_index != -1) return -1; - /* location in remap table + array element offset */ - return RESOURCE_UNI(res)->remap_location + offset; + /* fallthrough */ + case GL_VERTEX_SUBROUTINE_UNIFORM: + case GL_GEOMETRY_SUBROUTINE_UNIFORM: + case GL_FRAGMENT_SUBROUTINE_UNIFORM: + case GL_COMPUTE_SUBROUTINE_UNIFORM: + case GL_TESS_CONTROL_SUBROUTINE_UNIFORM: + case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: + /* If the uniform is an array, fail if the index is out of bounds. */ + if (array_index > 0 + && array_index >= RESOURCE_UNI(res)->array_elements) { + return -1; + } + /* location in remap table + array element offset */ + return RESOURCE_UNI(res)->remap_location + array_index; default: return -1; } @@ -787,22 +806,22 @@ program_resource_location(struct gl_shader_program *shProg, /** * Function implements following location queries: - * glGetAttribLocation - * glGetFragDataLocation * glGetUniformLocation */ GLint _mesa_program_resource_location(struct gl_shader_program *shProg, GLenum programInterface, const char *name) { + unsigned array_index = 0; struct gl_program_resource *res = - _mesa_program_resource_find_name(shProg, programInterface, name); + _mesa_program_resource_find_name(shProg, programInterface, name, + &array_index); /* Resource not found. 
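The subroutine uniform locations computed from the remap table above are what an application feeds back into glUniformSubroutinesuiv. A sketch under stated assumptions: the fragment shader has exactly one subroutine uniform, named "shade", with a compatible function "shade_red", and 'prog' is currently in use via glUseProgram.

#include <epoxy/gl.h>   /* assumed loader header */

static void
select_fragment_subroutine(GLuint prog)
{
   GLint loc = glGetSubroutineUniformLocation(prog, GL_FRAGMENT_SHADER, "shade");
   GLuint index = glGetSubroutineIndex(prog, GL_FRAGMENT_SHADER, "shade_red");
   GLuint selection[1];

   /* With a single subroutine uniform, loc is expected to be 0. */
   selection[loc] = index;
   glUniformSubroutinesuiv(GL_FRAGMENT_SHADER, 1, selection);
}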
*/ if (!res) return -1; - return program_resource_location(shProg, res, name); + return program_resource_location(shProg, res, name, array_index); } /** @@ -814,7 +833,7 @@ _mesa_program_resource_location_index(struct gl_shader_program *shProg, GLenum programInterface, const char *name) { struct gl_program_resource *res = - _mesa_program_resource_find_name(shProg, programInterface, name); + _mesa_program_resource_find_name(shProg, programInterface, name, NULL); /* Non-existent variable or resource is not referenced by fragment stage. */ if (!res || !(res->StageReferences & (1 << MESA_SHADER_FRAGMENT))) @@ -829,6 +848,10 @@ stage_from_enum(GLenum ref) switch (ref) { case GL_REFERENCED_BY_VERTEX_SHADER: return MESA_SHADER_VERTEX; + case GL_REFERENCED_BY_TESS_CONTROL_SHADER: + return MESA_SHADER_TESS_CTRL; + case GL_REFERENCED_BY_TESS_EVALUATION_SHADER: + return MESA_SHADER_TESS_EVAL; case GL_REFERENCED_BY_GEOMETRY_SHADER: return MESA_SHADER_GEOMETRY; case GL_REFERENCED_BY_FRAGMENT_SHADER: @@ -886,7 +909,8 @@ get_buffer_property(struct gl_shader_program *shProg, for (unsigned i = 0; i < RESOURCE_UBO(res)->NumUniforms; i++) { const char *iname = RESOURCE_UBO(res)->Uniforms[i].IndexName; struct gl_program_resource *uni = - _mesa_program_resource_find_name(shProg, GL_UNIFORM, iname); + _mesa_program_resource_find_name(shProg, GL_UNIFORM, iname, + NULL); if (!uni) continue; (*val)++; @@ -896,7 +920,8 @@ get_buffer_property(struct gl_shader_program *shProg, for (unsigned i = 0; i < RESOURCE_UBO(res)->NumUniforms; i++) { const char *iname = RESOURCE_UBO(res)->Uniforms[i].IndexName; struct gl_program_resource *uni = - _mesa_program_resource_find_name(shProg, GL_UNIFORM, iname); + _mesa_program_resource_find_name(shProg, GL_UNIFORM, iname, + NULL); if (!uni) continue; *val++ = @@ -925,8 +950,8 @@ get_buffer_property(struct gl_shader_program *shProg, invalid_operation: _mesa_error(ctx, GL_INVALID_OPERATION, "%s(%s prop %s)", caller, - _mesa_lookup_enum_by_nr(res->Type), - _mesa_lookup_enum_by_nr(prop)); + _mesa_enum_to_string(res->Type), + _mesa_enum_to_string(prop)); return 0; } @@ -944,11 +969,17 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg, switch(prop) { case GL_NAME_LENGTH: - if (res->Type == GL_ATOMIC_COUNTER_BUFFER) + switch (res->Type) { + case GL_ATOMIC_COUNTER_BUFFER: goto invalid_operation; - /* Base name +3 if array '[0]' + terminator. */ - *val = strlen(_mesa_program_resource_name(res)) + - (_mesa_program_resource_array_size(res) > 0 ? 3 : 0) + 1; + case GL_TRANSFORM_FEEDBACK_VARYING: + *val = strlen(_mesa_program_resource_name(res)) + 1; + break; + default: + /* Base name +3 if array '[0]' + terminator. */ + *val = strlen(_mesa_program_resource_name(res)) + + (_mesa_program_resource_array_size(res) > 0 ? 
3 : 0) + 1; + } return 1; case GL_TYPE: switch (res->Type) { @@ -1014,6 +1045,8 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg, goto invalid_enum; /* fallthrough */ case GL_REFERENCED_BY_VERTEX_SHADER: + case GL_REFERENCED_BY_TESS_CONTROL_SHADER: + case GL_REFERENCED_BY_TESS_EVALUATION_SHADER: case GL_REFERENCED_BY_GEOMETRY_SHADER: case GL_REFERENCED_BY_FRAGMENT_SHADER: switch (res->Type) { @@ -1034,7 +1067,8 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg, case GL_PROGRAM_INPUT: case GL_PROGRAM_OUTPUT: *val = program_resource_location(shProg, res, - _mesa_program_resource_name(res)); + _mesa_program_resource_name(res), + 0); return 1; default: goto invalid_operation; @@ -1045,10 +1079,54 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg, *val = RESOURCE_VAR(res)->data.index; return 1; + case GL_NUM_COMPATIBLE_SUBROUTINES: + if (res->Type != GL_VERTEX_SUBROUTINE_UNIFORM && + res->Type != GL_FRAGMENT_SUBROUTINE_UNIFORM && + res->Type != GL_GEOMETRY_SUBROUTINE_UNIFORM && + res->Type != GL_COMPUTE_SUBROUTINE_UNIFORM && + res->Type != GL_TESS_CONTROL_SUBROUTINE_UNIFORM && + res->Type != GL_TESS_EVALUATION_SUBROUTINE_UNIFORM) + goto invalid_operation; + *val = RESOURCE_UNI(res)->num_compatible_subroutines; + return 1; + case GL_COMPATIBLE_SUBROUTINES: { + const struct gl_uniform_storage *uni; + struct gl_shader *sh; + unsigned count, i; + int j; + + if (res->Type != GL_VERTEX_SUBROUTINE_UNIFORM && + res->Type != GL_FRAGMENT_SUBROUTINE_UNIFORM && + res->Type != GL_GEOMETRY_SUBROUTINE_UNIFORM && + res->Type != GL_COMPUTE_SUBROUTINE_UNIFORM && + res->Type != GL_TESS_CONTROL_SUBROUTINE_UNIFORM && + res->Type != GL_TESS_EVALUATION_SUBROUTINE_UNIFORM) + goto invalid_operation; + uni = RESOURCE_UNI(res); + + sh = shProg->_LinkedShaders[_mesa_shader_stage_from_subroutine_uniform(res->Type)]; + count = 0; + for (i = 0; i < sh->NumSubroutineFunctions; i++) { + struct gl_subroutine_function *fn = &sh->SubroutineFunctions[i]; + for (j = 0; j < fn->num_compat_types; j++) { + if (fn->types[j] == uni->type) { + val[count++] = i; + break; + } + } + } + return count; + } /* GL_ARB_tessellation_shader */ case GL_IS_PER_PATCH: - case GL_REFERENCED_BY_TESS_CONTROL_SHADER: - case GL_REFERENCED_BY_TESS_EVALUATION_SHADER: + switch (res->Type) { + case GL_PROGRAM_INPUT: + case GL_PROGRAM_OUTPUT: + *val = RESOURCE_VAR(res)->data.patch; + return 1; + default: + goto invalid_operation; + } default: goto invalid_enum; } @@ -1057,14 +1135,14 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg, invalid_enum: _mesa_error(ctx, GL_INVALID_ENUM, "%s(%s prop %s)", caller, - _mesa_lookup_enum_by_nr(res->Type), - _mesa_lookup_enum_by_nr(prop)); + _mesa_enum_to_string(res->Type), + _mesa_enum_to_string(prop)); return 0; invalid_operation: _mesa_error(ctx, GL_INVALID_OPERATION, "%s(%s prop %s)", caller, - _mesa_lookup_enum_by_nr(res->Type), - _mesa_lookup_enum_by_nr(prop)); + _mesa_enum_to_string(res->Type), + _mesa_enum_to_string(prop)); return 0; } @@ -1086,7 +1164,7 @@ _mesa_get_program_resourceiv(struct gl_shader_program *shProg, if (!res || bufSize < 0) { _mesa_error(ctx, GL_INVALID_VALUE, "glGetProgramResourceiv(%s index %d bufSize %d)", - _mesa_lookup_enum_by_nr(programInterface), index, bufSize); + _mesa_enum_to_string(programInterface), index, bufSize); return; } diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c index a4296adf799..f9a7d130f9c 100644 --- a/src/mesa/main/shaderapi.c +++ b/src/mesa/main/shaderapi.c @@ -110,6 +110,7 @@ 
_mesa_init_shader_state(struct gl_context *ctx) */ struct gl_shader_compiler_options options; gl_shader_stage sh; + int i; memset(&options, 0, sizeof(options)); options.MaxUnrollIterations = 32; @@ -126,6 +127,12 @@ _mesa_init_shader_state(struct gl_context *ctx) /* Extended for ARB_separate_shader_objects */ ctx->Shader.RefCount = 1; mtx_init(&ctx->Shader.Mutex, mtx_plain); + + ctx->TessCtrlProgram.patch_vertices = 3; + for (i = 0; i < 4; ++i) + ctx->TessCtrlProgram.patch_default_outer_level[i] = 1.0; + for (i = 0; i < 2; ++i) + ctx->TessCtrlProgram.patch_default_inner_level[i] = 1.0; } @@ -199,6 +206,9 @@ _mesa_validate_shader_target(const struct gl_context *ctx, GLenum type) return ctx == NULL || ctx->Extensions.ARB_vertex_shader; case GL_GEOMETRY_SHADER_ARB: return ctx == NULL || _mesa_has_geometry_shaders(ctx); + case GL_TESS_CONTROL_SHADER: + case GL_TESS_EVALUATION_SHADER: + return ctx == NULL || _mesa_has_tessellation(ctx); case GL_COMPUTE_SHADER: return ctx == NULL || ctx->Extensions.ARB_compute_shader; default: @@ -415,6 +425,8 @@ detach_shader(struct gl_context *ctx, GLuint program, GLuint shader) /* sanity check - make sure the new list's entries are sensible */ for (j = 0; j < shProg->NumShaders; j++) { assert(shProg->Shaders[j]->Type == GL_VERTEX_SHADER || + shProg->Shaders[j]->Type == GL_TESS_CONTROL_SHADER || + shProg->Shaders[j]->Type == GL_TESS_EVALUATION_SHADER || shProg->Shaders[j]->Type == GL_GEOMETRY_SHADER || shProg->Shaders[j]->Type == GL_FRAGMENT_SHADER); assert(shProg->Shaders[j]->RefCount > 0); @@ -511,6 +523,57 @@ check_gs_query(struct gl_context *ctx, const struct gl_shader_program *shProg) /** + * Check if a tessellation control shader query is valid at this time. + * If not, report an error and return false. + * + * From GL 4.0 section 6.1.12 (Shader and Program Queries): + * + * "If TESS_CONTROL_OUTPUT_VERTICES is queried for a program which has + * not been linked successfully, or which does not contain objects to + * form a tessellation control shader, then an INVALID_OPERATION error is + * generated." + */ +static bool +check_tcs_query(struct gl_context *ctx, const struct gl_shader_program *shProg) +{ + if (shProg->LinkStatus && + shProg->_LinkedShaders[MESA_SHADER_TESS_CTRL] != NULL) { + return true; + } + + _mesa_error(ctx, GL_INVALID_OPERATION, + "glGetProgramv(linked tessellation control shader required)"); + return false; +} + + +/** + * Check if a tessellation evaluation shader query is valid at this time. + * If not, report an error and return false. + * + * From GL 4.0 section 6.1.12 (Shader and Program Queries): + * + * "If any of the pname values in this paragraph are queried for a program + * which has not been linked successfully, or which does not contain + * objects to form a tessellation evaluation shader, then an + * INVALID_OPERATION error is generated." + * + */ +static bool +check_tes_query(struct gl_context *ctx, const struct gl_shader_program *shProg) +{ + if (shProg->LinkStatus && + shProg->_LinkedShaders[MESA_SHADER_TESS_EVAL] != NULL) { + return true; + } + + _mesa_error(ctx, GL_INVALID_OPERATION, "glGetProgramv(linked tessellation " + "evaluation shader required)"); + return false; +} + + +/** * glGetProgramiv() - get shader program state. * Note that this is for GLSL shader programs, not ARB vertex/fragment * programs (see glGetProgramivARB). 
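For context on the new tessellation program state and the check_tcs_query()/check_tes_query() guards above, here is a minimal application-side sketch of the queries they protect. It assumes a current GL 4.0 (or ARB_tessellation_shader) context compiled with GL_GLEXT_PROTOTYPES; the program handle prog and the helper name are illustrative, not part of this patch.

#define GL_GLEXT_PROTOTYPES 1
#include <stdio.h>
#include <GL/gl.h>
#include <GL/glext.h>

static void
print_tess_program_state(GLuint prog)
{
   GLint out_verts = 0, gen_mode = 0, spacing = 0;

   /* Only valid once the program is linked and contains a tessellation
    * control shader; otherwise Mesa now raises GL_INVALID_OPERATION
    * (check_tcs_query). */
   glGetProgramiv(prog, GL_TESS_CONTROL_OUTPUT_VERTICES, &out_verts);

   /* Likewise gated on a linked tessellation evaluation shader
    * (check_tes_query). */
   glGetProgramiv(prog, GL_TESS_GEN_MODE, &gen_mode);
   glGetProgramiv(prog, GL_TESS_GEN_SPACING, &spacing);

   printf("TCS vertices out: %d, gen mode: 0x%x, spacing: 0x%x\n",
          out_verts, gen_mode, spacing);

   /* The context default installed in _mesa_init_shader_state() is 3
    * vertices per patch; an application overrides it before drawing
    * GL_PATCHES. */
   glPatchParameteri(GL_PATCH_VERTICES, 4);
}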
@@ -533,6 +596,7 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname, * and GL 3.2) are available in this context */ const bool has_core_gs = _mesa_has_geometry_shaders(ctx); + const bool has_tess = _mesa_has_tessellation(ctx); /* Are uniform buffer objects available in this context? */ @@ -711,12 +775,44 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname, case GL_PROGRAM_SEPARABLE: *params = shProg->SeparateShader; return; + + /* ARB_tessellation_shader */ + case GL_TESS_CONTROL_OUTPUT_VERTICES: + if (!has_tess) + break; + if (check_tcs_query(ctx, shProg)) + *params = shProg->TessCtrl.VerticesOut; + return; + case GL_TESS_GEN_MODE: + if (!has_tess) + break; + if (check_tes_query(ctx, shProg)) + *params = shProg->TessEval.PrimitiveMode; + return; + case GL_TESS_GEN_SPACING: + if (!has_tess) + break; + if (check_tes_query(ctx, shProg)) + *params = shProg->TessEval.Spacing; + return; + case GL_TESS_GEN_VERTEX_ORDER: + if (!has_tess) + break; + if (check_tes_query(ctx, shProg)) + *params = shProg->TessEval.VertexOrder; + return; + case GL_TESS_GEN_POINT_MODE: + if (!has_tess) + break; + if (check_tes_query(ctx, shProg)) + *params = shProg->TessEval.PointMode; + return; default: break; } _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramiv(pname=%s)", - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); } @@ -992,6 +1088,12 @@ print_shader_info(const struct gl_shader_program *shProg) if (shProg->_LinkedShaders[MESA_SHADER_GEOMETRY]) printf(" geom prog %u\n", shProg->_LinkedShaders[MESA_SHADER_GEOMETRY]->Program->Id); + if (shProg->_LinkedShaders[MESA_SHADER_TESS_CTRL]) + printf(" tesc prog %u\n", + shProg->_LinkedShaders[MESA_SHADER_TESS_CTRL]->Program->Id); + if (shProg->_LinkedShaders[MESA_SHADER_TESS_EVAL]) + printf(" tese prog %u\n", + shProg->_LinkedShaders[MESA_SHADER_TESS_EVAL]->Program->Id); } @@ -1037,11 +1139,9 @@ use_shader_program(struct gl_context *ctx, gl_shader_stage stage, */ switch (stage) { case MESA_SHADER_VERTEX: - /* Empty for now. */ - break; + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_TESS_EVAL: case MESA_SHADER_GEOMETRY: - /* Empty for now. */ - break; case MESA_SHADER_COMPUTE: /* Empty for now. */ break; @@ -1071,6 +1171,7 @@ _mesa_use_program(struct gl_context *ctx, struct gl_shader_program *shProg) use_shader_program(ctx, i, shProg, &ctx->Shader); _mesa_active_program(ctx, shProg, "glUseProgram"); + _mesa_shader_program_init_subroutine_defaults(shProg); if (ctx->Driver.UseProgram) ctx->Driver.UseProgram(ctx, shProg); } @@ -1172,7 +1273,7 @@ _mesa_CreateShader(GLenum type) { GET_CURRENT_CONTEXT(ctx); if (MESA_VERBOSE & VERBOSE_API) - _mesa_debug(ctx, "glCreateShader %s\n", _mesa_lookup_enum_by_nr(type)); + _mesa_debug(ctx, "glCreateShader %s\n", _mesa_enum_to_string(type)); return create_shader(ctx, type); } @@ -1331,7 +1432,7 @@ void GLAPIENTRY _mesa_GetObjectParameterfvARB(GLhandleARB object, GLenum pname, GLfloat *params) { - GLint iparams[1]; /* XXX is one element enough? */ + GLint iparams[1] = {0}; /* XXX is one element enough? 
*/ _mesa_GetObjectParameterivARB(object, pname, iparams); params[0] = (GLfloat) iparams[0]; } @@ -1460,7 +1561,7 @@ read_shader(const char *fname) */ void GLAPIENTRY _mesa_ShaderSource(GLhandleARB shaderObj, GLsizei count, - const GLcharARB * const * string, const GLint * length) + const GLcharARB * const * string, const GLint * length) { GET_CURRENT_CONTEXT(ctx); GLint *offsets; @@ -1692,12 +1793,23 @@ _mesa_ShaderBinary(GLint n, const GLuint* shaders, GLenum binaryformat, const void* binary, GLint length) { GET_CURRENT_CONTEXT(ctx); - (void) n; (void) shaders; (void) binaryformat; (void) binary; - (void) length; - _mesa_error(ctx, GL_INVALID_OPERATION, "glShaderBinary"); + + /* Page 68, section 7.2 "Shader Binaries" of the OpenGL ES 3.1, and + * page 88 of the OpenGL 4.5 specs state: + * + * "An INVALID_VALUE error is generated if count or length is negative. + * An INVALID_ENUM error is generated if binaryformat is not a supported + * format returned in SHADER_BINARY_FORMATS." + */ + if (n < 0 || length < 0) { + _mesa_error(ctx, GL_INVALID_VALUE, "glShaderBinary(count or length < 0)"); + return; + } + + _mesa_error(ctx, GL_INVALID_ENUM, "glShaderBinary(format)"); } @@ -1857,7 +1969,7 @@ _mesa_ProgramParameteri(GLuint program, GLenum pname, GLint value) default: _mesa_error(ctx, GL_INVALID_ENUM, "glProgramParameteri(pname=%s)", - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); return; } @@ -1865,7 +1977,7 @@ invalid_value: _mesa_error(ctx, GL_INVALID_VALUE, "glProgramParameteri(pname=%s, value=%d): " "value must be 0 or 1.", - _mesa_lookup_enum_by_nr(pname), + _mesa_enum_to_string(pname), value); } @@ -1885,7 +1997,8 @@ _mesa_use_shader_program(struct gl_context *ctx, GLenum type, static GLuint _mesa_create_shader_program(struct gl_context* ctx, GLboolean separate, - GLenum type, GLsizei count, const GLchar* const *strings) + GLenum type, GLsizei count, + const GLchar* const *strings) { const GLuint shader = create_shader(ctx, type); GLuint program = 0; @@ -1920,8 +2033,8 @@ _mesa_create_shader_program(struct gl_context* ctx, GLboolean separate, } #endif } - - ralloc_strcat(&shProg->InfoLog, sh->InfoLog); + if (sh->InfoLog) + ralloc_strcat(&shProg->InfoLog, sh->InfoLog); } delete_shader(ctx, shader); @@ -1944,6 +2057,22 @@ _mesa_copy_linked_program_data(gl_shader_stage type, case MESA_SHADER_VERTEX: dst->UsesClipDistanceOut = src->Vert.UsesClipDistance; break; + case MESA_SHADER_TESS_CTRL: { + struct gl_tess_ctrl_program *dst_tcp = + (struct gl_tess_ctrl_program *) dst; + dst_tcp->VerticesOut = src->TessCtrl.VerticesOut; + break; + } + case MESA_SHADER_TESS_EVAL: { + struct gl_tess_eval_program *dst_tep = + (struct gl_tess_eval_program *) dst; + dst_tep->PrimitiveMode = src->TessEval.PrimitiveMode; + dst_tep->Spacing = src->TessEval.Spacing; + dst_tep->VertexOrder = src->TessEval.VertexOrder; + dst_tep->PointMode = src->TessEval.PointMode; + dst->UsesClipDistanceOut = src->TessEval.UsesClipDistance; + break; + } case MESA_SHADER_GEOMETRY: { struct gl_geometry_program *dst_gp = (struct gl_geometry_program *) dst; dst_gp->VerticesIn = src->Geom.VerticesIn; @@ -1954,20 +2083,20 @@ _mesa_copy_linked_program_data(gl_shader_stage type, dst->UsesClipDistanceOut = src->Geom.UsesClipDistance; dst_gp->UsesEndPrimitive = src->Geom.UsesEndPrimitive; dst_gp->UsesStreams = src->Geom.UsesStreams; - } break; + } case MESA_SHADER_FRAGMENT: { struct gl_fragment_program *dst_fp = (struct gl_fragment_program *) dst; dst_fp->FragDepthLayout = src->FragDepthLayout; - } break; + } case 
MESA_SHADER_COMPUTE: { struct gl_compute_program *dst_cp = (struct gl_compute_program *) dst; int i; for (i = 0; i < 3; i++) dst_cp->LocalSize[i] = src->Comp.LocalSize[i]; - } break; + } default: break; } @@ -1984,3 +2113,568 @@ _mesa_CreateShaderProgramv(GLenum type, GLsizei count, return _mesa_create_shader_program(ctx, GL_TRUE, type, count, strings); } + + +/** + * For GL_ARB_tessellation_shader + */ +extern void GLAPIENTRY +_mesa_PatchParameteri(GLenum pname, GLint value) +{ + GET_CURRENT_CONTEXT(ctx); + + if (!_mesa_has_tessellation(ctx)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "glPatchParameteri"); + return; + } + + if (pname != GL_PATCH_VERTICES) { + _mesa_error(ctx, GL_INVALID_ENUM, "glPatchParameteri"); + return; + } + + if (value <= 0 || value > ctx->Const.MaxPatchVertices) { + _mesa_error(ctx, GL_INVALID_VALUE, "glPatchParameteri"); + return; + } + + ctx->TessCtrlProgram.patch_vertices = value; +} + + +extern void GLAPIENTRY +_mesa_PatchParameterfv(GLenum pname, const GLfloat *values) +{ + GET_CURRENT_CONTEXT(ctx); + + if (!_mesa_has_tessellation(ctx)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "glPatchParameterfv"); + return; + } + + switch(pname) { + case GL_PATCH_DEFAULT_OUTER_LEVEL: + FLUSH_VERTICES(ctx, 0); + memcpy(ctx->TessCtrlProgram.patch_default_outer_level, values, + 4 * sizeof(GLfloat)); + ctx->NewDriverState |= ctx->DriverFlags.NewDefaultTessLevels; + return; + case GL_PATCH_DEFAULT_INNER_LEVEL: + FLUSH_VERTICES(ctx, 0); + memcpy(ctx->TessCtrlProgram.patch_default_inner_level, values, + 2 * sizeof(GLfloat)); + ctx->NewDriverState |= ctx->DriverFlags.NewDefaultTessLevels; + return; + default: + _mesa_error(ctx, GL_INVALID_ENUM, "glPatchParameterfv"); + return; + } +} + +/** + * ARB_shader_subroutine + */ +GLint GLAPIENTRY +_mesa_GetSubroutineUniformLocation(GLuint program, GLenum shadertype, + const GLchar *name) +{ + GET_CURRENT_CONTEXT(ctx); + const char *api_name = "glGetSubroutineUniformLocation"; + struct gl_shader_program *shProg; + GLenum resource_type; + gl_shader_stage stage; + + if (!_mesa_has_shader_subroutine(ctx)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return -1; + } + + if (!_mesa_validate_shader_target(ctx, shadertype)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return -1; + } + + shProg = _mesa_lookup_shader_program_err(ctx, program, api_name); + if (!shProg) + return -1; + + stage = _mesa_shader_enum_to_shader_stage(shadertype); + if (!shProg->_LinkedShaders[stage]) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return -1; + } + + resource_type = _mesa_shader_stage_to_subroutine_uniform(stage); + return _mesa_program_resource_location(shProg, resource_type, name); +} + +GLuint GLAPIENTRY +_mesa_GetSubroutineIndex(GLuint program, GLenum shadertype, + const GLchar *name) +{ + GET_CURRENT_CONTEXT(ctx); + const char *api_name = "glGetSubroutineIndex"; + struct gl_shader_program *shProg; + struct gl_program_resource *res; + GLenum resource_type; + gl_shader_stage stage; + + if (!_mesa_has_shader_subroutine(ctx)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return -1; + } + + if (!_mesa_validate_shader_target(ctx, shadertype)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return -1; + } + + shProg = _mesa_lookup_shader_program_err(ctx, program, api_name); + if (!shProg) + return -1; + + stage = _mesa_shader_enum_to_shader_stage(shadertype); + if (!shProg->_LinkedShaders[stage]) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return -1; + } + + 
resource_type = _mesa_shader_stage_to_subroutine(stage); + res = _mesa_program_resource_find_name(shProg, resource_type, name, NULL); + if (!res) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return -1; + } + + return _mesa_program_resource_index(shProg, res); +} + + +GLvoid GLAPIENTRY +_mesa_GetActiveSubroutineUniformiv(GLuint program, GLenum shadertype, + GLuint index, GLenum pname, GLint *values) +{ + GET_CURRENT_CONTEXT(ctx); + const char *api_name = "glGetActiveSubroutineUniformiv"; + struct gl_shader_program *shProg; + struct gl_shader *sh; + gl_shader_stage stage; + struct gl_program_resource *res; + const struct gl_uniform_storage *uni; + GLenum resource_type; + int count, i, j; + + if (!_mesa_has_shader_subroutine(ctx)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + if (!_mesa_validate_shader_target(ctx, shadertype)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + shProg = _mesa_lookup_shader_program_err(ctx, program, api_name); + if (!shProg) + return; + + stage = _mesa_shader_enum_to_shader_stage(shadertype); + resource_type = _mesa_shader_stage_to_subroutine_uniform(stage); + + sh = shProg->_LinkedShaders[stage]; + if (!sh) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + switch (pname) { + case GL_NUM_COMPATIBLE_SUBROUTINES: { + res = _mesa_program_resource_find_index(shProg, resource_type, index); + if (res) { + uni = res->Data; + values[0] = uni->num_compatible_subroutines; + } + break; + } + case GL_COMPATIBLE_SUBROUTINES: { + res = _mesa_program_resource_find_index(shProg, resource_type, index); + if (res) { + uni = res->Data; + count = 0; + for (i = 0; i < sh->NumSubroutineFunctions; i++) { + struct gl_subroutine_function *fn = &sh->SubroutineFunctions[i]; + for (j = 0; j < fn->num_compat_types; j++) { + if (fn->types[j] == uni->type) { + values[count++] = i; + break; + } + } + } + } + break; + } + case GL_UNIFORM_SIZE: + res = _mesa_program_resource_find_index(shProg, resource_type, index); + if (res) { + uni = res->Data; + values[0] = uni->array_elements ? uni->array_elements : 1; + } + break; + case GL_UNIFORM_NAME_LENGTH: + res = _mesa_program_resource_find_index(shProg, resource_type, index); + if (res) { + values[0] = strlen(_mesa_program_resource_name(res)) + 1 + + ((_mesa_program_resource_array_size(res) != 0) ? 
3 : 0);; + } + break; + default: + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } +} + + +GLvoid GLAPIENTRY +_mesa_GetActiveSubroutineUniformName(GLuint program, GLenum shadertype, + GLuint index, GLsizei bufsize, + GLsizei *length, GLchar *name) +{ + GET_CURRENT_CONTEXT(ctx); + const char *api_name = "glGetActiveSubroutineUniformName"; + struct gl_shader_program *shProg; + GLenum resource_type; + gl_shader_stage stage; + + if (!_mesa_has_shader_subroutine(ctx)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + if (!_mesa_validate_shader_target(ctx, shadertype)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + shProg = _mesa_lookup_shader_program_err(ctx, program, api_name); + if (!shProg) + return; + + stage = _mesa_shader_enum_to_shader_stage(shadertype); + if (!shProg->_LinkedShaders[stage]) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + resource_type = _mesa_shader_stage_to_subroutine_uniform(stage); + /* get program resource name */ + _mesa_get_program_resource_name(shProg, resource_type, + index, bufsize, + length, name, api_name); +} + + +GLvoid GLAPIENTRY +_mesa_GetActiveSubroutineName(GLuint program, GLenum shadertype, + GLuint index, GLsizei bufsize, + GLsizei *length, GLchar *name) +{ + GET_CURRENT_CONTEXT(ctx); + const char *api_name = "glGetActiveSubroutineName"; + struct gl_shader_program *shProg; + GLenum resource_type; + gl_shader_stage stage; + + if (!_mesa_has_shader_subroutine(ctx)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + if (!_mesa_validate_shader_target(ctx, shadertype)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + shProg = _mesa_lookup_shader_program_err(ctx, program, api_name); + if (!shProg) + return; + + stage = _mesa_shader_enum_to_shader_stage(shadertype); + if (!shProg->_LinkedShaders[stage]) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + resource_type = _mesa_shader_stage_to_subroutine(stage); + _mesa_get_program_resource_name(shProg, resource_type, + index, bufsize, + length, name, api_name); +} + + +GLvoid GLAPIENTRY +_mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count, + const GLuint *indices) +{ + GET_CURRENT_CONTEXT(ctx); + const char *api_name = "glUniformSubroutinesuiv"; + struct gl_shader_program *shProg; + struct gl_shader *sh; + gl_shader_stage stage; + int i; + + if (!_mesa_has_shader_subroutine(ctx)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + if (!_mesa_validate_shader_target(ctx, shadertype)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + stage = _mesa_shader_enum_to_shader_stage(shadertype); + shProg = ctx->_Shader->CurrentProgram[stage]; + if (!shProg) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + sh = shProg->_LinkedShaders[stage]; + if (!sh) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + if (count != sh->NumSubroutineUniformRemapTable) { + _mesa_error(ctx, GL_INVALID_VALUE, "%s", api_name); + return; + } + + i = 0; + do { + struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[i]; + int uni_count = uni->array_elements ? 
uni->array_elements : 1; + int j, k; + + for (j = i; j < i + uni_count; j++) { + struct gl_subroutine_function *subfn; + if (indices[j] >= sh->NumSubroutineFunctions) { + _mesa_error(ctx, GL_INVALID_VALUE, "%s", api_name); + return; + } + + subfn = &sh->SubroutineFunctions[indices[j]]; + for (k = 0; k < subfn->num_compat_types; k++) { + if (subfn->types[k] == uni->type) + break; + } + if (k == subfn->num_compat_types) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + } + i += uni_count; + } while(i < count); + + FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS); + i = 0; + do { + struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[i]; + int uni_count = uni->array_elements ? uni->array_elements : 1; + + memcpy(&uni->storage[0], &indices[i], + sizeof(GLuint) * uni_count); + + uni->initialized = true; + _mesa_propagate_uniforms_to_driver_storage(uni, 0, uni_count); + i += uni_count; + } while(i < count); +} + + +GLvoid GLAPIENTRY +_mesa_GetUniformSubroutineuiv(GLenum shadertype, GLint location, + GLuint *params) +{ + GET_CURRENT_CONTEXT(ctx); + const char *api_name = "glGetUniformSubroutineuiv"; + struct gl_shader_program *shProg; + struct gl_shader *sh; + gl_shader_stage stage; + + if (!_mesa_has_shader_subroutine(ctx)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + if (!_mesa_validate_shader_target(ctx, shadertype)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + stage = _mesa_shader_enum_to_shader_stage(shadertype); + shProg = ctx->_Shader->CurrentProgram[stage]; + if (!shProg) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + sh = shProg->_LinkedShaders[stage]; + if (!sh) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + if (location >= sh->NumSubroutineUniformRemapTable) { + _mesa_error(ctx, GL_INVALID_VALUE, "%s", api_name); + return; + } + + { + struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[location]; + int offset = location - uni->subroutine[stage].index; + memcpy(params, &uni->storage[offset], + sizeof(GLuint)); + } +} + + +GLvoid GLAPIENTRY +_mesa_GetProgramStageiv(GLuint program, GLenum shadertype, + GLenum pname, GLint *values) +{ + GET_CURRENT_CONTEXT(ctx); + const char *api_name = "glGetProgramStageiv"; + struct gl_shader_program *shProg; + struct gl_shader *sh; + gl_shader_stage stage; + + if (!_mesa_has_shader_subroutine(ctx)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + if (!_mesa_validate_shader_target(ctx, shadertype)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + shProg = _mesa_lookup_shader_program_err(ctx, program, api_name); + if (!shProg) + return; + + stage = _mesa_shader_enum_to_shader_stage(shadertype); + sh = shProg->_LinkedShaders[stage]; + if (!sh) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name); + return; + } + + switch (pname) { + case GL_ACTIVE_SUBROUTINES: + values[0] = sh->NumSubroutineFunctions; + break; + case GL_ACTIVE_SUBROUTINE_UNIFORM_LOCATIONS: + values[0] = sh->NumSubroutineUniformRemapTable; + break; + case GL_ACTIVE_SUBROUTINE_UNIFORMS: + values[0] = sh->NumSubroutineUniformTypes; + break; + case GL_ACTIVE_SUBROUTINE_MAX_LENGTH: + { + unsigned i; + GLint max_len = 0; + GLenum resource_type; + struct gl_program_resource *res; + + resource_type = _mesa_shader_stage_to_subroutine(stage); + for (i = 0; i < sh->NumSubroutineFunctions; i++) { + res = _mesa_program_resource_find_index(shProg, resource_type, i); + 
if (res) { + const GLint len = strlen(_mesa_program_resource_name(res)) + 1; + if (len > max_len) + max_len = len; + } + } + values[0] = max_len; + break; + } + case GL_ACTIVE_SUBROUTINE_UNIFORM_MAX_LENGTH: + { + unsigned i; + GLint max_len = 0; + GLenum resource_type; + struct gl_program_resource *res; + + resource_type = _mesa_shader_stage_to_subroutine_uniform(stage); + for (i = 0; i < sh->NumSubroutineUniformRemapTable; i++) { + res = _mesa_program_resource_find_index(shProg, resource_type, i); + if (res) { + const GLint len = strlen(_mesa_program_resource_name(res)) + 1 + + ((_mesa_program_resource_array_size(res) != 0) ? 3 : 0); + + if (len > max_len) + max_len = len; + } + } + values[0] = max_len; + break; + } + default: + _mesa_error(ctx, GL_INVALID_ENUM, "%s", api_name); + values[0] = -1; + break; + } +} + +static int +find_compat_subroutine(struct gl_shader *sh, const struct glsl_type *type) +{ + int i, j; + + for (i = 0; i < sh->NumSubroutineFunctions; i++) { + struct gl_subroutine_function *fn = &sh->SubroutineFunctions[i]; + for (j = 0; j < fn->num_compat_types; j++) { + if (fn->types[j] == type) + return i; + } + } + return 0; +} + +static void +_mesa_shader_init_subroutine_defaults(struct gl_shader *sh) +{ + int i, j; + + for (i = 0; i < sh->NumSubroutineUniformRemapTable; i++) { + struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[i]; + int uni_count; + int val; + + if (!uni) + continue; + uni_count = uni->array_elements ? uni->array_elements : 1; + val = find_compat_subroutine(sh, uni->type); + + for (j = 0; j < uni_count; j++) + memcpy(&uni->storage[j], &val, sizeof(int)); + uni->initialized = true; + _mesa_propagate_uniforms_to_driver_storage(uni, 0, uni_count); + } +} + +void +_mesa_shader_program_init_subroutine_defaults(struct gl_shader_program *shProg) +{ + int i; + + if (!shProg) + return; + + for (i = 0; i < MESA_SHADER_STAGES; i++) { + if (!shProg->_LinkedShaders[i]) + continue; + + _mesa_shader_init_subroutine_defaults(shProg->_LinkedShaders[i]); + } +} diff --git a/src/mesa/main/shaderapi.h b/src/mesa/main/shaderapi.h index aba6d5d8306..0a10191684f 100644 --- a/src/mesa/main/shaderapi.h +++ b/src/mesa/main/shaderapi.h @@ -232,7 +232,8 @@ _mesa_program_resource_index(struct gl_shader_program *shProg, extern struct gl_program_resource * _mesa_program_resource_find_name(struct gl_shader_program *shProg, - GLenum programInterface, const char *name); + GLenum programInterface, const char *name, + unsigned *array_index); extern struct gl_program_resource * _mesa_program_resource_find_index(struct gl_shader_program *shProg, @@ -264,6 +265,51 @@ _mesa_get_program_resourceiv(struct gl_shader_program *shProg, GLsizei bufSize, GLsizei *length, GLint *params); +/* GL_ARB_tessellation_shader */ +extern void GLAPIENTRY +_mesa_PatchParameteri(GLenum pname, GLint value); + +extern void GLAPIENTRY +_mesa_PatchParameterfv(GLenum pname, const GLfloat *values); + +/* GL_ARB_shader_subroutine */ +void +_mesa_shader_program_init_subroutine_defaults(struct gl_shader_program *shProg); + +extern GLint GLAPIENTRY +_mesa_GetSubroutineUniformLocation(GLuint program, GLenum shadertype, + const GLchar *name); + +extern GLuint GLAPIENTRY +_mesa_GetSubroutineIndex(GLuint program, GLenum shadertype, + const GLchar *name); + +extern GLvoid GLAPIENTRY +_mesa_GetActiveSubroutineUniformiv(GLuint program, GLenum shadertype, + GLuint index, GLenum pname, GLint *values); + +extern GLvoid GLAPIENTRY +_mesa_GetActiveSubroutineUniformName(GLuint program, GLenum shadertype, + GLuint index, 
GLsizei bufsize, + GLsizei *length, GLchar *name); + +extern GLvoid GLAPIENTRY +_mesa_GetActiveSubroutineName(GLuint program, GLenum shadertype, + GLuint index, GLsizei bufsize, + GLsizei *length, GLchar *name); + +extern GLvoid GLAPIENTRY +_mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count, + const GLuint *indices); + +extern GLvoid GLAPIENTRY +_mesa_GetUniformSubroutineuiv(GLenum shadertype, GLint location, + GLuint *params); + +extern GLvoid GLAPIENTRY +_mesa_GetProgramStageiv(GLuint program, GLenum shadertype, + GLenum pname, GLint *values); + #ifdef __cplusplus } #endif diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c index 80b77275f93..a348cdb0405 100644 --- a/src/mesa/main/shaderimage.c +++ b/src/mesa/main/shaderimage.c @@ -610,7 +610,7 @@ _mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures) "glBindImageTextures(the internal format %s of " "the level zero texture image of textures[%d]=%u " "is not supported)", - _mesa_lookup_enum_by_nr(tex_format), + _mesa_enum_to_string(tex_format), i, texture); continue; } diff --git a/src/mesa/main/shaderobj.h b/src/mesa/main/shaderobj.h index 3d696a1887e..943044e37cd 100644 --- a/src/mesa/main/shaderobj.h +++ b/src/mesa/main/shaderobj.h @@ -111,6 +111,10 @@ _mesa_shader_enum_to_shader_stage(GLenum v) return MESA_SHADER_FRAGMENT; case GL_GEOMETRY_SHADER: return MESA_SHADER_GEOMETRY; + case GL_TESS_CONTROL_SHADER: + return MESA_SHADER_TESS_CTRL; + case GL_TESS_EVALUATION_SHADER: + return MESA_SHADER_TESS_EVAL; case GL_COMPUTE_SHADER: return MESA_SHADER_COMPUTE; default: @@ -119,6 +123,107 @@ _mesa_shader_enum_to_shader_stage(GLenum v) } } +/* 8 bytes + another underscore */ +#define MESA_SUBROUTINE_PREFIX_LEN 9 +static inline const char * +_mesa_shader_stage_to_subroutine_prefix(gl_shader_stage stage) +{ + switch (stage) { + case MESA_SHADER_VERTEX: + return "__subu_v"; + case MESA_SHADER_GEOMETRY: + return "__subu_g"; + case MESA_SHADER_FRAGMENT: + return "__subu_f"; + case MESA_SHADER_COMPUTE: + return "__subu_c"; + case MESA_SHADER_TESS_CTRL: + return "__subu_t"; + case MESA_SHADER_TESS_EVAL: + return "__subu_e"; + default: + return NULL; + } +} + +static inline gl_shader_stage +_mesa_shader_stage_from_subroutine_uniform(GLenum subuniform) +{ + switch (subuniform) { + default: + case GL_VERTEX_SUBROUTINE_UNIFORM: + return MESA_SHADER_VERTEX; + case GL_GEOMETRY_SUBROUTINE_UNIFORM: + return MESA_SHADER_GEOMETRY; + case GL_FRAGMENT_SUBROUTINE_UNIFORM: + return MESA_SHADER_FRAGMENT; + case GL_COMPUTE_SUBROUTINE_UNIFORM: + return MESA_SHADER_COMPUTE; + case GL_TESS_CONTROL_SUBROUTINE_UNIFORM: + return MESA_SHADER_TESS_CTRL; + case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: + return MESA_SHADER_TESS_EVAL; + } +} + +static inline gl_shader_stage +_mesa_shader_stage_from_subroutine(GLenum subroutine) +{ + switch (subroutine) { + case GL_VERTEX_SUBROUTINE: + return MESA_SHADER_VERTEX; + case GL_GEOMETRY_SUBROUTINE: + return MESA_SHADER_GEOMETRY; + case GL_FRAGMENT_SUBROUTINE: + return MESA_SHADER_FRAGMENT; + case GL_COMPUTE_SUBROUTINE: + return MESA_SHADER_COMPUTE; + case GL_TESS_CONTROL_SUBROUTINE: + return MESA_SHADER_TESS_CTRL; + case GL_TESS_EVALUATION_SUBROUTINE: + return MESA_SHADER_TESS_EVAL; + } +} + +static inline GLenum +_mesa_shader_stage_to_subroutine(gl_shader_stage stage) +{ + switch (stage) { + default: + case MESA_SHADER_VERTEX: + return GL_VERTEX_SUBROUTINE; + case MESA_SHADER_GEOMETRY: + return GL_GEOMETRY_SUBROUTINE; + case MESA_SHADER_FRAGMENT: + return GL_FRAGMENT_SUBROUTINE; + 
case MESA_SHADER_COMPUTE: + return GL_COMPUTE_SUBROUTINE; + case MESA_SHADER_TESS_CTRL: + return GL_TESS_CONTROL_SUBROUTINE; + case MESA_SHADER_TESS_EVAL: + return GL_TESS_EVALUATION_SUBROUTINE; + } +} + +static inline GLenum +_mesa_shader_stage_to_subroutine_uniform(gl_shader_stage stage) +{ + switch (stage) { + default: + case MESA_SHADER_VERTEX: + return GL_VERTEX_SUBROUTINE_UNIFORM; + case MESA_SHADER_GEOMETRY: + return GL_GEOMETRY_SUBROUTINE_UNIFORM; + case MESA_SHADER_FRAGMENT: + return GL_FRAGMENT_SUBROUTINE_UNIFORM; + case MESA_SHADER_COMPUTE: + return GL_COMPUTE_SUBROUTINE_UNIFORM; + case MESA_SHADER_TESS_CTRL: + return GL_TESS_CONTROL_SUBROUTINE_UNIFORM; + case MESA_SHADER_TESS_EVAL: + return GL_TESS_EVALUATION_SUBROUTINE_UNIFORM; + } +} #ifdef __cplusplus } diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c index bede7fe1d0e..d3b1c72b08d 100644 --- a/src/mesa/main/state.c +++ b/src/mesa/main/state.c @@ -79,8 +79,8 @@ update_program_enables(struct gl_context *ctx) /** - * Update the ctx->Vertex/Geometry/FragmentProgram._Current pointers to point - * to the current/active programs. Then call ctx->Driver.BindProgram() to + * Update the ctx->*Program._Current pointers to point to the + * current/active programs. Then call ctx->Driver.BindProgram() to * tell the driver which programs to use. * * Programs may come from 3 sources: GLSL shaders, ARB/NV_vertex/fragment @@ -97,6 +97,10 @@ update_program(struct gl_context *ctx) { const struct gl_shader_program *vsProg = ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX]; + const struct gl_shader_program *tcsProg = + ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL]; + const struct gl_shader_program *tesProg = + ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL]; const struct gl_shader_program *gsProg = ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY]; struct gl_shader_program *fsProg = @@ -106,6 +110,8 @@ update_program(struct gl_context *ctx) const struct gl_vertex_program *prevVP = ctx->VertexProgram._Current; const struct gl_fragment_program *prevFP = ctx->FragmentProgram._Current; const struct gl_geometry_program *prevGP = ctx->GeometryProgram._Current; + const struct gl_tess_ctrl_program *prevTCP = ctx->TessCtrlProgram._Current; + const struct gl_tess_eval_program *prevTEP = ctx->TessEvalProgram._Current; const struct gl_compute_program *prevCP = ctx->ComputeProgram._Current; GLbitfield new_state = 0x0; @@ -175,6 +181,30 @@ update_program(struct gl_context *ctx) _mesa_reference_geomprog(ctx, &ctx->GeometryProgram._Current, NULL); } + if (tesProg && tesProg->LinkStatus + && tesProg->_LinkedShaders[MESA_SHADER_TESS_EVAL]) { + /* Use GLSL tessellation evaluation shader */ + _mesa_reference_tesseprog(ctx, &ctx->TessEvalProgram._Current, + gl_tess_eval_program( + tesProg->_LinkedShaders[MESA_SHADER_TESS_EVAL]->Program)); + } + else { + /* No tessellation evaluation program */ + _mesa_reference_tesseprog(ctx, &ctx->TessEvalProgram._Current, NULL); + } + + if (tcsProg && tcsProg->LinkStatus + && tcsProg->_LinkedShaders[MESA_SHADER_TESS_CTRL]) { + /* Use GLSL tessellation control shader */ + _mesa_reference_tesscprog(ctx, &ctx->TessCtrlProgram._Current, + gl_tess_ctrl_program( + tcsProg->_LinkedShaders[MESA_SHADER_TESS_CTRL]->Program)); + } + else { + /* No tessellation control program */ + _mesa_reference_tesscprog(ctx, &ctx->TessCtrlProgram._Current, NULL); + } + /* Examine vertex program after fragment program as * _mesa_get_fixed_func_vertex_program() needs to know active * fragprog inputs. 
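The subroutine entry points and stage-to-enum helpers added above are easier to follow next to the application-side flow they implement. A minimal sketch, assuming a current GL 4.0 context with GL_GLEXT_PROTOTYPES, a linked program prog whose fragment shader declares exactly one subroutine uniform named "shading", and a subroutine function name supplied by the caller; all of these names are placeholders, not part of the patch.

#define GL_GLEXT_PROTOTYPES 1
#include <stdlib.h>
#include <GL/gl.h>
#include <GL/glext.h>

static void
select_fragment_subroutine(GLuint prog, const char *func_name)
{
   GLint n_locs = 0;
   GLuint *indices;
   GLint loc;
   GLuint idx;

   /* glUniformSubroutinesuiv() must receive exactly one index per active
    * subroutine uniform location, matching the count check added to
    * _mesa_UniformSubroutinesuiv(). */
   glGetProgramStageiv(prog, GL_FRAGMENT_SHADER,
                       GL_ACTIVE_SUBROUTINE_UNIFORM_LOCATIONS, &n_locs);
   if (n_locs <= 0)
      return;

   indices = calloc((size_t) n_locs, sizeof(GLuint));
   if (!indices)
      return;

   loc = glGetSubroutineUniformLocation(prog, GL_FRAGMENT_SHADER, "shading");
   idx = glGetSubroutineIndex(prog, GL_FRAGMENT_SHADER, func_name);
   if (loc < 0 || idx == GL_INVALID_INDEX) {
      free(indices);
      return;
   }

   /* With a single subroutine uniform this fills the whole table; a program
    * with several of them must give every location a compatible index. */
   indices[loc] = idx;

   /* Subroutine indices are context state tied to the active program, so the
    * program must be current; glUseProgram() also seeds compatible defaults
    * through _mesa_shader_program_init_subroutine_defaults(). */
   glUseProgram(prog);
   glUniformSubroutinesuiv(GL_FRAGMENT_SHADER, n_locs, indices);

   free(indices);
}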
@@ -230,6 +260,22 @@ update_program(struct gl_context *ctx) } } + if (ctx->TessEvalProgram._Current != prevTEP) { + new_state |= _NEW_PROGRAM; + if (ctx->Driver.BindProgram) { + ctx->Driver.BindProgram(ctx, GL_TESS_EVALUATION_PROGRAM_NV, + (struct gl_program *) ctx->TessEvalProgram._Current); + } + } + + if (ctx->TessCtrlProgram._Current != prevTCP) { + new_state |= _NEW_PROGRAM; + if (ctx->Driver.BindProgram) { + ctx->Driver.BindProgram(ctx, GL_TESS_CONTROL_PROGRAM_NV, + (struct gl_program *) ctx->TessCtrlProgram._Current); + } + } + if (ctx->VertexProgram._Current != prevVP) { new_state |= _NEW_PROGRAM; if (ctx->Driver.BindProgram) { @@ -266,8 +312,8 @@ update_program_constants(struct gl_context *ctx) } } - /* Don't handle geometry shaders here. They don't use any state - * constants. + /* Don't handle tessellation and geometry shaders here. They don't use + * any state constants. */ if (ctx->VertexProgram._Current) { diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp index 800720b798e..af89d2c1cfb 100644 --- a/src/mesa/main/tests/dispatch_sanity.cpp +++ b/src/mesa/main/tests/dispatch_sanity.cpp @@ -563,6 +563,8 @@ const struct function common_desktop_functions_possible[] = { /* GL 4.0 */ { "glMinSampleShading", 40, -1 }, + { "glPatchParameteri", 40, -1 }, + { "glPatchParameterfv", 40, -1 }, { "glBlendEquationi", 40, -1 }, { "glBlendEquationSeparatei", 40, -1 }, { "glBlendFunci", 40, -1 }, @@ -930,6 +932,11 @@ const struct function common_desktop_functions_possible[] = { /* GL_EXT_polygon_offset_clamp */ { "glPolygonOffsetClampEXT", 11, -1 }, + + /* GL_ARB_get_texture_sub_image */ + { "glGetTextureSubImage", 20, -1 }, + { "glGetCompressedTextureSubImage", 20, -1 }, + { NULL, 0, -1 } }; @@ -1424,6 +1431,16 @@ const struct function gl_core_functions_possible[] = { /* GL 3.2 */ { "glFramebufferTexture", 32, -1 }, + /* GL 4.0 */ + { "glGetSubroutineUniformLocation", 40, -1 }, + { "glGetSubroutineIndex", 40, -1 }, + { "glGetActiveSubroutineUniformiv", 40, -1 }, + { "glGetActiveSubroutineUniformName", 40, -1 }, + { "glGetActiveSubroutineName", 40, -1 }, + { "glUniformSubroutinesuiv", 40, -1 }, + { "glGetUniformSubroutineuiv", 40, -1 }, + { "glGetProgramStageiv", 40, -1 }, + /* GL 4.3 */ { "glIsRenderbuffer", 43, -1 }, { "glBindRenderbuffer", 43, -1 }, @@ -1562,16 +1579,6 @@ const struct function gl_core_functions_possible[] = { { "glUniformMatrix4x2dv", 40, -1 }, { "glUniformMatrix4x3dv", 40, -1 }, { "glGetUniformdv", 43, -1 }, -// { "glGetSubroutineUniformLocation", 43, -1 }, // XXX: Add to xml -// { "glGetSubroutineIndex", 43, -1 }, // XXX: Add to xml -// { "glGetActiveSubroutineUniformiv", 43, -1 }, // XXX: Add to xml -// { "glGetActiveSubroutineUniformName", 43, -1 }, // XXX: Add to xml -// { "glGetActiveSubroutineName", 43, -1 }, // XXX: Add to xml -// { "glUniformSubroutinesuiv", 43, -1 }, // XXX: Add to xml -// { "glGetUniformSubroutineuiv", 43, -1 }, // XXX: Add to xml -// { "glGetProgramStageiv", 43, -1 }, // XXX: Add to xml -// { "glPatchParameteri", 43, -1 }, // XXX: Add to xml -// { "glPatchParameterfv", 43, -1 }, // XXX: Add to xml { "glBindTransformFeedback", 43, -1 }, { "glDeleteTransformFeedbacks", 43, -1 }, diff --git a/src/mesa/main/tests/enum_strings.cpp b/src/mesa/main/tests/enum_strings.cpp index dc5fe751a86..8218cc9a685 100644 --- a/src/mesa/main/tests/enum_strings.cpp +++ b/src/mesa/main/tests/enum_strings.cpp @@ -39,13 +39,13 @@ TEST(EnumStrings, LookUpByNumber) { for (unsigned i = 0; everything[i].name != NULL; i++) { 
EXPECT_STREQ(everything[i].name, - _mesa_lookup_enum_by_nr(everything[i].value)); + _mesa_enum_to_string(everything[i].value)); } } TEST(EnumStrings, LookUpUnknownNumber) { - EXPECT_STRCASEEQ("0xEEEE", _mesa_lookup_enum_by_nr(0xEEEE)); + EXPECT_STRCASEEQ("0xEEEE", _mesa_enum_to_string(0xEEEE)); } /* Please type the name and the value. This makes it easier to detect @@ -1731,6 +1731,10 @@ const struct enum_info everything[] = { { 0x8DDF, "GL_MAX_GEOMETRY_UNIFORM_COMPONENTS" }, { 0x8DE0, "GL_MAX_GEOMETRY_OUTPUT_VERTICES" }, { 0x8DE1, "GL_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS" }, + { 0x8DE5, "GL_ACTIVE_SUBROUTINES" }, + { 0x8DE6, "GL_ACTIVE_SUBROUTINE_UNIFORMS" }, + { 0x8DE7, "GL_MAX_SUBROUTINES" }, + { 0x8DE8, "GL_MAX_SUBROUTINE_UNIFORM_LOCATIONS" }, { 0x8DF0, "GL_LOW_FLOAT" }, { 0x8DF1, "GL_MEDIUM_FLOAT" }, { 0x8DF2, "GL_HIGH_FLOAT" }, @@ -1759,6 +1763,11 @@ const struct enum_info everything[] = { { 0x8E44, "GL_TEXTURE_SWIZZLE_B" }, { 0x8E45, "GL_TEXTURE_SWIZZLE_A" }, { 0x8E46, "GL_TEXTURE_SWIZZLE_RGBA" }, + { 0x8E47, "GL_ACTIVE_SUBROUTINE_UNIFORM_LOCATIONS" }, + { 0x8E48, "GL_ACTIVE_SUBROUTINE_MAX_LENGTH" }, + { 0x8E49, "GL_ACTIVE_SUBROUTINE_UNIFORM_MAX_LENGTH" }, + { 0x8E4A, "GL_NUM_COMPATIBLE_SUBROUTINES" }, + { 0x8E4B, "GL_COMPATIBLE_SUBROUTINES" }, { 0x8E4C, "GL_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION" }, { 0x8E4D, "GL_FIRST_VERTEX_CONVENTION" }, { 0x8E4E, "GL_LAST_VERTEX_CONVENTION" }, diff --git a/src/mesa/main/texenv.c b/src/mesa/main/texenv.c index 3edafc0f776..091922161c5 100644 --- a/src/mesa/main/texenv.c +++ b/src/mesa/main/texenv.c @@ -42,7 +42,7 @@ #define TE_ERROR(errCode, msg, value) \ - _mesa_error(ctx, errCode, msg, _mesa_lookup_enum_by_nr(value)); + _mesa_error(ctx, errCode, msg, _mesa_enum_to_string(value)); /** Set texture env mode */ @@ -482,16 +482,16 @@ _mesa_TexEnvfv( GLenum target, GLenum pname, const GLfloat *param ) } else { _mesa_error(ctx, GL_INVALID_ENUM, "glTexEnv(target=%s)", - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return; } if (MESA_VERBOSE&(VERBOSE_API|VERBOSE_TEXTURE)) _mesa_debug(ctx, "glTexEnv %s %s %.1f(%s) ...\n", - _mesa_lookup_enum_by_nr(target), - _mesa_lookup_enum_by_nr(pname), + _mesa_enum_to_string(target), + _mesa_enum_to_string(pname), *param, - _mesa_lookup_enum_by_nr((GLenum) iparam0)); + _mesa_enum_to_string((GLenum) iparam0)); /* Tell device driver about the new texture environment */ if (ctx->Driver.TexEnv) { diff --git a/src/mesa/main/texformat.c b/src/mesa/main/texformat.c index 3c4baca7026..f4d17e1bdb5 100644 --- a/src/mesa/main/texformat.c +++ b/src/mesa/main/texformat.c @@ -847,7 +847,7 @@ _mesa_choose_tex_format(struct gl_context *ctx, GLenum target, } _mesa_problem(ctx, "unexpected format %s in _mesa_choose_tex_format()", - _mesa_lookup_enum_by_nr(internalFormat)); + _mesa_enum_to_string(internalFormat)); return MESA_FORMAT_NONE; } diff --git a/src/mesa/main/texgen.c b/src/mesa/main/texgen.c index 41e428b69e7..24ba295746a 100644 --- a/src/mesa/main/texgen.c +++ b/src/mesa/main/texgen.c @@ -76,10 +76,10 @@ _mesa_TexGenfv( GLenum coord, GLenum pname, const GLfloat *params ) if (MESA_VERBOSE&(VERBOSE_API|VERBOSE_TEXTURE)) _mesa_debug(ctx, "glTexGen %s %s %.1f(%s)...\n", - _mesa_lookup_enum_by_nr(coord), - _mesa_lookup_enum_by_nr(pname), + _mesa_enum_to_string(coord), + _mesa_enum_to_string(pname), *params, - _mesa_lookup_enum_by_nr((GLenum) (GLint) *params)); + _mesa_enum_to_string((GLenum) (GLint) *params)); if (ctx->Texture.CurrentUnit >= ctx->Const.MaxTextureCoordUnits) { _mesa_error(ctx, 
GL_INVALID_OPERATION, "glTexGen(current unit)"); diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c index 92b4d6795c6..c0ccce3d50e 100644 --- a/src/mesa/main/texgetimage.c +++ b/src/mesa/main/texgetimage.c @@ -75,12 +75,11 @@ type_needs_clamping(GLenum type) */ static void get_tex_depth(struct gl_context *ctx, GLuint dimensions, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLint depth, GLenum format, GLenum type, GLvoid *pixels, struct gl_texture_image *texImage) { - const GLint width = texImage->Width; - GLint height = texImage->Height; - GLint depth = texImage->Depth; GLint img, row; GLfloat *depthRow = malloc(width * sizeof(GLfloat)); @@ -94,14 +93,15 @@ get_tex_depth(struct gl_context *ctx, GLuint dimensions, height = 1; } + assert(zoffset + depth <= texImage->Depth); for (img = 0; img < depth; img++) { GLubyte *srcMap; GLint srcRowStride; /* map src texture buffer */ - ctx->Driver.MapTextureImage(ctx, texImage, img, - 0, 0, width, height, GL_MAP_READ_BIT, - &srcMap, &srcRowStride); + ctx->Driver.MapTextureImage(ctx, texImage, zoffset + img, + xoffset, yoffset, width, height, + GL_MAP_READ_BIT, &srcMap, &srcRowStride); if (srcMap) { for (row = 0; row < height; row++) { @@ -113,7 +113,7 @@ get_tex_depth(struct gl_context *ctx, GLuint dimensions, _mesa_pack_depth_span(ctx, width, dest, type, depthRow, &ctx->Pack); } - ctx->Driver.UnmapTextureImage(ctx, texImage, img); + ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + img); } else { _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage"); @@ -130,26 +130,26 @@ get_tex_depth(struct gl_context *ctx, GLuint dimensions, */ static void get_tex_depth_stencil(struct gl_context *ctx, GLuint dimensions, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLint depth, GLenum format, GLenum type, GLvoid *pixels, struct gl_texture_image *texImage) { - const GLint width = texImage->Width; - const GLint height = texImage->Height; - const GLint depth = texImage->Depth; GLint img, row; assert(format == GL_DEPTH_STENCIL); assert(type == GL_UNSIGNED_INT_24_8 || type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV); + assert(zoffset + depth <= texImage->Depth); for (img = 0; img < depth; img++) { GLubyte *srcMap; GLint rowstride; /* map src texture buffer */ - ctx->Driver.MapTextureImage(ctx, texImage, img, - 0, 0, width, height, GL_MAP_READ_BIT, - &srcMap, &rowstride); + ctx->Driver.MapTextureImage(ctx, texImage, zoffset + img, + xoffset, yoffset, width, height, + GL_MAP_READ_BIT, &srcMap, &rowstride); if (srcMap) { for (row = 0; row < height; row++) { @@ -166,7 +166,7 @@ get_tex_depth_stencil(struct gl_context *ctx, GLuint dimensions, } } - ctx->Driver.UnmapTextureImage(ctx, texImage, img); + ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + img); } else { _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage"); @@ -180,12 +180,11 @@ get_tex_depth_stencil(struct gl_context *ctx, GLuint dimensions, */ static void get_tex_stencil(struct gl_context *ctx, GLuint dimensions, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLint depth, GLenum format, GLenum type, GLvoid *pixels, struct gl_texture_image *texImage) { - const GLint width = texImage->Width; - const GLint height = texImage->Height; - const GLint depth = texImage->Depth; GLint img, row; assert(format == GL_STENCIL_INDEX); @@ -195,8 +194,9 @@ get_tex_stencil(struct gl_context *ctx, GLuint dimensions, GLint rowstride; /* map src texture buffer */ - ctx->Driver.MapTextureImage(ctx, texImage, img, 
- 0, 0, width, height, GL_MAP_READ_BIT, + ctx->Driver.MapTextureImage(ctx, texImage, zoffset + img, + xoffset, yoffset, width, height, + GL_MAP_READ_BIT, &srcMap, &rowstride); if (srcMap) { @@ -211,7 +211,7 @@ get_tex_stencil(struct gl_context *ctx, GLuint dimensions, dest); } - ctx->Driver.UnmapTextureImage(ctx, texImage, img); + ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + img); } else { _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage"); @@ -226,22 +226,22 @@ get_tex_stencil(struct gl_context *ctx, GLuint dimensions, */ static void get_tex_ycbcr(struct gl_context *ctx, GLuint dimensions, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLint depth, GLenum format, GLenum type, GLvoid *pixels, struct gl_texture_image *texImage) { - const GLint width = texImage->Width; - const GLint height = texImage->Height; - const GLint depth = texImage->Depth; GLint img, row; + assert(zoffset + depth <= texImage->Depth); for (img = 0; img < depth; img++) { GLubyte *srcMap; GLint rowstride; /* map src texture buffer */ - ctx->Driver.MapTextureImage(ctx, texImage, img, - 0, 0, width, height, GL_MAP_READ_BIT, - &srcMap, &rowstride); + ctx->Driver.MapTextureImage(ctx, texImage, zoffset + img, + xoffset, yoffset, width, height, + GL_MAP_READ_BIT, &srcMap, &rowstride); if (srcMap) { for (row = 0; row < height; row++) { @@ -264,7 +264,7 @@ get_tex_ycbcr(struct gl_context *ctx, GLuint dimensions, } } - ctx->Driver.UnmapTextureImage(ctx, texImage, img); + ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + img); } else { _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage"); @@ -279,6 +279,8 @@ get_tex_ycbcr(struct gl_context *ctx, GLuint dimensions, */ static void get_tex_rgba_compressed(struct gl_context *ctx, GLuint dimensions, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLint depth, GLenum format, GLenum type, GLvoid *pixels, struct gl_texture_image *texImage, GLbitfield transferOps) @@ -287,9 +289,6 @@ get_tex_rgba_compressed(struct gl_context *ctx, GLuint dimensions, const mesa_format texFormat = _mesa_get_srgb_format_linear(texImage->TexFormat); const GLenum baseFormat = _mesa_get_format_base_format(texFormat); - const GLuint width = texImage->Width; - const GLuint height = texImage->Height; - const GLuint depth = texImage->Depth; GLfloat *tempImage, *tempSlice; GLuint slice; int srcStride, dstStride; @@ -312,15 +311,15 @@ get_tex_rgba_compressed(struct gl_context *ctx, GLuint dimensions, tempSlice = tempImage + slice * 4 * width * height; - ctx->Driver.MapTextureImage(ctx, texImage, slice, - 0, 0, width, height, + ctx->Driver.MapTextureImage(ctx, texImage, zoffset + slice, + xoffset, yoffset, width, height, GL_MAP_READ_BIT, &srcMap, &srcRowStride); if (srcMap) { _mesa_decompress_image(texFormat, width, height, srcMap, srcRowStride, tempSlice); - ctx->Driver.UnmapTextureImage(ctx, texImage, slice); + ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + slice); } else { _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage"); @@ -409,6 +408,8 @@ _mesa_base_pack_format(GLenum format) */ static void get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLint depth, GLenum format, GLenum type, GLvoid *pixels, struct gl_texture_image *texImage, GLbitfield transferOps) @@ -416,9 +417,6 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions, /* don't want to apply sRGB -> RGB conversion here so override the format */ const 
mesa_format texFormat = _mesa_get_srgb_format_linear(texImage->TexFormat); - const GLuint width = texImage->Width; - GLuint height = texImage->Height; - GLuint depth = texImage->Depth; GLuint img; GLboolean dst_is_integer; uint32_t dst_format; @@ -430,6 +428,8 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions, if (texImage->TexObject->Target == GL_TEXTURE_1D_ARRAY) { depth = height; height = 1; + zoffset = yoffset; + yoffset = 0; } /* Depending on the base format involved we may need to apply a rebase @@ -449,7 +449,8 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions, rebaseSwizzle[1] = MESA_FORMAT_SWIZZLE_ZERO; rebaseSwizzle[2] = MESA_FORMAT_SWIZZLE_ZERO; rebaseSwizzle[3] = MESA_FORMAT_SWIZZLE_W; - } else if (texImage->_BaseFormat != _mesa_get_format_base_format(texFormat)) { + } else if (texImage->_BaseFormat != + _mesa_get_format_base_format(texFormat)) { needsRebase = _mesa_compute_rgba2base2rgba_component_mapping(texImage->_BaseFormat, rebaseSwizzle); @@ -480,8 +481,9 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions, uint32_t src_format; /* map src texture buffer */ - ctx->Driver.MapTextureImage(ctx, texImage, img, - 0, 0, width, height, GL_MAP_READ_BIT, + ctx->Driver.MapTextureImage(ctx, texImage, zoffset + img, + xoffset, yoffset, width, height, + GL_MAP_READ_BIT, &srcMap, &rowstride); if (!srcMap) { _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage"); @@ -530,8 +532,8 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions, /* If we had to rebase, we have already handled that */ needsRebase = false; - /* If we were lucky and our RGBA conversion matches the dst format, then - * we are done. + /* If we were lucky and our RGBA conversion matches the dst format, + * then we are done. 
*/ if (!need_convert) goto do_swap; @@ -568,7 +570,7 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions, } /* Unmap the src texture buffer */ - ctx->Driver.UnmapTextureImage(ctx, texImage, img); + ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + img); } done: @@ -583,6 +585,8 @@ done: */ static void get_tex_rgba(struct gl_context *ctx, GLuint dimensions, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLint depth, GLenum format, GLenum type, GLvoid *pixels, struct gl_texture_image *texImage) { @@ -604,11 +608,17 @@ get_tex_rgba(struct gl_context *ctx, GLuint dimensions, } if (_mesa_is_format_compressed(texImage->TexFormat)) { - get_tex_rgba_compressed(ctx, dimensions, format, type, + get_tex_rgba_compressed(ctx, dimensions, + xoffset, yoffset, zoffset, + width, height, depth, + format, type, pixels, texImage, transferOps); } else { - get_tex_rgba_uncompressed(ctx, dimensions, format, type, + get_tex_rgba_uncompressed(ctx, dimensions, + xoffset, yoffset, zoffset, + width, height, depth, + format, type, pixels, texImage, transferOps); } } @@ -619,8 +629,10 @@ get_tex_rgba(struct gl_context *ctx, GLuint dimensions, * \return GL_TRUE if done, GL_FALSE otherwise */ static GLboolean -get_tex_memcpy(struct gl_context *ctx, GLenum format, GLenum type, - GLvoid *pixels, +get_tex_memcpy(struct gl_context *ctx, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLint depth, + GLenum format, GLenum type, GLvoid *pixels, struct gl_texture_image *texImage) { const GLenum target = texImage->TexObject->Target; @@ -642,20 +654,25 @@ get_tex_memcpy(struct gl_context *ctx, GLenum format, GLenum type, ctx->Pack.SwapBytes); } + if (depth > 1) { + /* only a single slice is supported at this time */ + memCopy = FALSE; + } + if (memCopy) { const GLuint bpp = _mesa_get_format_bytes(texImage->TexFormat); - const GLint bytesPerRow = texImage->Width * bpp; + const GLint bytesPerRow = width * bpp; GLubyte *dst = - _mesa_image_address2d(&ctx->Pack, pixels, texImage->Width, - texImage->Height, format, type, 0, 0); + _mesa_image_address2d(&ctx->Pack, pixels, width, height, + format, type, 0, 0); const GLint dstRowStride = - _mesa_image_row_stride(&ctx->Pack, texImage->Width, format, type); + _mesa_image_row_stride(&ctx->Pack, width, format, type); GLubyte *src; GLint srcRowStride; /* map src texture buffer */ - ctx->Driver.MapTextureImage(ctx, texImage, 0, - 0, 0, texImage->Width, texImage->Height, + ctx->Driver.MapTextureImage(ctx, texImage, zoffset, + xoffset, yoffset, width, height, GL_MAP_READ_BIT, &src, &srcRowStride); if (src) { @@ -664,7 +681,7 @@ get_tex_memcpy(struct gl_context *ctx, GLenum format, GLenum type, } else { GLuint row; - for (row = 0; row < texImage->Height; row++) { + for (row = 0; row < height; row++) { memcpy(dst, src, bytesPerRow); dst += dstRowStride; src += srcRowStride; @@ -672,7 +689,7 @@ get_tex_memcpy(struct gl_context *ctx, GLenum format, GLenum type, } /* unmap src texture buffer */ - ctx->Driver.UnmapTextureImage(ctx, texImage, 0); + ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset); } else { _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage"); @@ -684,15 +701,17 @@ get_tex_memcpy(struct gl_context *ctx, GLenum format, GLenum type, /** - * This is the software fallback for Driver.GetTexImage(). + * This is the software fallback for Driver.GetTexSubImage(). * All error checking will have been done before this routine is called. 
* We'll call ctx->Driver.MapTextureImage() to access the data, then * unmap with ctx->Driver.UnmapTextureImage(). */ void -_mesa_GetTexImage_sw(struct gl_context *ctx, - GLenum format, GLenum type, GLvoid *pixels, - struct gl_texture_image *texImage) +_mesa_GetTexSubImage_sw(struct gl_context *ctx, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLint depth, + GLenum format, GLenum type, GLvoid *pixels, + struct gl_texture_image *texImage) { const GLuint dimensions = _mesa_get_texture_dimensions(texImage->TexObject->Target); @@ -720,23 +739,30 @@ _mesa_GetTexImage_sw(struct gl_context *ctx, pixels = ADD_POINTERS(buf, pixels); } - if (get_tex_memcpy(ctx, format, type, pixels, texImage)) { + if (get_tex_memcpy(ctx, xoffset, yoffset, zoffset, width, height, depth, + format, type, pixels, texImage)) { /* all done */ } else if (format == GL_DEPTH_COMPONENT) { - get_tex_depth(ctx, dimensions, format, type, pixels, texImage); + get_tex_depth(ctx, dimensions, xoffset, yoffset, zoffset, + width, height, depth, format, type, pixels, texImage); } else if (format == GL_DEPTH_STENCIL_EXT) { - get_tex_depth_stencil(ctx, dimensions, format, type, pixels, texImage); + get_tex_depth_stencil(ctx, dimensions, xoffset, yoffset, zoffset, + width, height, depth, format, type, pixels, + texImage); } else if (format == GL_STENCIL_INDEX) { - get_tex_stencil(ctx, dimensions, format, type, pixels, texImage); + get_tex_stencil(ctx, dimensions, xoffset, yoffset, zoffset, + width, height, depth, format, type, pixels, texImage); } else if (format == GL_YCBCR_MESA) { - get_tex_ycbcr(ctx, dimensions, format, type, pixels, texImage); + get_tex_ycbcr(ctx, dimensions, xoffset, yoffset, zoffset, + width, height, depth, format, type, pixels, texImage); } else { - get_tex_rgba(ctx, dimensions, format, type, pixels, texImage); + get_tex_rgba(ctx, dimensions, xoffset, yoffset, zoffset, + width, height, depth, format, type, pixels, texImage); } if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) { @@ -747,13 +773,16 @@ _mesa_GetTexImage_sw(struct gl_context *ctx, /** - * This is the software fallback for Driver.GetCompressedTexImage(). + * This is the software fallback for Driver.GetCompressedTexSubImage(). * All error checking will have been done before this routine is called. 
*/ void -_mesa_GetCompressedTexImage_sw(struct gl_context *ctx, - struct gl_texture_image *texImage, - GLvoid *img) +_mesa_GetCompressedTexSubImage_sw(struct gl_context *ctx, + struct gl_texture_image *texImage, + GLint xoffset, GLint yoffset, + GLint zoffset, GLsizei width, + GLint height, GLint depth, + GLvoid *img) { const GLuint dimensions = _mesa_get_texture_dimensions(texImage->TexObject->Target); @@ -762,10 +791,8 @@ _mesa_GetCompressedTexImage_sw(struct gl_context *ctx, GLubyte *dest; _mesa_compute_compressed_pixelstore(dimensions, texImage->TexFormat, - texImage->Width, texImage->Height, - texImage->Depth, - &ctx->Pack, - &store); + width, height, depth, + &ctx->Pack, &store); if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) { /* pack texture image into a PBO */ @@ -791,8 +818,8 @@ _mesa_GetCompressedTexImage_sw(struct gl_context *ctx, GLubyte *src; /* map src texture buffer */ - ctx->Driver.MapTextureImage(ctx, texImage, slice, - 0, 0, texImage->Width, texImage->Height, + ctx->Driver.MapTextureImage(ctx, texImage, zoffset + slice, + xoffset, yoffset, width, height, GL_MAP_READ_BIT, &src, &srcRowStride); if (src) { @@ -803,10 +830,11 @@ _mesa_GetCompressedTexImage_sw(struct gl_context *ctx, src += srcRowStride; } - ctx->Driver.UnmapTextureImage(ctx, texImage, slice); + ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + slice); /* Advance to next slice */ - dest += store.TotalBytesPerRow * (store.TotalRowsPerSlice - store.CopyRowsPerSlice); + dest += store.TotalBytesPerRow * (store.TotalRowsPerSlice - + store.CopyRowsPerSlice); } else { _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetCompresssedTexImage"); @@ -863,29 +891,299 @@ legal_getteximage_target(struct gl_context *ctx, GLenum target, bool dsa) /** - * Do error checking for a glGetTex(ture)Image() call. - * \return GL_TRUE if any error, GL_FALSE if no errors. + * Wrapper for _mesa_select_tex_image() which can handle target being + * GL_TEXTURE_CUBE_MAP_ARB in which case we use zoffset to select a cube face. + * This can happen for glGetTextureImage and glGetTextureSubImage (DSA + * functions). */ -static GLboolean +static struct gl_texture_image * +select_tex_image(const struct gl_texture_object *texObj, GLenum target, + GLint level, GLint zoffset) +{ + assert(level >= 0); + assert(level < MAX_TEXTURE_LEVELS); + if (target == GL_TEXTURE_CUBE_MAP) { + assert(zoffset >= 0); + assert(zoffset < 6); + target = GL_TEXTURE_CUBE_MAP_POSITIVE_X + zoffset; + } + return _mesa_select_tex_image(texObj, target, level); +} + + +/** + * Error-check the offset and size arguments to + * glGet[Compressed]TextureSubImage(). Also checks if the specified + * texture image is missing. + * \return true if error, false if no error. 
+ */ +static bool +dimensions_error_check(struct gl_context *ctx, + struct gl_texture_object *texObj, + GLenum target, GLint level, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLsizei depth, + const char *caller) +{ + const struct gl_texture_image *texImage; + int i; + + if (xoffset < 0) { + _mesa_error(ctx, GL_INVALID_VALUE, "%s(xoffset = %d)", caller, xoffset); + return true; + } + + if (yoffset < 0) { + _mesa_error(ctx, GL_INVALID_VALUE, "%s(yoffset = %d)", caller, yoffset); + return true; + } + + if (zoffset < 0) { + _mesa_error(ctx, GL_INVALID_VALUE, "%s(zoffset = %d)", caller, zoffset); + return true; + } + + if (width < 0) { + _mesa_error(ctx, GL_INVALID_VALUE, "%s(width = %d)", caller, width); + return true; + } + + if (height < 0) { + _mesa_error(ctx, GL_INVALID_VALUE, "%s(height = %d)", caller, height); + return true; + } + + if (depth < 0) { + _mesa_error(ctx, GL_INVALID_VALUE, "%s(depth = %d)", caller, depth); + return true; + } + + /* do special per-target checks */ + switch (target) { + case GL_TEXTURE_1D: + if (yoffset != 0) { + _mesa_error(ctx, GL_INVALID_VALUE, + "%s(1D, yoffset = %d)", caller, yoffset); + return true; + } + if (height > 1) { + _mesa_error(ctx, GL_INVALID_VALUE, + "%s(1D, height = %d)", caller, height); + return true; + } + /* fall-through */ + case GL_TEXTURE_1D_ARRAY: + case GL_TEXTURE_2D: + case GL_TEXTURE_RECTANGLE: + if (zoffset != 0) { + _mesa_error(ctx, GL_INVALID_VALUE, + "%s(zoffset = %d)", caller, zoffset); + return true; + } + if (depth > 1) { + _mesa_error(ctx, GL_INVALID_VALUE, + "%s(depth = %d)", caller, depth); + return true; + } + break; + case GL_TEXTURE_CUBE_MAP: + /* Non-array cube maps are special because we have a gl_texture_image + * per face. + */ + if (zoffset + depth > 6) { + _mesa_error(ctx, GL_INVALID_VALUE, + "%s(zoffset + depth = %d)", caller, zoffset + depth); + return true; + } + /* check that the range of faces exist */ + for (i = 0; i < depth; i++) { + GLenum face = GL_TEXTURE_CUBE_MAP_POSITIVE_X + zoffset + i; + if (!_mesa_select_tex_image(texObj, face, level)) { + /* non-existant face */ + _mesa_error(ctx, GL_INVALID_OPERATION, + "%s(missing cube face)", caller); + return true; + } + } + break; + default: + ; /* nothing */ + } + + texImage = select_tex_image(texObj, target, level, zoffset); + if (!texImage) { + /* missing texture image */ + _mesa_error(ctx, GL_INVALID_OPERATION, "%s(missing image)", caller); + return true; + } + + if (xoffset + width > texImage->Width) { + _mesa_error(ctx, GL_INVALID_VALUE, + "%s(xoffset %d + width %d > %u)", + caller, xoffset, width, texImage->Width); + return true; + } + + if (yoffset + height > texImage->Height) { + _mesa_error(ctx, GL_INVALID_VALUE, + "%s(yoffset %d + height %d > %u)", + caller, yoffset, height, texImage->Height); + return true; + } + + if (target != GL_TEXTURE_CUBE_MAP) { + /* Cube map error checking was done above */ + if (zoffset + depth > texImage->Depth) { + _mesa_error(ctx, GL_INVALID_VALUE, + "%s(zoffset %d + depth %d > %u)", + caller, zoffset, depth, texImage->Depth); + return true; + } + } + + /* Extra checks for compressed textures */ + { + GLuint bw, bh; + _mesa_get_format_block_size(texImage->TexFormat, &bw, &bh); + if (bw > 1 || bh > 1) { + /* offset must be multiple of block size */ + if (xoffset % bw != 0) { + _mesa_error(ctx, GL_INVALID_VALUE, + "%s(xoffset = %d)", caller, xoffset); + return true; + } + if (target != GL_TEXTURE_1D && target != GL_TEXTURE_1D_ARRAY) { + if (yoffset % bh != 0) { + _mesa_error(ctx, 
GL_INVALID_VALUE, + "%s(yoffset = %d)", caller, yoffset); + return true; + } + } + + /* The size must be a multiple of bw x bh, or we must be using a + * offset+size that exactly hits the edge of the image. + */ + if ((width % bw != 0) && + (xoffset + width != (GLint) texImage->Width)) { + _mesa_error(ctx, GL_INVALID_VALUE, + "%s(width = %d)", caller, width); + return true; + } + + if ((height % bh != 0) && + (yoffset + height != (GLint) texImage->Height)) { + _mesa_error(ctx, GL_INVALID_VALUE, + "%s(height = %d)", caller, height); + return true; + } + } + } + + if (width == 0 || height == 0 || depth == 0) { + /* Not an error, but nothing to do. Return 'true' so that the + * caller simply returns. + */ + return true; + } + + return false; +} + + +/** + * Do PBO-related error checking for getting uncompressed images. + * \return true if there was an error (or the GetTexImage is to be a no-op) + */ +static bool +pbo_error_check(struct gl_context *ctx, GLenum target, + GLsizei width, GLsizei height, GLsizei depth, + GLenum format, GLenum type, GLsizei clientMemSize, + GLvoid *pixels, + const char *caller) +{ + const GLuint dimensions = (target == GL_TEXTURE_3D) ? 3 : 2; + + if (!_mesa_validate_pbo_access(dimensions, &ctx->Pack, width, height, depth, + format, type, clientMemSize, pixels)) { + if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "%s(out of bounds PBO access)", caller); + } else { + _mesa_error(ctx, GL_INVALID_OPERATION, + "%s(out of bounds access: bufSize (%d) is too small)", + caller, clientMemSize); + } + return true; + } + + if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) { + /* PBO should not be mapped */ + if (_mesa_check_disallowed_mapping(ctx->Pack.BufferObj)) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "%s(PBO is mapped)", caller); + return true; + } + } + + if (!_mesa_is_bufferobj(ctx->Pack.BufferObj) && !pixels) { + /* not an error, do nothing */ + return true; + } + + return false; +} + + +/** + * Do error checking for all (non-compressed) get-texture-image functions. + * \return true if any error, false if no errors. + */ +static bool getteximage_error_check(struct gl_context *ctx, - struct gl_texture_image *texImage, + struct gl_texture_object *texObj, GLenum target, GLint level, - GLenum format, GLenum type, GLsizei clientMemSize, - GLvoid *pixels, bool dsa) + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLsizei depth, + GLenum format, GLenum type, GLsizei bufSize, + GLvoid *pixels, const char *caller) { - const GLint maxLevels = _mesa_max_texture_levels(ctx, target); - const GLuint dimensions = (target == GL_TEXTURE_3D) ? 3 : 2; - GLenum baseFormat; - const char *suffix = dsa ? 
"ture" : ""; + struct gl_texture_image *texImage; + GLenum baseFormat, err; + GLint maxLevels; - assert(texImage); - assert(maxLevels != 0); + assert(texObj); + + if (texObj->Target == 0) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid texture)", caller); + return true; + } + + maxLevels = _mesa_max_texture_levels(ctx, target); if (level < 0 || level >= maxLevels) { - _mesa_error(ctx, GL_INVALID_VALUE, - "glGetTex%sImage(level out of range)", suffix); - return GL_TRUE; + _mesa_error(ctx, GL_INVALID_VALUE, "%s(level = %d)", caller, level); + return true; } + err = _mesa_error_check_format_and_type(ctx, format, type); + if (err != GL_NO_ERROR) { + _mesa_error(ctx, err, "%s(format/type)", caller); + return true; + } + + if (dimensions_error_check(ctx, texObj, target, level, + xoffset, yoffset, zoffset, + width, height, depth, caller)) { + return true; + } + + if (pbo_error_check(ctx, target, width, height, depth, + format, type, bufSize, pixels, caller)) { + return true; + } + + texImage = select_tex_image(texObj, target, level, zoffset); + assert(texImage); + /* * Format and type checking has been moved up to GetnTexImage and * GetTextureImage so that it happens before getting the texImage object. @@ -899,494 +1197,579 @@ getteximage_error_check(struct gl_context *ctx, if (_mesa_is_color_format(format) && !_mesa_is_color_format(baseFormat)) { _mesa_error(ctx, GL_INVALID_OPERATION, - "glGetTex%sImage(format mismatch)", suffix); - return GL_TRUE; + "%s(format mismatch)", caller); + return true; } else if (_mesa_is_depth_format(format) && !_mesa_is_depth_format(baseFormat) && !_mesa_is_depthstencil_format(baseFormat)) { _mesa_error(ctx, GL_INVALID_OPERATION, - "glGetTex%sImage(format mismatch)", suffix); - return GL_TRUE; + "%s(format mismatch)", caller); + return true; } else if (_mesa_is_stencil_format(format) && !ctx->Extensions.ARB_texture_stencil8) { _mesa_error(ctx, GL_INVALID_ENUM, - "glGetTex%sImage(format=GL_STENCIL_INDEX)", suffix); - return GL_TRUE; + "%s(format=GL_STENCIL_INDEX)", caller); + return true; } else if (_mesa_is_ycbcr_format(format) && !_mesa_is_ycbcr_format(baseFormat)) { _mesa_error(ctx, GL_INVALID_OPERATION, - "glGetTex%sImage(format mismatch)", suffix); - return GL_TRUE; + "%s(format mismatch)", caller); + return true; } else if (_mesa_is_depthstencil_format(format) && !_mesa_is_depthstencil_format(baseFormat)) { _mesa_error(ctx, GL_INVALID_OPERATION, - "glGetTex%sImage(format mismatch)", suffix); - return GL_TRUE; + "%s(format mismatch)", caller); + return true; } - else if (!_mesa_is_stencil_format(format) && _mesa_is_enum_format_integer(format) != + else if (!_mesa_is_stencil_format(format) && + _mesa_is_enum_format_integer(format) != _mesa_is_format_integer(texImage->TexFormat)) { _mesa_error(ctx, GL_INVALID_OPERATION, - "glGetTex%sImage(format mismatch)", suffix); - return GL_TRUE; + "%s(format mismatch)", caller); + return true; } - if (!_mesa_validate_pbo_access(dimensions, &ctx->Pack, texImage->Width, - texImage->Height, texImage->Depth, - format, type, clientMemSize, pixels)) { - if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "glGetTex%sImage(out of bounds PBO access)", suffix); - } else { - _mesa_error(ctx, GL_INVALID_OPERATION, - "%s(out of bounds access:" - " bufSize (%d) is too small)", - dsa ? "glGetTextureImage" : "glGetnTexImageARB", - clientMemSize); - } - return GL_TRUE; + return false; +} + + +/** + * Return the width, height and depth of a texture image. 
+ * This function must be resilient to bad parameter values since + * this is called before full error checking. + */ +static void +get_texture_image_dims(const struct gl_texture_object *texObj, + GLenum target, GLint level, + GLsizei *width, GLsizei *height, GLsizei *depth) +{ + const struct gl_texture_image *texImage = NULL; + + if (level >= 0 && level < MAX_TEXTURE_LEVELS) { + texImage = _mesa_select_tex_image(texObj, target, level); } - if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) { - /* PBO should not be mapped */ - if (_mesa_check_disallowed_mapping(ctx->Pack.BufferObj)) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "glGetTex%sImage(PBO is mapped)", suffix); - return GL_TRUE; + if (texImage) { + *width = texImage->Width; + *height = texImage->Height; + if (target == GL_TEXTURE_CUBE_MAP) { + *depth = 6; + } + else { + *depth = texImage->Depth; } } - - return GL_FALSE; + else { + *width = *height = *depth = 0; + } } /** - * This is the implementation for glGetnTexImageARB, glGetTextureImage, - * and glGetTexImage. - * - * Requires caller to pass in texImage object because _mesa_GetTextureImage - * must handle the GL_TEXTURE_CUBE_MAP target. - * - * \param target texture target. + * Common code for all (uncompressed) get-texture-image functions. + * \param texObj the texture object (should not be null) + * \param target user-provided target, or 0 for DSA * \param level image level. * \param format pixel data format for returned image. * \param type pixel data type for returned image. * \param bufSize size of the pixels data buffer. * \param pixels returned pixel data. - * \param dsa True when the caller is an ARB_direct_state_access function, - * false otherwise + * \param caller name of calling function */ -void -_mesa_get_texture_image(struct gl_context *ctx, - struct gl_texture_object *texObj, - struct gl_texture_image *texImage, GLenum target, - GLint level, GLenum format, GLenum type, - GLsizei bufSize, GLvoid *pixels, bool dsa) +static void +get_texture_image(struct gl_context *ctx, + struct gl_texture_object *texObj, + GLenum target, GLint level, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLint depth, + GLenum format, GLenum type, + GLvoid *pixels, const char *caller) { - assert(texObj); - assert(texImage); + struct gl_texture_image *texImage; + unsigned firstFace, numFaces, i; + GLint imageStride; FLUSH_VERTICES(ctx, 0); - /* - * Legal target checking has been moved up to GetnTexImage and - * GetTextureImage so that it can be caught before receiving a NULL - * texImage object and exiting. - */ - - if (getteximage_error_check(ctx, texImage, target, level, format, - type, bufSize, pixels, dsa)) { - return; - } + texImage = select_tex_image(texObj, target, level, zoffset); + assert(texImage); /* should have been error checked already */ - if (!_mesa_is_bufferobj(ctx->Pack.BufferObj) && !pixels) { - /* not an error, do nothing */ + if (_mesa_is_zero_size_texture(texImage)) { + /* no image data to return */ return; } - if (_mesa_is_zero_size_texture(texImage)) - return; - if (MESA_VERBOSE & (VERBOSE_API | VERBOSE_TEXTURE)) { - _mesa_debug(ctx, "glGetTex%sImage(tex %u) format = %s, w=%d, h=%d," + _mesa_debug(ctx, "%s(tex %u) format = %s, w=%d, h=%d," " dstFmt=0x%x, dstType=0x%x\n", - dsa ? 
"ture": "", - texObj->Name, + caller, texObj->Name, _mesa_get_format_name(texImage->TexFormat), texImage->Width, texImage->Height, format, type); } + if (target == GL_TEXTURE_CUBE_MAP) { + /* Compute stride between cube faces */ + imageStride = _mesa_image_image_stride(&ctx->Pack, width, height, + format, type); + firstFace = zoffset; + numFaces = depth; + zoffset = 0; + depth = 1; + } + else { + imageStride = 0; + firstFace = _mesa_tex_target_to_face(target); + numFaces = 1; + } + _mesa_lock_texture(ctx, texObj); - { - ctx->Driver.GetTexImage(ctx, format, type, pixels, texImage); + + for (i = 0; i < numFaces; i++) { + texImage = texObj->Image[firstFace + i][level]; + assert(texImage); + + ctx->Driver.GetTexSubImage(ctx, xoffset, yoffset, zoffset, + width, height, depth, + format, type, pixels, texImage); + + /* next cube face */ + pixels = (GLubyte *) pixels + imageStride; } + _mesa_unlock_texture(ctx, texObj); } -/** - * Get texture image. Called by glGetTexImage. - * - * \param target texture target. - * \param level image level. - * \param format pixel data format for returned image. - * \param type pixel data type for returned image. - * \param bufSize size of the pixels data buffer. - * \param pixels returned pixel data. - */ + void GLAPIENTRY -_mesa_GetnTexImageARB(GLenum target, GLint level, GLenum format, - GLenum type, GLsizei bufSize, GLvoid *pixels) +_mesa_GetnTexImageARB(GLenum target, GLint level, GLenum format, GLenum type, + GLsizei bufSize, GLvoid *pixels) { - struct gl_texture_object *texObj; - struct gl_texture_image *texImage; - GLenum err; GET_CURRENT_CONTEXT(ctx); + static const char *caller = "glGetnTexImageARB"; + GLsizei width, height, depth; + struct gl_texture_object *texObj; - /* - * This has been moved here because a format/type mismatch can cause a NULL - * texImage object, which in turn causes the mismatch error to be - * ignored. - */ - err = _mesa_error_check_format_and_type(ctx, format, type); - if (err != GL_NO_ERROR) { - _mesa_error(ctx, err, "glGetnTexImage(format/type)"); - return; - } - - /* - * Legal target checking has been moved here to prevent exiting with a NULL - * texImage object. 
- */ if (!legal_getteximage_target(ctx, target, false)) { - _mesa_error(ctx, GL_INVALID_ENUM, "glGetnTexImage(target=0x%x)", - target); + _mesa_error(ctx, GL_INVALID_ENUM, "%s", caller); return; } texObj = _mesa_get_current_tex_object(ctx, target); - if (!texObj) - return; + assert(texObj); + + get_texture_image_dims(texObj, target, level, &width, &height, &depth); - texImage = _mesa_select_tex_image(texObj, target, level); - if (!texImage) + if (getteximage_error_check(ctx, texObj, target, level, + 0, 0, 0, width, height, depth, + format, type, bufSize, pixels, caller)) { return; + } - _mesa_get_texture_image(ctx, texObj, texImage, target, level, format, type, - bufSize, pixels, false); + get_texture_image(ctx, texObj, target, level, + 0, 0, 0, width, height, depth, + format, type, pixels, caller); } void GLAPIENTRY -_mesa_GetTexImage( GLenum target, GLint level, GLenum format, - GLenum type, GLvoid *pixels ) +_mesa_GetTexImage(GLenum target, GLint level, GLenum format, GLenum type, + GLvoid *pixels ) { - _mesa_GetnTexImageARB(target, level, format, type, INT_MAX, pixels); + GET_CURRENT_CONTEXT(ctx); + static const char *caller = "glGetTexImage"; + GLsizei width, height, depth; + struct gl_texture_object *texObj; + + if (!legal_getteximage_target(ctx, target, false)) { + _mesa_error(ctx, GL_INVALID_ENUM, "%s", caller); + return; + } + + texObj = _mesa_get_current_tex_object(ctx, target); + assert(texObj); + + get_texture_image_dims(texObj, target, level, &width, &height, &depth); + + if (getteximage_error_check(ctx, texObj, target, level, + 0, 0, 0, width, height, depth, + format, type, INT_MAX, pixels, caller)) { + return; + } + + get_texture_image(ctx, texObj, target, level, + 0, 0, 0, width, height, depth, + format, type, pixels, caller); } -/** - * Get texture image. - * - * \param texture texture name. - * \param level image level. - * \param format pixel data format for returned image. - * \param type pixel data type for returned image. - * \param bufSize size of the pixels data buffer. - * \param pixels returned pixel data. - */ + void GLAPIENTRY -_mesa_GetTextureImage(GLuint texture, GLint level, GLenum format, - GLenum type, GLsizei bufSize, GLvoid *pixels) +_mesa_GetTextureImage(GLuint texture, GLint level, GLenum format, GLenum type, + GLsizei bufSize, GLvoid *pixels) { - struct gl_texture_object *texObj; - struct gl_texture_image *texImage; - int i; - GLint image_stride; - GLenum err; GET_CURRENT_CONTEXT(ctx); + GLsizei width, height, depth; + static const char *caller = "glGetTextureImage"; + struct gl_texture_object *texObj = + _mesa_lookup_texture_err(ctx, texture, caller); - /* - * This has been moved here because a format/type mismatch can cause a NULL - * texImage object, which in turn causes the mismatch error to be - * ignored. - */ - err = _mesa_error_check_format_and_type(ctx, format, type); - if (err != GL_NO_ERROR) { - _mesa_error(ctx, err, "glGetTextureImage(format/type)"); + if (!texObj) { return; } - texObj = _mesa_lookup_texture_err(ctx, texture, "glGetTextureImage"); - if (!texObj) - return; + get_texture_image_dims(texObj, texObj->Target, level, + &width, &height, &depth); - /* - * Legal target checking has been moved here to prevent exiting with a NULL - * texImage object. 
- */ - if (!legal_getteximage_target(ctx, texObj->Target, true)) { - _mesa_error(ctx, GL_INVALID_ENUM, "glGetTextureImage(target=%s)", - _mesa_lookup_enum_by_nr(texObj->Target)); + if (getteximage_error_check(ctx, texObj, texObj->Target, level, + 0, 0, 0, width, height, depth, + format, type, bufSize, pixels, caller)) { return; } - /* Must handle special case GL_TEXTURE_CUBE_MAP. */ - if (texObj->Target == GL_TEXTURE_CUBE_MAP) { - - /* Make sure the texture object is a proper cube. - * (See texturesubimage in teximage.c for details on why this check is - * performed.) - */ - if (!_mesa_cube_level_complete(texObj, level)) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "glGetTextureImage(cube map incomplete)"); - return; - } + get_texture_image(ctx, texObj, texObj->Target, level, + 0, 0, 0, width, height, depth, + format, type, pixels, caller); +} - /* Copy each face. */ - for (i = 0; i < 6; ++i) { - texImage = texObj->Image[i][level]; - assert(texImage); - _mesa_get_texture_image(ctx, texObj, texImage, texObj->Target, level, - format, type, bufSize, pixels, true); +void GLAPIENTRY +_mesa_GetTextureSubImage(GLuint texture, GLint level, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLsizei depth, + GLenum format, GLenum type, GLsizei bufSize, + void *pixels) +{ + GET_CURRENT_CONTEXT(ctx); + static const char *caller = "glGetTextureSubImage"; + struct gl_texture_object *texObj = + _mesa_lookup_texture_err(ctx, texture, caller); - image_stride = _mesa_image_image_stride(&ctx->Pack, texImage->Width, - texImage->Height, format, - type); - pixels = (GLubyte *) pixels + image_stride; - bufSize -= image_stride; - } + if (!texObj) { + return; } - else { - texImage = _mesa_select_tex_image(texObj, texObj->Target, level); - if (!texImage) - return; - _mesa_get_texture_image(ctx, texObj, texImage, texObj->Target, level, - format, type, bufSize, pixels, true); + if (getteximage_error_check(ctx, texObj, texObj->Target, level, + xoffset, yoffset, zoffset, width, height, depth, + format, type, bufSize, pixels, caller)) { + return; } + + get_texture_image(ctx, texObj, texObj->Target, level, + xoffset, yoffset, zoffset, width, height, depth, + format, type, pixels, caller); } + + /** - * Do error checking for a glGetCompressedTexImage() call. - * \return GL_TRUE if any error, GL_FALSE if no errors. + * Compute the number of bytes which will be written when retrieving + * a sub-region of a compressed texture. */ -static GLboolean +static GLsizei +packed_compressed_size(GLuint dimensions, mesa_format format, + GLsizei width, GLsizei height, GLsizei depth, + const struct gl_pixelstore_attrib *packing) +{ + struct compressed_pixelstore st; + GLsizei totalBytes; + + _mesa_compute_compressed_pixelstore(dimensions, format, + width, height, depth, + packing, &st); + totalBytes = + (st.CopySlices - 1) * st.TotalRowsPerSlice * st.TotalBytesPerRow + + st.SkipBytes + + (st.CopyRowsPerSlice - 1) * st.TotalBytesPerRow + + st.CopyBytesPerRow; + + return totalBytes; +} + + +/** + * Do error checking for getting compressed texture images. + * \return true if any error, false if no errors. 
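Since _mesa_GetTextureSubImage() is the new public entry point added here (GL 4.5 / ARB_get_texture_sub_image), a hedged usage sketch may help; tex and the 16x16 window at (32, 32) are placeholder values, not taken from the diff:

    /* Read a 16x16 texel window of mip level 0 into client memory. */
    GLubyte window[16 * 16 * 4];                /* GL_RGBA + GL_UNSIGNED_BYTE */
    glGetTextureSubImage(tex, 0,                /* texture, level             */
                         32, 32, 0,             /* xoffset, yoffset, zoffset  */
                         16, 16, 1,             /* width, height, depth       */
                         GL_RGBA, GL_UNSIGNED_BYTE,
                         sizeof(window), window);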
+ */ +static bool getcompressedteximage_error_check(struct gl_context *ctx, - struct gl_texture_image *texImage, - GLenum target, - GLint level, GLsizei clientMemSize, - GLvoid *img, bool dsa) + struct gl_texture_object *texObj, + GLenum target, GLint level, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLsizei depth, + GLsizei bufSize, GLvoid *pixels, + const char *caller) { - const GLint maxLevels = _mesa_max_texture_levels(ctx, target); - GLuint compressedSize, dimensions; - const char *suffix = dsa ? "ture" : ""; + struct gl_texture_image *texImage; + GLint maxLevels; + GLsizei totalBytes; + GLuint dimensions; - assert(texImage); + assert(texObj); - if (!legal_getteximage_target(ctx, target, dsa)) { - _mesa_error(ctx, GL_INVALID_ENUM, - "glGetCompressedTex%sImage(target=%s)", suffix, - _mesa_lookup_enum_by_nr(target)); - return GL_TRUE; + if (texObj->Target == 0) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid texture)", caller); + return true; } - assert(maxLevels != 0); + maxLevels = _mesa_max_texture_levels(ctx, target); if (level < 0 || level >= maxLevels) { _mesa_error(ctx, GL_INVALID_VALUE, - "glGetCompressedTex%sImage(bad level = %d)", suffix, level); - return GL_TRUE; + "%s(bad level = %d)", caller, level); + return true; + } + + if (dimensions_error_check(ctx, texObj, target, level, + xoffset, yoffset, zoffset, + width, height, depth, caller)) { + return true; } + texImage = select_tex_image(texObj, target, level, zoffset); + assert(texImage); + if (!_mesa_is_format_compressed(texImage->TexFormat)) { _mesa_error(ctx, GL_INVALID_OPERATION, - "glGetCompressedTex%sImage(texture is not compressed)", - suffix); - return GL_TRUE; + "%s(texture is not compressed)", caller); + return true; } - compressedSize = _mesa_format_image_size(texImage->TexFormat, - texImage->Width, - texImage->Height, - texImage->Depth); - /* Check for invalid pixel storage modes */ - dimensions = _mesa_get_texture_dimensions(texImage->TexObject->Target); + dimensions = _mesa_get_texture_dimensions(texObj->Target); if (!_mesa_compressed_pixel_storage_error_check(ctx, dimensions, - &ctx->Pack, dsa ? - "glGetCompressedTextureImage": - "glGetCompressedTexImage")) { - return GL_TRUE; + &ctx->Pack, + caller)) { + return true; } - if (!_mesa_is_bufferobj(ctx->Pack.BufferObj)) { - /* do bounds checking on writing to client memory */ - if (clientMemSize < (GLsizei) compressedSize) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "%s(out of bounds access: bufSize (%d) is too small)", - dsa ? 
"glGetCompressedTextureImage" : - "glGetnCompressedTexImageARB", clientMemSize); - return GL_TRUE; - } - } else { + /* Compute number of bytes that may be touched in the dest buffer */ + totalBytes = packed_compressed_size(dimensions, texImage->TexFormat, + width, height, depth, + &ctx->Pack); + + /* Do dest buffer bounds checking */ + if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) { /* do bounds checking on PBO write */ - if ((const GLubyte *) img + compressedSize > - (const GLubyte *) ctx->Pack.BufferObj->Size) { + if ((GLubyte *) pixels + totalBytes > + (GLubyte *) ctx->Pack.BufferObj->Size) { _mesa_error(ctx, GL_INVALID_OPERATION, - "glGetCompressedTex%sImage(out of bounds PBO access)", - suffix); - return GL_TRUE; + "%s(out of bounds PBO access)", caller); + return true; } /* make sure PBO is not mapped */ if (_mesa_check_disallowed_mapping(ctx->Pack.BufferObj)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s(PBO is mapped)", caller); + return true; + } + } + else { + /* do bounds checking on writing to client memory */ + if (totalBytes > bufSize) { _mesa_error(ctx, GL_INVALID_OPERATION, - "glGetCompressedTex%sImage(PBO is mapped)", suffix); - return GL_TRUE; + "%s(out of bounds access: bufSize (%d) is too small)", + caller, bufSize); + return true; } } - return GL_FALSE; + if (!_mesa_is_bufferobj(ctx->Pack.BufferObj) && !pixels) { + /* not an error, but do nothing */ + return true; + } + + return false; } -/** Implements glGetnCompressedTexImageARB, glGetCompressedTexImage, and - * glGetCompressedTextureImage. - * - * texImage must be passed in because glGetCompressedTexImage must handle the - * target GL_TEXTURE_CUBE_MAP. + +/** + * Common helper for all glGetCompressed-teximage functions. */ -void -_mesa_get_compressed_texture_image(struct gl_context *ctx, - struct gl_texture_object *texObj, - struct gl_texture_image *texImage, - GLenum target, GLint level, - GLsizei bufSize, GLvoid *pixels, - bool dsa) +static void +get_compressed_texture_image(struct gl_context *ctx, + struct gl_texture_object *texObj, + GLenum target, GLint level, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLint depth, + GLvoid *pixels, + const char *caller) { - assert(texObj); - assert(texImage); + struct gl_texture_image *texImage; + unsigned firstFace, numFaces, i, imageStride; FLUSH_VERTICES(ctx, 0); - if (getcompressedteximage_error_check(ctx, texImage, target, level, - bufSize, pixels, dsa)) { - return; - } - - if (!_mesa_is_bufferobj(ctx->Pack.BufferObj) && !pixels) { - /* not an error, do nothing */ - return; - } + texImage = select_tex_image(texObj, target, level, zoffset); + assert(texImage); /* should have been error checked already */ if (_mesa_is_zero_size_texture(texImage)) return; if (MESA_VERBOSE & (VERBOSE_API | VERBOSE_TEXTURE)) { _mesa_debug(ctx, - "glGetCompressedTex%sImage(tex %u) format = %s, w=%d, h=%d\n", - dsa ? 
"ture" : "", texObj->Name, + "%s(tex %u) format = %s, w=%d, h=%d\n", + caller, texObj->Name, _mesa_get_format_name(texImage->TexFormat), texImage->Width, texImage->Height); } + if (target == GL_TEXTURE_CUBE_MAP) { + struct compressed_pixelstore store; + + /* Compute image stride between cube faces */ + _mesa_compute_compressed_pixelstore(2, texImage->TexFormat, + width, height, depth, + &ctx->Pack, &store); + imageStride = store.TotalBytesPerRow * store.TotalRowsPerSlice; + + firstFace = zoffset; + numFaces = depth; + zoffset = 0; + depth = 1; + } + else { + imageStride = 0; + firstFace = _mesa_tex_target_to_face(target); + numFaces = 1; + } + _mesa_lock_texture(ctx, texObj); - { - ctx->Driver.GetCompressedTexImage(ctx, texImage, pixels); + + for (i = 0; i < numFaces; i++) { + texImage = texObj->Image[firstFace + i][level]; + assert(texImage); + + ctx->Driver.GetCompressedTexSubImage(ctx, texImage, + xoffset, yoffset, zoffset, + width, height, depth, pixels); + + /* next cube face */ + pixels = (GLubyte *) pixels + imageStride; } + _mesa_unlock_texture(ctx, texObj); } + void GLAPIENTRY _mesa_GetnCompressedTexImageARB(GLenum target, GLint level, GLsizei bufSize, - GLvoid *img) + GLvoid *pixels) { - struct gl_texture_object *texObj; - struct gl_texture_image *texImage; GET_CURRENT_CONTEXT(ctx); + static const char *caller = "glGetnCompressedTexImageARB"; + GLsizei width, height, depth; + struct gl_texture_object *texObj; - texObj = _mesa_get_current_tex_object(ctx, target); - if (!texObj) + if (!legal_getteximage_target(ctx, target, false)) { + _mesa_error(ctx, GL_INVALID_ENUM, "%s", caller); return; + } - texImage = _mesa_select_tex_image(texObj, target, level); - if (!texImage) + texObj = _mesa_get_current_tex_object(ctx, target); + assert(texObj); + + get_texture_image_dims(texObj, target, level, &width, &height, &depth); + + if (getcompressedteximage_error_check(ctx, texObj, target, level, + 0, 0, 0, width, height, depth, + INT_MAX, pixels, caller)) { return; + } - _mesa_get_compressed_texture_image(ctx, texObj, texImage, target, level, - bufSize, img, false); + get_compressed_texture_image(ctx, texObj, target, level, + 0, 0, 0, width, height, depth, + pixels, caller); } + void GLAPIENTRY -_mesa_GetCompressedTexImage(GLenum target, GLint level, GLvoid *img) +_mesa_GetCompressedTexImage(GLenum target, GLint level, GLvoid *pixels) { - _mesa_GetnCompressedTexImageARB(target, level, INT_MAX, img); + GET_CURRENT_CONTEXT(ctx); + static const char *caller = "glGetCompressedTexImage"; + GLsizei width, height, depth; + struct gl_texture_object *texObj; + + if (!legal_getteximage_target(ctx, target, false)) { + _mesa_error(ctx, GL_INVALID_ENUM, "%s", caller); + return; + } + + texObj = _mesa_get_current_tex_object(ctx, target); + assert(texObj); + + get_texture_image_dims(texObj, target, level, + &width, &height, &depth); + + if (getcompressedteximage_error_check(ctx, texObj, target, level, + 0, 0, 0, width, height, depth, + INT_MAX, pixels, caller)) { + return; + } + + get_compressed_texture_image(ctx, texObj, target, level, + 0, 0, 0, width, height, depth, + pixels, caller); } -/** - * Get compressed texture image. - * - * \param texture texture name. - * \param level image level. - * \param bufSize size of the pixels data buffer. - * \param pixels returned pixel data. 
- */ + void GLAPIENTRY _mesa_GetCompressedTextureImage(GLuint texture, GLint level, GLsizei bufSize, GLvoid *pixels) { - struct gl_texture_object *texObj; - struct gl_texture_image *texImage; - int i; - GLint image_stride; GET_CURRENT_CONTEXT(ctx); + static const char *caller = "glGetCompressedTextureImage"; + GLsizei width, height, depth; + struct gl_texture_object *texObj = + _mesa_lookup_texture_err(ctx, texture, caller); - texObj = _mesa_lookup_texture_err(ctx, texture, - "glGetCompressedTextureImage"); - if (!texObj) + if (!texObj) { return; + } - /* Must handle special case GL_TEXTURE_CUBE_MAP. */ - if (texObj->Target == GL_TEXTURE_CUBE_MAP) { + get_texture_image_dims(texObj, texObj->Target, level, + &width, &height, &depth); - /* Make sure the texture object is a proper cube. - * (See texturesubimage in teximage.c for details on why this check is - * performed.) - */ - if (!_mesa_cube_level_complete(texObj, level)) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "glGetCompressedTextureImage(cube map incomplete)"); - return; - } + if (getcompressedteximage_error_check(ctx, texObj, texObj->Target, level, + 0, 0, 0, width, height, depth, + bufSize, pixels, caller)) { + return; + } - /* Copy each face. */ - for (i = 0; i < 6; ++i) { - texImage = texObj->Image[i][level]; - assert(texImage); + get_compressed_texture_image(ctx, texObj, texObj->Target, level, + 0, 0, 0, width, height, depth, + pixels, caller); +} - _mesa_get_compressed_texture_image(ctx, texObj, texImage, - texObj->Target, level, - bufSize, pixels, true); - /* Compressed images don't have a client format */ - image_stride = _mesa_format_image_size(texImage->TexFormat, - texImage->Width, - texImage->Height, 1); +void APIENTRY +_mesa_GetCompressedTextureSubImage(GLuint texture, GLint level, + GLint xoffset, GLint yoffset, + GLint zoffset, GLsizei width, + GLsizei height, GLsizei depth, + GLsizei bufSize, void *pixels) +{ + GET_CURRENT_CONTEXT(ctx); + static const char *caller = "glGetCompressedTextureImage"; + struct gl_texture_object *texObj; - pixels = (GLubyte *) pixels + image_stride; - bufSize -= image_stride; - } + texObj = _mesa_lookup_texture_err(ctx, texture, caller); + if (!texObj) { + return; } - else { - texImage = _mesa_select_tex_image(texObj, texObj->Target, level); - if (!texImage) - return; - _mesa_get_compressed_texture_image(ctx, texObj, texImage, - texObj->Target, level, bufSize, - pixels, true); + if (getcompressedteximage_error_check(ctx, texObj, texObj->Target, level, + xoffset, yoffset, zoffset, + width, height, depth, + bufSize, pixels, caller)) { + return; } + + get_compressed_texture_image(ctx, texObj, texObj->Target, level, + xoffset, yoffset, zoffset, + width, height, depth, + pixels, caller); } diff --git a/src/mesa/main/texgetimage.h b/src/mesa/main/texgetimage.h index 1fa2f59dcdc..63c75eb931d 100644 --- a/src/mesa/main/texgetimage.h +++ b/src/mesa/main/texgetimage.h @@ -37,22 +37,19 @@ extern GLenum _mesa_base_pack_format(GLenum format); extern void -_mesa_GetTexImage_sw(struct gl_context *ctx, - GLenum format, GLenum type, GLvoid *pixels, - struct gl_texture_image *texImage); - - -extern void -_mesa_GetCompressedTexImage_sw(struct gl_context *ctx, - struct gl_texture_image *texImage, - GLvoid *data); +_mesa_GetTexSubImage_sw(struct gl_context *ctx, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLint depth, + GLenum format, GLenum type, GLvoid *pixels, + struct gl_texture_image *texImage); extern void -_mesa_get_texture_image(struct gl_context *ctx, - struct 
gl_texture_object *texObj, - struct gl_texture_image *texImage, GLenum target, - GLint level, GLenum format, GLenum type, - GLsizei bufSize, GLvoid *pixels, bool dsa); +_mesa_GetCompressedTexSubImage_sw(struct gl_context *ctx, + struct gl_texture_image *texImage, + GLint xoffset, GLint yoffset, + GLint zoffset, GLsizei width, + GLint height, GLint depth, + GLvoid *data); extern void _mesa_get_compressed_texture_image( struct gl_context *ctx, @@ -74,6 +71,14 @@ _mesa_GetTextureImage(GLuint texture, GLint level, GLenum format, GLenum type, GLsizei bufSize, GLvoid *pixels); extern void GLAPIENTRY +_mesa_GetTextureSubImage(GLuint texture, GLint level, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLsizei depth, + GLenum format, GLenum type, GLsizei bufSize, + void *pixels); + + +extern void GLAPIENTRY _mesa_GetCompressedTexImage(GLenum target, GLint lod, GLvoid *img); extern void GLAPIENTRY @@ -84,4 +89,11 @@ extern void GLAPIENTRY _mesa_GetCompressedTextureImage(GLuint texture, GLint level, GLsizei bufSize, GLvoid *pixels); +extern void APIENTRY +_mesa_GetCompressedTextureSubImage(GLuint texture, GLint level, + GLint xoffset, GLint yoffset, + GLint zoffset, GLsizei width, + GLsizei height, GLsizei depth, + GLsizei bufSize, void *pixels); + #endif /* TEXGETIMAGE_H */ diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c index 3d85615fa45..3a556a6ad6e 100644 --- a/src/mesa/main/teximage.c +++ b/src/mesa/main/teximage.c @@ -1008,7 +1008,7 @@ _mesa_max_texture_levels(struct gl_context *ctx, GLenum target) case GL_PROXY_TEXTURE_2D_MULTISAMPLE: case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: case GL_PROXY_TEXTURE_2D_MULTISAMPLE_ARRAY: - return _mesa_is_desktop_gl(ctx) + return (_mesa_is_desktop_gl(ctx) || _mesa_is_gles31(ctx)) && ctx->Extensions.ARB_texture_multisample ? 1 : 0; case GL_TEXTURE_EXTERNAL_OES: @@ -1793,8 +1793,6 @@ GLboolean _mesa_target_can_be_compressed(const struct gl_context *ctx, GLenum target, GLenum intFormat) { - (void) intFormat; /* not used yet */ - switch (target) { case GL_TEXTURE_2D: case GL_PROXY_TEXTURE_2D: @@ -1814,6 +1812,16 @@ _mesa_target_can_be_compressed(const struct gl_context *ctx, GLenum target, case GL_PROXY_TEXTURE_CUBE_MAP_ARRAY: case GL_TEXTURE_CUBE_MAP_ARRAY: return ctx->Extensions.ARB_texture_cube_map_array; + case GL_TEXTURE_3D: + switch (intFormat) { + case GL_COMPRESSED_RGBA_BPTC_UNORM: + case GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM: + case GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT: + case GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT: + return ctx->Extensions.ARB_texture_compression_bptc; + default: + return GL_FALSE; + } default: return GL_FALSE; } @@ -2081,6 +2089,53 @@ texture_formats_agree(GLenum internalFormat, } /** + * Test the combination of format, type and internal format arguments of + * different texture operations on GLES. + * + * \param ctx GL context. + * \param format pixel data format given by the user. + * \param type pixel data type given by the user. + * \param internalFormat internal format given by the user. + * \param dimensions texture image dimensions (must be 1, 2 or 3). + * \param callerName name of the caller function to print in the error message + * + * \return true if a error is found, false otherwise + * + * Currently, it is used by texture_error_check() and texsubimage_error_check(). 
+ */ +static bool +texture_format_error_check_gles(struct gl_context *ctx, GLenum format, + GLenum type, GLenum internalFormat, + GLuint dimensions, const char *callerName) +{ + GLenum err; + + if (_mesa_is_gles3(ctx)) { + err = _mesa_es3_error_check_format_and_type(ctx, format, type, + internalFormat); + if (err != GL_NO_ERROR) { + _mesa_error(ctx, err, + "%s(format = %s, type = %s, internalformat = %s)", + callerName, _mesa_enum_to_string(format), + _mesa_enum_to_string(type), + _mesa_enum_to_string(internalFormat)); + return true; + } + } + else { + err = _mesa_es_error_check_format_and_type(format, type, dimensions); + if (err != GL_NO_ERROR) { + _mesa_error(ctx, err, "%s(format = %s, type = %s)", + callerName, _mesa_enum_to_string(format), + _mesa_enum_to_string(type)); + return true; + } + } + + return false; +} + +/** * Test the glTexImage[123]D() parameters for errors. * * \param ctx GL context. @@ -2151,39 +2206,17 @@ texture_error_check( struct gl_context *ctx, * Formats and types that require additional extensions (e.g., GL_FLOAT * requires GL_OES_texture_float) are filtered elsewhere. */ - - if (_mesa_is_gles(ctx)) { - if (_mesa_is_gles3(ctx)) { - err = _mesa_es3_error_check_format_and_type(ctx, format, type, - internalFormat); - } else { - if (format != internalFormat) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "glTexImage%dD(format = %s, internalFormat = %s)", - dimensions, - _mesa_lookup_enum_by_nr(format), - _mesa_lookup_enum_by_nr(internalFormat)); - return GL_TRUE; - } - - err = _mesa_es_error_check_format_and_type(format, type, dimensions); - } - if (err != GL_NO_ERROR) { - _mesa_error(ctx, err, - "glTexImage%dD(format = %s, type = %s, internalFormat = %s)", - dimensions, - _mesa_lookup_enum_by_nr(format), - _mesa_lookup_enum_by_nr(type), - _mesa_lookup_enum_by_nr(internalFormat)); - return GL_TRUE; - } + if (_mesa_is_gles(ctx) && + texture_format_error_check_gles(ctx, format, type, internalFormat, + dimensions, "glTexImage%dD")) { + return GL_TRUE; } /* Check internalFormat */ if (_mesa_base_tex_format(ctx, internalFormat) < 0) { _mesa_error(ctx, GL_INVALID_VALUE, "glTexImage%dD(internalFormat=%s)", - dimensions, _mesa_lookup_enum_by_nr(internalFormat)); + dimensions, _mesa_enum_to_string(internalFormat)); return GL_TRUE; } @@ -2192,8 +2225,8 @@ texture_error_check( struct gl_context *ctx, if (err != GL_NO_ERROR) { _mesa_error(ctx, err, "glTexImage%dD(incompatible format = %s, type = %s)", - dimensions, _mesa_lookup_enum_by_nr(format), - _mesa_lookup_enum_by_nr(type)); + dimensions, _mesa_enum_to_string(format), + _mesa_enum_to_string(type)); return GL_TRUE; } @@ -2208,8 +2241,8 @@ texture_error_check( struct gl_context *ctx, if (!texture_formats_agree(internalFormat, format)) { _mesa_error(ctx, GL_INVALID_OPERATION, "glTexImage%dD(incompatible internalFormat = %s, format = %s)", - dimensions, _mesa_lookup_enum_by_nr(internalFormat), - _mesa_lookup_enum_by_nr(format)); + dimensions, _mesa_enum_to_string(internalFormat), + _mesa_enum_to_string(format)); return GL_TRUE; } @@ -2324,7 +2357,7 @@ compressed_texture_error_check(struct gl_context *ctx, GLint dimensions, if (!_mesa_is_compressed_format(ctx, internalFormat)) { _mesa_error(ctx, GL_INVALID_ENUM, "glCompressedTexImage%dD(internalFormat=%s)", - dimensions, _mesa_lookup_enum_by_nr(internalFormat)); + dimensions, _mesa_enum_to_string(internalFormat)); return GL_TRUE; } @@ -2479,40 +2512,38 @@ texsubimage_error_check(struct gl_context *ctx, GLuint dimensions, return GL_TRUE; } - /* check target (proxies not allowed) 
*/ - if (!legal_texsubimage_target(ctx, dimensions, target, dsa)) { - _mesa_error(ctx, GL_INVALID_ENUM, "%s(target=%s)", - callerName, _mesa_lookup_enum_by_nr(target)); - return GL_TRUE; - } - /* level check */ if (level < 0 || level >= _mesa_max_texture_levels(ctx, target)) { _mesa_error(ctx, GL_INVALID_VALUE, "%s(level=%d)", callerName, level); return GL_TRUE; } - /* OpenGL ES 1.x and OpenGL ES 2.0 impose additional restrictions on the - * combinations of format and type that can be used. Formats and types - * that require additional extensions (e.g., GL_FLOAT requires - * GL_OES_texture_float) are filtered elsewhere. - */ - if (_mesa_is_gles(ctx) && !_mesa_is_gles3(ctx)) { - err = _mesa_es_error_check_format_and_type(format, type, dimensions); - if (err != GL_NO_ERROR) { - _mesa_error(ctx, err, "%s(format = %s, type = %s)", - callerName, _mesa_lookup_enum_by_nr(format), - _mesa_lookup_enum_by_nr(type)); - return GL_TRUE; - } + texImage = _mesa_select_tex_image(texObj, target, level); + if (!texImage) { + /* non-existant texture level */ + _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid texture image)", + callerName); + return GL_TRUE; } err = _mesa_error_check_format_and_type(ctx, format, type); if (err != GL_NO_ERROR) { _mesa_error(ctx, err, "%s(incompatible format = %s, type = %s)", - callerName, _mesa_lookup_enum_by_nr(format), - _mesa_lookup_enum_by_nr(type)); + callerName, _mesa_enum_to_string(format), + _mesa_enum_to_string(type)); + return GL_TRUE; + } + + /* OpenGL ES 1.x and OpenGL ES 2.0 impose additional restrictions on the + * combinations of format, internalFormat, and type that can be used. + * Formats and types that require additional extensions (e.g., GL_FLOAT + * requires GL_OES_texture_float) are filtered elsewhere. + */ + if (_mesa_is_gles(ctx) && + texture_format_error_check_gles(ctx, format, type, + texImage->InternalFormat, + dimensions, callerName)) { return GL_TRUE; } @@ -2523,14 +2554,6 @@ texsubimage_error_check(struct gl_context *ctx, GLuint dimensions, return GL_TRUE; } - texImage = _mesa_select_tex_image(texObj, target, level); - if (!texImage) { - /* non-existant texture level */ - _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid texture image)", - callerName); - return GL_TRUE; - } - if (error_check_subtexture_dimensions(ctx, dimensions, texImage, xoffset, yoffset, zoffset, width, height, depth, callerName)) { @@ -2590,7 +2613,7 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions, /* check target */ if (!legal_texsubimage_target(ctx, dimensions, target, false)) { _mesa_error(ctx, GL_INVALID_ENUM, "glCopyTexImage%uD(target=%s)", - dimensions, _mesa_lookup_enum_by_nr(target)); + dimensions, _mesa_enum_to_string(target)); return GL_TRUE; } @@ -2629,13 +2652,6 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions, return GL_TRUE; } - rb = _mesa_get_read_renderbuffer_for_format(ctx, internalFormat); - if (rb == NULL) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "glCopyTexImage%dD(read buffer)", dimensions); - return GL_TRUE; - } - /* OpenGL ES 1.x and OpenGL ES 2.0 impose additional restrictions on the * internalFormat. 
*/ @@ -2648,18 +2664,25 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions, case GL_LUMINANCE_ALPHA: break; default: - _mesa_error(ctx, GL_INVALID_VALUE, + _mesa_error(ctx, GL_INVALID_ENUM, "glCopyTexImage%dD(internalFormat=%s)", dimensions, - _mesa_lookup_enum_by_nr(internalFormat)); + _mesa_enum_to_string(internalFormat)); return GL_TRUE; } } baseFormat = _mesa_base_tex_format(ctx, internalFormat); if (baseFormat < 0) { - _mesa_error(ctx, GL_INVALID_OPERATION, + _mesa_error(ctx, GL_INVALID_ENUM, "glCopyTexImage%dD(internalFormat=%s)", dimensions, - _mesa_lookup_enum_by_nr(internalFormat)); + _mesa_enum_to_string(internalFormat)); + return GL_TRUE; + } + + rb = _mesa_get_read_renderbuffer_for_format(ctx, internalFormat); + if (rb == NULL) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "glCopyTexImage%dD(read buffer)", dimensions); return GL_TRUE; } @@ -2669,7 +2692,7 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions, if (rb_base_format < 0) { _mesa_error(ctx, GL_INVALID_VALUE, "glCopyTexImage%dD(internalFormat=%s)", dimensions, - _mesa_lookup_enum_by_nr(internalFormat)); + _mesa_enum_to_string(internalFormat)); return GL_TRUE; } } @@ -2696,7 +2719,7 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions, if (!valid) { _mesa_error(ctx, GL_INVALID_OPERATION, "glCopyTexImage%dD(internalFormat=%s)", dimensions, - _mesa_lookup_enum_by_nr(internalFormat)); + _mesa_enum_to_string(internalFormat)); return GL_TRUE; } } @@ -2735,10 +2758,10 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions, * types for SNORM formats. Also, conversion to SNORM formats is not * allowed by Table 3.2 on Page 110. */ - if(_mesa_is_enum_format_snorm(internalFormat)) { + if (_mesa_is_enum_format_snorm(internalFormat)) { _mesa_error(ctx, GL_INVALID_OPERATION, "glCopyTexImage%dD(internalFormat=%s)", dimensions, - _mesa_lookup_enum_by_nr(internalFormat)); + _mesa_enum_to_string(internalFormat)); return GL_TRUE; } } @@ -3103,8 +3126,8 @@ _mesa_choose_texture_format(struct gl_context *ctx, "DXT compression requested (%s), " "but libtxc_dxtn library not installed. 
Using %s " "instead.", - _mesa_lookup_enum_by_nr(before), - _mesa_lookup_enum_by_nr(internalFormat)); + _mesa_enum_to_string(before), + _mesa_enum_to_string(internalFormat)); } } @@ -3191,18 +3214,18 @@ teximage(struct gl_context *ctx, GLboolean compressed, GLuint dims, _mesa_debug(ctx, "glCompressedTexImage%uD %s %d %s %d %d %d %d %p\n", dims, - _mesa_lookup_enum_by_nr(target), level, - _mesa_lookup_enum_by_nr(internalFormat), + _mesa_enum_to_string(target), level, + _mesa_enum_to_string(internalFormat), width, height, depth, border, pixels); else _mesa_debug(ctx, "glTexImage%uD %s %d %s %d %d %d %d %s %s %p\n", dims, - _mesa_lookup_enum_by_nr(target), level, - _mesa_lookup_enum_by_nr(internalFormat), + _mesa_enum_to_string(target), level, + _mesa_enum_to_string(internalFormat), width, height, depth, border, - _mesa_lookup_enum_by_nr(format), - _mesa_lookup_enum_by_nr(type), pixels); + _mesa_enum_to_string(format), + _mesa_enum_to_string(type), pixels); } internalFormat = override_internal_format(internalFormat, width, height); @@ -3210,7 +3233,7 @@ teximage(struct gl_context *ctx, GLboolean compressed, GLuint dims, /* target error checking */ if (!legal_teximage_target(ctx, dims, target)) { _mesa_error(ctx, GL_INVALID_ENUM, "%s%uD(target=%s)", - func, dims, _mesa_lookup_enum_by_nr(target)); + func, dims, _mesa_enum_to_string(target)); return; } @@ -3313,16 +3336,16 @@ teximage(struct gl_context *ctx, GLboolean compressed, GLuint dims, if (!dimensionsOK) { _mesa_error(ctx, GL_INVALID_VALUE, - "glTexImage%uD(invalid width or height or depth)", - dims); + "%s%uD(invalid width or height or depth)", + func, dims); return; } if (!sizeOK) { _mesa_error(ctx, GL_OUT_OF_MEMORY, - "glTexImage%uD(image too large: %d x %d x %d, %s format)", - dims, width, height, depth, - _mesa_lookup_enum_by_nr(internalFormat)); + "%s%uD(image too large: %d x %d x %d, %s format)", + func, dims, width, height, depth, + _mesa_enum_to_string(internalFormat)); return; } @@ -3495,7 +3518,6 @@ _mesa_EGLImageTargetTexture2DOES (GLenum target, GLeglImageOES image) _mesa_dirty_texobj(ctx, texObj); } _mesa_unlock_texture(ctx, texObj); - } @@ -3515,14 +3537,6 @@ _mesa_texture_sub_image(struct gl_context *ctx, GLuint dims, { FLUSH_VERTICES(ctx, 0); - /* check target (proxies not allowed) */ - if (!legal_texsubimage_target(ctx, dims, target, dsa)) { - _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sSubImage%uD(target=%s)", - dsa ? 
"ture" : "", - dims, _mesa_lookup_enum_by_nr(target)); - return; - } - if (ctx->NewState & _NEW_PIXEL) _mesa_update_state(ctx); @@ -3572,6 +3586,13 @@ texsubimage(struct gl_context *ctx, GLuint dims, GLenum target, GLint level, struct gl_texture_object *texObj; struct gl_texture_image *texImage; + /* check target (proxies not allowed) */ + if (!legal_texsubimage_target(ctx, dims, target, false)) { + _mesa_error(ctx, GL_INVALID_ENUM, "glTexSubImage%uD(target=%s)", + dims, _mesa_enum_to_string(target)); + return; + } + texObj = _mesa_get_current_tex_object(ctx, target); if (!texObj) return; @@ -3589,10 +3610,10 @@ texsubimage(struct gl_context *ctx, GLuint dims, GLenum target, GLint level, if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE)) _mesa_debug(ctx, "glTexSubImage%uD %s %d %d %d %d %d %d %d %s %s %p\n", dims, - _mesa_lookup_enum_by_nr(target), level, + _mesa_enum_to_string(target), level, xoffset, yoffset, zoffset, width, height, depth, - _mesa_lookup_enum_by_nr(format), - _mesa_lookup_enum_by_nr(type), pixels); + _mesa_enum_to_string(format), + _mesa_enum_to_string(type), pixels); _mesa_texture_sub_image(ctx, dims, texObj, texImage, target, level, xoffset, yoffset, zoffset, width, height, depth, @@ -3621,8 +3642,8 @@ texturesubimage(struct gl_context *ctx, GLuint dims, "glTextureSubImage%uD %d %d %d %d %d %d %d %d %s %s %p\n", dims, texture, level, xoffset, yoffset, zoffset, width, height, depth, - _mesa_lookup_enum_by_nr(format), - _mesa_lookup_enum_by_nr(type), pixels); + _mesa_enum_to_string(format), + _mesa_enum_to_string(type), pixels); /* Get the texture object by Name. */ texObj = _mesa_lookup_texture(ctx, texture); @@ -3632,6 +3653,13 @@ texturesubimage(struct gl_context *ctx, GLuint dims, return; } + /* check target (proxies not allowed) */ + if (!legal_texsubimage_target(ctx, dims, texObj->Target, true)) { + _mesa_error(ctx, GL_INVALID_ENUM, "%s(target=%s)", + callerName, _mesa_enum_to_string(texObj->Target)); + return; + } + if (texsubimage_error_check(ctx, dims, texObj, texObj->Target, level, xoffset, yoffset, zoffset, width, height, depth, format, type, @@ -3842,8 +3870,7 @@ copytexsubimage_by_slice(struct gl_context *ctx, } static GLboolean -formats_differ_in_component_sizes (mesa_format f1, - mesa_format f2) +formats_differ_in_component_sizes(mesa_format f1, mesa_format f2) { GLint f1_r_bits = _mesa_get_format_bits(f1, GL_RED_BITS); GLint f1_g_bits = _mesa_get_format_bits(f1, GL_GREEN_BITS); @@ -3883,8 +3910,8 @@ copyteximage(struct gl_context *ctx, GLuint dims, if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE)) _mesa_debug(ctx, "glCopyTexImage%uD %s %d %s %d %d %d %d %d\n", dims, - _mesa_lookup_enum_by_nr(target), level, - _mesa_lookup_enum_by_nr(internalFormat), + _mesa_enum_to_string(target), level, + _mesa_enum_to_string(internalFormat), x, y, width, height, border); if (ctx->NewState & NEW_COPY_TEX_STATE) @@ -3916,8 +3943,8 @@ copyteximage(struct gl_context *ctx, GLuint dims, */ if (rb->InternalFormat == GL_RGB10_A2) { _mesa_error(ctx, GL_INVALID_OPERATION, - "glCopyTexImage%uD(Reading from GL_RGB10_A2 buffer and" - " writing to unsized internal format)", dims); + "glCopyTexImage%uD(Reading from GL_RGB10_A2 buffer" + " and writing to unsized internal format)", dims); return; } } @@ -4043,7 +4070,7 @@ _mesa_copy_texture_sub_image(struct gl_context *ctx, GLuint dims, if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE)) _mesa_debug(ctx, "%s %s %d %d %d %d %d %d %d %d\n", caller, - _mesa_lookup_enum_by_nr(target), + _mesa_enum_to_string(target), level, xoffset, yoffset, 
zoffset, x, y, width, height); if (ctx->NewState & NEW_COPY_TEX_STATE) @@ -4105,7 +4132,7 @@ _mesa_CopyTexSubImage1D( GLenum target, GLint level, */ if (!legal_texsubimage_target(ctx, 1, target, false)) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self, - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return; } @@ -4133,7 +4160,7 @@ _mesa_CopyTexSubImage2D( GLenum target, GLint level, */ if (!legal_texsubimage_target(ctx, 2, target, false)) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self, - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return; } @@ -4162,7 +4189,7 @@ _mesa_CopyTexSubImage3D( GLenum target, GLint level, */ if (!legal_texsubimage_target(ctx, 3, target, false)) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self, - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return; } @@ -4190,7 +4217,7 @@ _mesa_CopyTextureSubImage1D(GLuint texture, GLint level, /* Check target (proxies not allowed). */ if (!legal_texsubimage_target(ctx, 1, texObj->Target, true)) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self, - _mesa_lookup_enum_by_nr(texObj->Target)); + _mesa_enum_to_string(texObj->Target)); return; } @@ -4214,7 +4241,7 @@ _mesa_CopyTextureSubImage2D(GLuint texture, GLint level, /* Check target (proxies not allowed). */ if (!legal_texsubimage_target(ctx, 2, texObj->Target, true)) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self, - _mesa_lookup_enum_by_nr(texObj->Target)); + _mesa_enum_to_string(texObj->Target)); return; } @@ -4241,7 +4268,7 @@ _mesa_CopyTextureSubImage3D(GLuint texture, GLint level, /* Check target (proxies not allowed). */ if (!legal_texsubimage_target(ctx, 3, texObj->Target, true)) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self, - _mesa_lookup_enum_by_nr(texObj->Target)); + _mesa_enum_to_string(texObj->Target)); return; } @@ -4288,8 +4315,8 @@ check_clear_tex_image(struct gl_context *ctx, _mesa_error(ctx, err, "%s(incompatible format = %s, type = %s)", function, - _mesa_lookup_enum_by_nr(format), - _mesa_lookup_enum_by_nr(type)); + _mesa_enum_to_string(format), + _mesa_enum_to_string(type)); return false; } @@ -4298,8 +4325,8 @@ check_clear_tex_image(struct gl_context *ctx, _mesa_error(ctx, GL_INVALID_OPERATION, "%s(incompatible internalFormat = %s, format = %s)", function, - _mesa_lookup_enum_by_nr(internalFormat), - _mesa_lookup_enum_by_nr(format)); + _mesa_enum_to_string(internalFormat), + _mesa_enum_to_string(format)); return false; } @@ -4541,7 +4568,7 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target, if (dsa && target == GL_TEXTURE_RECTANGLE) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid target %s)", caller, - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return GL_TRUE; } @@ -4549,13 +4576,15 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target, case 2: switch (target) { case GL_TEXTURE_2D: + targetOK = GL_TRUE; + break; case GL_TEXTURE_CUBE_MAP_POSITIVE_X: case GL_TEXTURE_CUBE_MAP_NEGATIVE_X: case GL_TEXTURE_CUBE_MAP_POSITIVE_Y: case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y: case GL_TEXTURE_CUBE_MAP_POSITIVE_Z: case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z: - targetOK = GL_TRUE; + targetOK = ctx->Extensions.ARB_texture_cube_map; break; default: targetOK = GL_FALSE; @@ -4563,52 +4592,59 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target, } break; case 3: - targetOK = (target == GL_TEXTURE_3D) || - (target == 
GL_TEXTURE_2D_ARRAY) || - (target == GL_TEXTURE_CUBE_MAP_ARRAY) || - (target == GL_TEXTURE_CUBE_MAP && dsa); - - /* OpenGL 4.5 spec (30.10.2014) says in Section 8.7 Compressed Texture - * Images: - * "An INVALID_OPERATION error is generated by - * CompressedTex*SubImage3D if the internal format of the texture is - * one of the EAC, ETC2, or RGTC formats and either border is - * non-zero, or the effective target for the texture is not - * TEXTURE_2D_ARRAY." - */ - if (target != GL_TEXTURE_2D_ARRAY) { - bool invalidformat; + switch (target) { + case GL_TEXTURE_CUBE_MAP: + targetOK = dsa && ctx->Extensions.ARB_texture_cube_map; + break; + case GL_TEXTURE_2D_ARRAY: + targetOK = _mesa_is_gles3(ctx) || + (_mesa_is_desktop_gl(ctx) && ctx->Extensions.EXT_texture_array); + break; + case GL_TEXTURE_CUBE_MAP_ARRAY: + targetOK = ctx->Extensions.ARB_texture_cube_map_array; + break; + case GL_TEXTURE_3D: + targetOK = GL_TRUE; + /* + * OpenGL 4.5 spec (30.10.2014) says in Section 8.7 Compressed Texture + * Images: + * "An INVALID_OPERATION error is generated by + * CompressedTex*SubImage3D if the internal format of the texture + * is one of the EAC, ETC2, or RGTC formats and either border is + * non-zero, or the effective target for the texture is not + * TEXTURE_2D_ARRAY." + * + * NOTE: that's probably a spec error. It should probably say + * "... or the effective target for the texture is not + * TEXTURE_2D_ARRAY, TEXTURE_CUBE_MAP, nor + * GL_TEXTURE_CUBE_MAP_ARRAY." + * since those targets are 2D images and they support all compression + * formats. + * + * Instead of listing all these, just list those which are allowed, + * which is (at this time) only bptc. Otherwise we'd say s3tc (and + * more) are valid here, which they are not, but of course not + * mentioned by core spec. + */ switch (format) { - /* These came from _mesa_is_compressed_format in glformats.c. 
*/ - /* EAC formats */ - case GL_COMPRESSED_RGBA8_ETC2_EAC: - case GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC: - case GL_COMPRESSED_R11_EAC: - case GL_COMPRESSED_RG11_EAC: - case GL_COMPRESSED_SIGNED_R11_EAC: - case GL_COMPRESSED_SIGNED_RG11_EAC: - /* ETC2 formats */ - case GL_COMPRESSED_RGB8_ETC2: - case GL_COMPRESSED_SRGB8_ETC2: - case GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2: - case GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: - /* RGTC formats */ - case GL_COMPRESSED_RED_RGTC1: - case GL_COMPRESSED_SIGNED_RED_RGTC1: - case GL_COMPRESSED_RG_RGTC2: - case GL_COMPRESSED_SIGNED_RG_RGTC2: - invalidformat = true; - break; - default: - invalidformat = false; - } - if (invalidformat) { + /* These are the only 3D compression formats supported at this time */ + case GL_COMPRESSED_RGBA_BPTC_UNORM: + case GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM: + case GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT: + case GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT: + /* valid format */ + break; + default: + /* invalid format */ _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid target %s for format %s)", caller, - _mesa_lookup_enum_by_nr(target), - _mesa_lookup_enum_by_nr(format)); + _mesa_enum_to_string(target), + _mesa_enum_to_string(format)); return GL_TRUE; } + break; + default: + targetOK = GL_FALSE; } break; @@ -4621,7 +4657,7 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target, if (!targetOK) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", caller, - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return GL_TRUE; } @@ -4834,8 +4870,7 @@ _mesa_CompressedTextureSubImage1D(GLuint texture, GLint level, GLint xoffset, if (!texObj) return; - if (compressed_subtexture_target_check(ctx, texObj->Target, 1, format, - true, + if (compressed_subtexture_target_check(ctx, texObj->Target, 1, format, true, "glCompressedTextureSubImage1D")) { return; } @@ -4912,8 +4947,7 @@ _mesa_CompressedTextureSubImage2D(GLuint texture, GLint level, GLint xoffset, if (!texObj) return; - if (compressed_subtexture_target_check(ctx, texObj->Target, 2, format, - true, + if (compressed_subtexture_target_check(ctx, texObj->Target, 2, format, true, "glCompressedTextureSubImage2D")) { return; } @@ -4990,8 +5024,7 @@ _mesa_CompressedTextureSubImage3D(GLuint texture, GLint level, GLint xoffset, if (!texObj) return; - if (compressed_subtexture_target_check(ctx, texObj->Target, 3, format, - true, + if (compressed_subtexture_target_check(ctx, texObj->Target, 3, format, true, "glCompressedTextureSubImage3D")) { return; } @@ -5440,7 +5473,6 @@ _mesa_TexBufferRange(GLenum target, GLenum internalFormat, GLuint buffer, return; } else { - /* OpenGL 4.5 core spec (02.02.2015) says in Section 8.9 Buffer * Textures (PDF page 254): * "If buffer is zero, then any buffer object attached to the buffer @@ -5508,7 +5540,6 @@ _mesa_TextureBufferRange(GLuint texture, GLenum internalFormat, GLuint buffer, return; } else { - /* OpenGL 4.5 core spec (02.02.2015) says in Section 8.9 Buffer * Textures (PDF page 254): * "If buffer is zero, then any buffer object attached to the buffer @@ -5554,19 +5585,17 @@ check_multisample_target(GLuint dims, GLenum target, bool dsa) return dims == 2; case GL_PROXY_TEXTURE_2D_MULTISAMPLE: return dims == 2 && !dsa; - case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: return dims == 3; case GL_PROXY_TEXTURE_2D_MULTISAMPLE_ARRAY: return dims == 3 && !dsa; - default: return GL_FALSE; } } -void +static void _mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims, struct gl_texture_object *texObj, GLenum 
target, GLsizei samples, @@ -5581,8 +5610,8 @@ _mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims, GLenum sample_count_error; bool dsa = strstr(func, "ture") ? true : false; - if (!(ctx->Extensions.ARB_texture_multisample - && _mesa_is_desktop_gl(ctx))) { + if (!((ctx->Extensions.ARB_texture_multisample + && _mesa_is_desktop_gl(ctx))) && !_mesa_is_gles31(ctx)) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(unsupported)", func); return; } @@ -5605,14 +5634,21 @@ _mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims, if (immutable && !_mesa_is_legal_tex_storage_format(ctx, internalformat)) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(internalformat=%s not legal for immutable-format)", - func, _mesa_lookup_enum_by_nr(internalformat)); + func, _mesa_enum_to_string(internalformat)); return; } if (!is_renderable_texture_format(ctx, internalformat)) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "%s(internalformat=%s)", - func, _mesa_lookup_enum_by_nr(internalformat)); + /* Page 172 of OpenGL ES 3.1 spec says: + * "An INVALID_ENUM error is generated if sizedinternalformat is not + * color-renderable, depth-renderable, or stencil-renderable (as + * defined in section 9.4). + * + * (Same error is also defined for desktop OpenGL for multisample + * teximage/texstorage functions.) + */ + _mesa_error(ctx, GL_INVALID_ENUM, "%s(internalformat=%s)", func, + _mesa_enum_to_string(internalformat)); return; } @@ -5671,13 +5707,12 @@ _mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims, else { if (!dimensionsOK) { _mesa_error(ctx, GL_INVALID_VALUE, - "%s(invalid width or height)", func); + "%s(invalid width or height)", func); return; } if (!sizeOK) { - _mesa_error(ctx, GL_OUT_OF_MEMORY, - "%s(texture too large)", func); + _mesa_error(ctx, GL_OUT_OF_MEMORY, "%s(texture too large)", func); return; } @@ -5695,7 +5730,7 @@ _mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims, if (width > 0 && height > 0 && depth > 0) { if (!ctx->Driver.AllocTextureStorage(ctx, texObj, 1, - width, height, depth)) { + width, height, depth)) { /* tidy up the texture image state. strictly speaking, * we're allowed to just leave this in whatever state we * like, but being tidy is good. 
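For context on the CompressedTexSubImage3D rework above: the new switch accepts GL_TEXTURE_3D only for the BPTC formats, since those are currently the only compressed formats with genuine 3D support; EAC, ETC2, RGTC and S3TC uploads to a 3D target are rejected with GL_INVALID_OPERATION. A minimal standalone sketch of that rule follows (simplified helper name and structure, not Mesa's actual function, assuming a GL header that defines the BPTC tokens):

/* Illustrative only: condensed form of the 3D compressed-subimage format
 * rule from the hunk above. Helper name and layout are assumptions.
 */
#include <stdbool.h>
#include <GL/gl.h>
#include <GL/glext.h>   /* assumed to provide the BPTC enum tokens */

static bool
compressed_format_valid_for_texture_3d(GLenum format)
{
   switch (format) {
   case GL_COMPRESSED_RGBA_BPTC_UNORM:
   case GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM:
   case GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT:
   case GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT:
      return true;   /* only BPTC has 3D support at this time */
   default:
      return false;  /* EAC/ETC2/RGTC/S3TC etc. -> GL_INVALID_OPERATION */
   }
}
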
diff --git a/src/mesa/main/teximage.h b/src/mesa/main/teximage.h index 1eebaa8b631..bf729daf534 100644 --- a/src/mesa/main/teximage.h +++ b/src/mesa/main/teximage.h @@ -200,15 +200,6 @@ _mesa_copy_texture_sub_image(struct gl_context *ctx, GLuint dims, const char *caller); extern void -_mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims, - struct gl_texture_object *texObj, - GLenum target, GLsizei samples, - GLint internalformat, GLsizei width, - GLsizei height, GLsizei depth, - GLboolean fixedsamplelocations, - GLboolean immutable, const char *func); - -extern void _mesa_texture_buffer_range(struct gl_context *ctx, struct gl_texture_object *texObj, GLenum internalFormat, diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c index c563f1e7434..cd7cfd6a4fb 100644 --- a/src/mesa/main/texobj.c +++ b/src/mesa/main/texobj.c @@ -1255,7 +1255,7 @@ create_textures(struct gl_context *ctx, GLenum target, if (targetIndex < 0) { /* Bad Target */ mtx_unlock(&ctx->Shared->Mutex); _mesa_error(ctx, GL_INVALID_ENUM, "gl%sTextures(target = %s)", - func, _mesa_lookup_enum_by_nr(texObj->Target)); + func, _mesa_enum_to_string(texObj->Target)); return; } assert(targetIndex < NUM_TEXTURE_TARGETS); @@ -1606,8 +1606,8 @@ _mesa_tex_target_to_index(const struct gl_context *ctx, GLenum target) return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_cube_map_array ? TEXTURE_CUBE_ARRAY_INDEX : -1; case GL_TEXTURE_2D_MULTISAMPLE: - return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_multisample - ? TEXTURE_2D_MULTISAMPLE_INDEX: -1; + return ((_mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_multisample) || + _mesa_is_gles31(ctx)) ? TEXTURE_2D_MULTISAMPLE_INDEX: -1; case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_multisample ? 
TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX: -1; @@ -1642,7 +1642,7 @@ _mesa_BindTexture( GLenum target, GLuint texName ) if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE)) _mesa_debug(ctx, "glBindTexture %s %d\n", - _mesa_lookup_enum_by_nr(target), (GLint) texName); + _mesa_enum_to_string(target), (GLint) texName); targetIndex = _mesa_tex_target_to_index(ctx, target); if (targetIndex < 0) { @@ -1806,7 +1806,7 @@ _mesa_BindTextureUnit(GLuint unit, GLuint texture) if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE)) _mesa_debug(ctx, "glBindTextureUnit %s %d\n", - _mesa_lookup_enum_by_nr(GL_TEXTURE0+unit), (GLint) texture); + _mesa_enum_to_string(GL_TEXTURE0+unit), (GLint) texture); /* Section 8.1 (Texture Objects) of the OpenGL 4.5 core profile spec * (20141030) says: diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c index d74134f41b1..c0611c3e489 100644 --- a/src/mesa/main/texparam.c +++ b/src/mesa/main/texparam.c @@ -381,7 +381,7 @@ set_tex_parameteri(struct gl_context *ctx, if (texObj->Target == GL_TEXTURE_RECTANGLE_ARB && params[0] != 0) { _mesa_error(ctx, GL_INVALID_OPERATION, "glTex%sParameter(target=%s, param=%d)", suffix, - _mesa_lookup_enum_by_nr(texObj->Target), params[0]); + _mesa_enum_to_string(texObj->Target), params[0]); return GL_FALSE; } incomplete(ctx, texObj); @@ -500,7 +500,9 @@ set_tex_parameteri(struct gl_context *ctx, goto invalid_pname; case GL_DEPTH_STENCIL_TEXTURE_MODE: - if (_mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_stencil_texturing) { + if ((_mesa_is_desktop_gl(ctx) && + ctx->Extensions.ARB_stencil_texturing) || + _mesa_is_gles31(ctx)) { bool stencil = params[0] == GL_STENCIL_INDEX; if (!stencil && params[0] != GL_DEPTH_COMPONENT) goto invalid_param; @@ -610,22 +612,22 @@ set_tex_parameteri(struct gl_context *ctx, invalid_pname: _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sParameter(pname=%s)", - suffix, _mesa_lookup_enum_by_nr(pname)); + suffix, _mesa_enum_to_string(pname)); return GL_FALSE; invalid_param: _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sParameter(param=%s)", - suffix, _mesa_lookup_enum_by_nr(params[0])); + suffix, _mesa_enum_to_string(params[0])); return GL_FALSE; invalid_operation: _mesa_error(ctx, GL_INVALID_OPERATION, "glTex%sParameter(pname=%s)", - suffix, _mesa_lookup_enum_by_nr(pname)); + suffix, _mesa_enum_to_string(pname)); return GL_FALSE; invalid_enum: _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sParameter(pname=%s)", - suffix, _mesa_lookup_enum_by_nr(pname)); + suffix, _mesa_enum_to_string(pname)); return GL_FALSE; } @@ -683,7 +685,7 @@ set_tex_parameterf(struct gl_context *ctx, if (texObj->Sampler.MaxAnisotropy == params[0]) return GL_FALSE; - if (params[0] < 1.0) { + if (params[0] < 1.0F) { _mesa_error(ctx, GL_INVALID_VALUE, "glTex%sParameter(param)", suffix); return GL_FALSE; @@ -745,12 +747,12 @@ set_tex_parameterf(struct gl_context *ctx, invalid_pname: _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sParameter(pname=%s)", - suffix, _mesa_lookup_enum_by_nr(pname)); + suffix, _mesa_enum_to_string(pname)); return GL_FALSE; invalid_enum: _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sParameter(pname=%s)", - suffix, _mesa_lookup_enum_by_nr(pname)); + suffix, _mesa_enum_to_string(pname)); return GL_FALSE; } @@ -1395,7 +1397,7 @@ get_tex_level_parameter_image(struct gl_context *ctx, else { _mesa_error(ctx, GL_INVALID_OPERATION, "glGetTex%sLevelParameter[if]v(pname=%s)", suffix, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); } break; case GL_TEXTURE_COMPRESSED: @@ -1444,7 +1446,7 @@ get_tex_level_parameter_image(struct gl_context 
*ctx, invalid_pname: _mesa_error(ctx, GL_INVALID_ENUM, "glGetTex%sLevelParameter[if]v(pname=%s)", suffix, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); } @@ -1528,7 +1530,7 @@ get_tex_level_parameter_buffer(struct gl_context *ctx, /* Always illegal for GL_TEXTURE_BUFFER */ _mesa_error(ctx, GL_INVALID_OPERATION, "glGetTex%sLevelParameter[if]v(pname=%s)", suffix, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); break; /* GL_ARB_texture_float */ @@ -1557,7 +1559,7 @@ get_tex_level_parameter_buffer(struct gl_context *ctx, invalid_pname: _mesa_error(ctx, GL_INVALID_ENUM, "glGetTex%sLevelParameter[if]v(pname=%s)", suffix, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); } @@ -1586,7 +1588,7 @@ get_tex_level_parameteriv(struct gl_context *ctx, if (!legal_get_tex_level_parameter_target(ctx, target, dsa)) { _mesa_error(ctx, GL_INVALID_ENUM, "glGetTex%sLevelParameter[if]v(target=%s)", suffix, - _mesa_lookup_enum_by_nr(target)); + _mesa_enum_to_string(target)); return; } diff --git a/src/mesa/main/texstate.c b/src/mesa/main/texstate.c index 1af9d47f030..9b5928c4306 100644 --- a/src/mesa/main/texstate.c +++ b/src/mesa/main/texstate.c @@ -123,21 +123,21 @@ _mesa_print_texunit_state( struct gl_context *ctx, GLuint unit ) { const struct gl_texture_unit *texUnit = ctx->Texture.Unit + unit; printf("Texture Unit %d\n", unit); - printf(" GL_TEXTURE_ENV_MODE = %s\n", _mesa_lookup_enum_by_nr(texUnit->EnvMode)); - printf(" GL_COMBINE_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.ModeRGB)); - printf(" GL_COMBINE_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.ModeA)); - printf(" GL_SOURCE0_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceRGB[0])); - printf(" GL_SOURCE1_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceRGB[1])); - printf(" GL_SOURCE2_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceRGB[2])); - printf(" GL_SOURCE0_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceA[0])); - printf(" GL_SOURCE1_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceA[1])); - printf(" GL_SOURCE2_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceA[2])); - printf(" GL_OPERAND0_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandRGB[0])); - printf(" GL_OPERAND1_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandRGB[1])); - printf(" GL_OPERAND2_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandRGB[2])); - printf(" GL_OPERAND0_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandA[0])); - printf(" GL_OPERAND1_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandA[1])); - printf(" GL_OPERAND2_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandA[2])); + printf(" GL_TEXTURE_ENV_MODE = %s\n", _mesa_enum_to_string(texUnit->EnvMode)); + printf(" GL_COMBINE_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.ModeRGB)); + printf(" GL_COMBINE_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.ModeA)); + printf(" GL_SOURCE0_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceRGB[0])); + printf(" GL_SOURCE1_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceRGB[1])); + printf(" GL_SOURCE2_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceRGB[2])); + printf(" GL_SOURCE0_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceA[0])); + printf(" GL_SOURCE1_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceA[1])); + printf(" GL_SOURCE2_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceA[2])); + printf(" 
GL_OPERAND0_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandRGB[0])); + printf(" GL_OPERAND1_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandRGB[1])); + printf(" GL_OPERAND2_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandRGB[2])); + printf(" GL_OPERAND0_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandA[0])); + printf(" GL_OPERAND1_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandA[1])); + printf(" GL_OPERAND2_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandA[2])); printf(" GL_RGB_SCALE = %d\n", 1 << texUnit->Combine.ScaleShiftRGB); printf(" GL_ALPHA_SCALE = %d\n", 1 << texUnit->Combine.ScaleShiftA); printf(" GL_TEXTURE_ENV_COLOR = (%f, %f, %f, %f)\n", texUnit->EnvColor[0], texUnit->EnvColor[1], texUnit->EnvColor[2], texUnit->EnvColor[3]); @@ -289,23 +289,23 @@ _mesa_ActiveTexture(GLenum texture) GLuint k; GET_CURRENT_CONTEXT(ctx); + if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE)) + _mesa_debug(ctx, "glActiveTexture %s\n", + _mesa_enum_to_string(texture)); + + if (ctx->Texture.CurrentUnit == texUnit) + return; + k = _mesa_max_tex_unit(ctx); assert(k <= ARRAY_SIZE(ctx->Texture.Unit)); - if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE)) - _mesa_debug(ctx, "glActiveTexture %s\n", - _mesa_lookup_enum_by_nr(texture)); - if (texUnit >= k) { _mesa_error(ctx, GL_INVALID_ENUM, "glActiveTexture(texture=%s)", - _mesa_lookup_enum_by_nr(texture)); + _mesa_enum_to_string(texture)); return; } - if (ctx->Texture.CurrentUnit == texUnit) - return; - FLUSH_VERTICES(ctx, _NEW_TEXTURE); ctx->Texture.CurrentUnit = texUnit; @@ -325,16 +325,16 @@ _mesa_ClientActiveTexture(GLenum texture) if (MESA_VERBOSE & (VERBOSE_API | VERBOSE_TEXTURE)) _mesa_debug(ctx, "glClientActiveTexture %s\n", - _mesa_lookup_enum_by_nr(texture)); + _mesa_enum_to_string(texture)); + + if (ctx->Array.ActiveTexture == texUnit) + return; if (texUnit >= ctx->Const.MaxTextureCoordUnits) { _mesa_error(ctx, GL_INVALID_ENUM, "glClientActiveTexture(texture)"); return; } - if (ctx->Array.ActiveTexture == texUnit) - return; - FLUSH_VERTICES(ctx, _NEW_ARRAY); ctx->Array.ActiveTexture = texUnit; } diff --git a/src/mesa/main/texstate.h b/src/mesa/main/texstate.h index 662435b47cc..bee8c9c3316 100644 --- a/src/mesa/main/texstate.h +++ b/src/mesa/main/texstate.h @@ -77,7 +77,7 @@ _mesa_get_tex_unit_err(struct gl_context *ctx, GLuint unit, const char *func) * implementation." */ _mesa_error(ctx, GL_INVALID_OPERATION, "%s(unit=%s)", func, - _mesa_lookup_enum_by_nr(GL_TEXTURE0+unit)); + _mesa_enum_to_string(GL_TEXTURE0+unit)); return NULL; } diff --git a/src/mesa/main/texstorage.c b/src/mesa/main/texstorage.c index 53cb2c091f8..4a2cc6065df 100644 --- a/src/mesa/main/texstorage.c +++ b/src/mesa/main/texstorage.c @@ -308,7 +308,8 @@ tex_storage_error_check(struct gl_context *ctx, _mesa_error(ctx, _mesa_is_desktop_gl(ctx)? 
GL_INVALID_ENUM : GL_INVALID_OPERATION, "glTex%sStorage%dD(internalformat = %s)", suffix, dims, - _mesa_lookup_enum_by_nr(internalformat)); + _mesa_enum_to_string(internalformat)); + return GL_TRUE; } /* levels check */ @@ -464,21 +465,21 @@ texstorage(GLuint dims, GLenum target, GLsizei levels, GLenum internalformat, if (!legal_texobj_target(ctx, dims, target)) { _mesa_error(ctx, GL_INVALID_ENUM, "glTexStorage%uD(illegal target=%s)", - dims, _mesa_lookup_enum_by_nr(target)); + dims, _mesa_enum_to_string(target)); return; } if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE)) _mesa_debug(ctx, "glTexStorage%uD %s %d %s %d %d %d\n", dims, - _mesa_lookup_enum_by_nr(target), levels, - _mesa_lookup_enum_by_nr(internalformat), + _mesa_enum_to_string(target), levels, + _mesa_enum_to_string(internalformat), width, height, depth); /* Check the format to make sure it is sized. */ if (!_mesa_is_legal_tex_storage_format(ctx, internalformat)) { _mesa_error(ctx, GL_INVALID_ENUM, "glTexStorage%uD(internalformat = %s)", dims, - _mesa_lookup_enum_by_nr(internalformat)); + _mesa_enum_to_string(internalformat)); return; } @@ -504,14 +505,14 @@ texturestorage(GLuint dims, GLuint texture, GLsizei levels, if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE)) _mesa_debug(ctx, "glTextureStorage%uD %d %d %s %d %d %d\n", dims, texture, levels, - _mesa_lookup_enum_by_nr(internalformat), + _mesa_enum_to_string(internalformat), width, height, depth); /* Check the format to make sure it is sized. */ if (!_mesa_is_legal_tex_storage_format(ctx, internalformat)) { _mesa_error(ctx, GL_INVALID_ENUM, "glTextureStorage%uD(internalformat = %s)", dims, - _mesa_lookup_enum_by_nr(internalformat)); + _mesa_enum_to_string(internalformat)); return; } @@ -529,7 +530,7 @@ texturestorage(GLuint dims, GLuint texture, GLsizei levels, if (!legal_texobj_target(ctx, dims, texObj->Target)) { _mesa_error(ctx, GL_INVALID_ENUM, "glTextureStorage%uD(illegal target=%s)", - dims, _mesa_lookup_enum_by_nr(texObj->Target)); + dims, _mesa_enum_to_string(texObj->Target)); return; } diff --git a/src/mesa/main/texstore.c b/src/mesa/main/texstore.c index 1525205981b..37c05690091 100644 --- a/src/mesa/main/texstore.c +++ b/src/mesa/main/texstore.c @@ -787,6 +787,7 @@ texstore_rgba(TEXSTORE_PARAMS) srcType = GL_FLOAT; srcRowStride = srcWidth * 4 * sizeof(float); srcMesaFormat = RGBA32_FLOAT; + srcPacking = &ctx->DefaultPacking; } src = (GLubyte *) diff --git a/src/mesa/main/textureview.c b/src/mesa/main/textureview.c index 6b0aed4ea1a..5a3282a40c1 100644 --- a/src/mesa/main/textureview.c +++ b/src/mesa/main/textureview.c @@ -313,7 +313,7 @@ target_valid(struct gl_context *ctx, GLenum origTarget, GLenum newTarget) } _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureView(illegal target=%s)", - _mesa_lookup_enum_by_nr(newTarget)); + _mesa_enum_to_string(newTarget)); return false; } #undef RETURN_IF_SUPPORTED @@ -435,8 +435,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture, if (MESA_VERBOSE & (VERBOSE_API | VERBOSE_TEXTURE)) _mesa_debug(ctx, "glTextureView %d %s %d %s %d %d %d %d\n", - texture, _mesa_lookup_enum_by_nr(target), origtexture, - _mesa_lookup_enum_by_nr(internalformat), + texture, _mesa_enum_to_string(target), origtexture, + _mesa_enum_to_string(internalformat), minlevel, numlevels, minlayer, numlayers); if (origtexture == 0) { @@ -523,8 +523,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture, internalformat)) { _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureView(internalformat %s not compatible with 
origtexture %s)", - _mesa_lookup_enum_by_nr(internalformat), - _mesa_lookup_enum_by_nr(origTexObj->Image[0][0]->InternalFormat)); + _mesa_enum_to_string(internalformat), + _mesa_enum_to_string(origTexObj->Image[0][0]->InternalFormat)); return; } diff --git a/src/mesa/main/uniform_query.cpp b/src/mesa/main/uniform_query.cpp index cab5083e81b..036530e91b6 100644 --- a/src/mesa/main/uniform_query.cpp +++ b/src/mesa/main/uniform_query.cpp @@ -978,81 +978,6 @@ _mesa_uniform_matrix(struct gl_context *ctx, struct gl_shader_program *shProg, } -/** - * Called via glGetUniformLocation(). - * - * Returns the uniform index into UniformStorage (also the - * glGetActiveUniformsiv uniform index), and stores the referenced - * array offset in *offset, or GL_INVALID_INDEX (-1). - */ -extern "C" unsigned -_mesa_get_uniform_location(struct gl_shader_program *shProg, - const GLchar *name, - unsigned *out_offset) -{ - /* Page 80 (page 94 of the PDF) of the OpenGL 2.1 spec says: - * - * "The first element of a uniform array is identified using the - * name of the uniform array appended with "[0]". Except if the last - * part of the string name indicates a uniform array, then the - * location of the first element of that array can be retrieved by - * either using the name of the uniform array, or the name of the - * uniform array appended with "[0]"." - * - * Note: since uniform names are not allowed to use whitespace, and array - * indices within uniform names are not allowed to use "+", "-", or leading - * zeros, it follows that each uniform has a unique name up to the possible - * ambiguity with "[0]" noted above. Therefore we don't need to worry - * about mal-formed inputs--they will properly fail when we try to look up - * the uniform name in shProg->UniformHash. - */ - - const GLchar *base_name_end; - long offset = parse_program_resource_name(name, &base_name_end); - bool array_lookup = offset >= 0; - char *name_copy; - - if (array_lookup) { - name_copy = (char *) malloc(base_name_end - name + 1); - memcpy(name_copy, name, base_name_end - name); - name_copy[base_name_end - name] = '\0'; - } else { - name_copy = (char *) name; - offset = 0; - } - - unsigned location = 0; - const bool found = shProg->UniformHash->get(location, name_copy); - - assert(!found - || strcmp(name_copy, shProg->UniformStorage[location].name) == 0); - - /* Free the temporary buffer *before* possibly returning an error. - */ - if (name_copy != name) - free(name_copy); - - if (!found) - return GL_INVALID_INDEX; - - /* If the uniform is built-in, fail. */ - if (shProg->UniformStorage[location].builtin) - return GL_INVALID_INDEX; - - /* If the uniform is an array, fail if the index is out of bounds. - * (A negative index is caught above.) This also fails if the uniform - * is not an array, but the user is trying to index it, because - * array_elements is zero and offset >= 0. - */ - if (array_lookup - && offset >= (long) shProg->UniformStorage[location].array_elements) { - return GL_INVALID_INDEX; - } - - *out_offset = offset; - return location; -} - extern "C" bool _mesa_sampler_uniforms_are_valid(const struct gl_shader_program *shProg, char *errMsg, size_t errMsgLength) @@ -1101,18 +1026,23 @@ _mesa_sampler_uniforms_pipeline_are_valid(struct gl_pipeline_object *pipeline) for (unsigned i = 0; i < shProg[idx]->NumUniformStorage; i++) { const struct gl_uniform_storage *const storage = &shProg[idx]->UniformStorage[i]; - const glsl_type *const t = (storage->type->is_array()) - ? 
storage->type->fields.array : storage->type; - if (!t->is_sampler()) + if (!storage->type->is_sampler()) continue; active_samplers++; - const unsigned count = MAX2(1, storage->type->array_size()); + const unsigned count = MAX2(1, storage->array_elements); for (unsigned j = 0; j < count; j++) { const unsigned unit = storage->storage[j].i; + /* FIXME: Samplers are initialized to 0 and Mesa doesn't do a + * great job of eliminating unused uniforms currently so for now + * don't throw an error if two sampler types both point to 0. + */ + if (unit == 0) + continue; + /* The types of the samplers associated with a particular texture * unit must be an exact match. Page 74 (page 89 of the PDF) of * the OpenGL 3.3 core spec says: @@ -1122,13 +1052,14 @@ _mesa_sampler_uniforms_pipeline_are_valid(struct gl_pipeline_object *pipeline) * program object." */ if (unit_types[unit] == NULL) { - unit_types[unit] = t; - } else if (unit_types[unit] != t) { + unit_types[unit] = storage->type; + } else if (unit_types[unit] != storage->type) { pipeline->InfoLog = ralloc_asprintf(pipeline, "Texture unit %d is accessed both as %s " "and %s", - unit, unit_types[unit]->name, t->name); + unit, unit_types[unit]->name, + storage->type->name); return false; } } diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c index 5548d1d026f..ff1df72e1d6 100644 --- a/src/mesa/main/uniforms.c +++ b/src/mesa/main/uniforms.c @@ -952,7 +952,7 @@ _mesa_GetUniformBlockIndex(GLuint program, struct gl_program_resource *res = _mesa_program_resource_find_name(shProg, GL_UNIFORM_BLOCK, - uniformBlockName); + uniformBlockName, NULL); if (!res) return GL_INVALID_INDEX; @@ -987,7 +987,8 @@ _mesa_GetUniformIndices(GLuint program, for (i = 0; i < uniformCount; i++) { struct gl_program_resource *res = - _mesa_program_resource_find_name(shProg, GL_UNIFORM, uniformNames[i]); + _mesa_program_resource_find_name(shProg, GL_UNIFORM, uniformNames[i], + NULL); uniformIndices[i] = _mesa_program_resource_index(shProg, res); } } @@ -1092,6 +1093,21 @@ mesa_bufferiv(struct gl_shader_program *shProg, GLenum type, GL_REFERENCED_BY_VERTEX_SHADER, params, caller); return; + + case GL_UNIFORM_BLOCK_REFERENCED_BY_TESS_CONTROL_SHADER: + case GL_ATOMIC_COUNTER_BUFFER_REFERENCED_BY_TESS_CONTROL_SHADER: + _mesa_program_resource_prop(shProg, res, index, + GL_REFERENCED_BY_TESS_CONTROL_SHADER, params, + caller); + return; + + case GL_UNIFORM_BLOCK_REFERENCED_BY_TESS_EVALUATION_SHADER: + case GL_ATOMIC_COUNTER_BUFFER_REFERENCED_BY_TESS_EVALUATION_SHADER: + _mesa_program_resource_prop(shProg, res, index, + GL_REFERENCED_BY_TESS_EVALUATION_SHADER, params, + caller); + return; + case GL_UNIFORM_BLOCK_REFERENCED_BY_GEOMETRY_SHADER: case GL_ATOMIC_COUNTER_BUFFER_REFERENCED_BY_GEOMETRY_SHADER: _mesa_program_resource_prop(shProg, res, index, @@ -1104,16 +1120,10 @@ mesa_bufferiv(struct gl_shader_program *shProg, GLenum type, GL_REFERENCED_BY_FRAGMENT_SHADER, params, caller); return; - case GL_ATOMIC_COUNTER_BUFFER_REFERENCED_BY_TESS_CONTROL_SHADER: - params[0] = GL_FALSE; - return; - case GL_ATOMIC_COUNTER_BUFFER_REFERENCED_BY_TESS_EVALUATION_SHADER: - params[0] = GL_FALSE; - return; default: _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname 0x%x (%s))", caller, pname, - _mesa_lookup_enum_by_nr(pname)); + _mesa_enum_to_string(pname)); return; } } diff --git a/src/mesa/main/uniforms.h b/src/mesa/main/uniforms.h index bd7b05e207a..e62eaa53ccc 100644 --- a/src/mesa/main/uniforms.h +++ b/src/mesa/main/uniforms.h @@ -343,10 +343,6 @@ void GLAPIENTRY 
_mesa_ProgramUniformMatrix4x3dv(GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value); -unsigned -_mesa_get_uniform_location(struct gl_shader_program *shProg, - const GLchar *name, unsigned *offset); - void _mesa_uniform(struct gl_context *ctx, struct gl_shader_program *shader_program, GLint location, GLsizei count, diff --git a/src/mesa/main/varray.c b/src/mesa/main/varray.c index ebdd9eaf02e..3bab9850588 100644 --- a/src/mesa/main/varray.c +++ b/src/mesa/main/varray.c @@ -300,7 +300,7 @@ update_array_format(struct gl_context *ctx, typeBit = type_to_bit(ctx, type); if (typeBit == 0x0 || (typeBit & legalTypesMask) == 0x0) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(type = %s)", - func, _mesa_lookup_enum_by_nr(type)); + func, _mesa_enum_to_string(type)); return false; } @@ -333,7 +333,7 @@ update_array_format(struct gl_context *ctx, if (bgra_error) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(size=GL_BGRA and type=%s)", - func, _mesa_lookup_enum_by_nr(type)); + func, _mesa_enum_to_string(type)); return false; } @@ -2310,7 +2310,7 @@ print_array(const char *name, GLint index, const struct gl_client_array *array) else fprintf(stderr, " %s: ", name); fprintf(stderr, "Ptr=%p, Type=%s, Size=%d, ElemSize=%u, Stride=%d, Buffer=%u(Size %lu)\n", - array->Ptr, _mesa_lookup_enum_by_nr(array->Type), array->Size, + array->Ptr, _mesa_enum_to_string(array->Type), array->Size, array->_ElementSize, array->StrideB, array->BufferObj->Name, (unsigned long) array->BufferObj->Size); } diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c index 8bc00ace5c4..fd7ae53ccbd 100644 --- a/src/mesa/main/version.c +++ b/src/mesa/main/version.c @@ -309,7 +309,7 @@ compute_version(const struct gl_extensions *extensions, extensions->ARB_gpu_shader5 && extensions->ARB_gpu_shader_fp64 && extensions->ARB_sample_shading && - false /*extensions->ARB_shader_subroutine*/ && + extensions->ARB_shader_subroutine && extensions->ARB_tessellation_shader && extensions->ARB_texture_buffer_object_rgb32 && extensions->ARB_texture_cube_map_array && diff --git a/src/mesa/main/viewport.c b/src/mesa/main/viewport.c index b27063031c4..7d8914291c3 100644 --- a/src/mesa/main/viewport.c +++ b/src/mesa/main/viewport.c @@ -391,8 +391,8 @@ _mesa_ClipControl(GLenum origin, GLenum depth) if (MESA_VERBOSE&VERBOSE_API) _mesa_debug(ctx, "glClipControl(%s, %s)\n", - _mesa_lookup_enum_by_nr(origin), - _mesa_lookup_enum_by_nr(depth)); + _mesa_enum_to_string(origin), + _mesa_enum_to_string(depth)); ASSERT_OUTSIDE_BEGIN_END(ctx); @@ -443,12 +443,12 @@ _mesa_ClipControl(GLenum origin, GLenum depth) */ void _mesa_get_viewport_xform(struct gl_context *ctx, unsigned i, - double scale[3], double translate[3]) + float scale[3], float translate[3]) { - double x = ctx->ViewportArray[i].X; - double y = ctx->ViewportArray[i].Y; - double half_width = 0.5*ctx->ViewportArray[i].Width; - double half_height = 0.5*ctx->ViewportArray[i].Height; + float x = ctx->ViewportArray[i].X; + float y = ctx->ViewportArray[i].Y; + float half_width = 0.5f * ctx->ViewportArray[i].Width; + float half_height = 0.5f * ctx->ViewportArray[i].Height; double n = ctx->ViewportArray[i].Near; double f = ctx->ViewportArray[i].Far; @@ -462,8 +462,8 @@ _mesa_get_viewport_xform(struct gl_context *ctx, unsigned i, translate[1] = half_height + y; } if (ctx->Transform.ClipDepthMode == GL_NEGATIVE_ONE_TO_ONE) { - scale[2] = 0.5*(f - n); - translate[2] = 0.5*(n + f); + scale[2] = 0.5 * (f - n); + translate[2] = 0.5 * (n + f); } else { scale[2] = f - n; translate[2] = n; 
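To make the viewport.c change above concrete: _mesa_get_viewport_xform now fills float scale/translate vectors, with the X/Y terms derived from the viewport rectangle and the Z terms from the depth range, which differ between GL_NEGATIVE_ONE_TO_ONE and zero-to-one clip depth modes. Below is a standalone sketch of the same math (hypothetical function name, explicit parameters instead of ctx->ViewportArray[i], and omitting the clip-origin Y flip for brevity):

/* Illustrative sketch of the viewport scale/translate computation from the
 * hunk above. Name and parameter list are assumptions for this example.
 */
#include <stdbool.h>

static void
viewport_xform(float x, float y, float width, float height,
               double nearval, double farval,
               bool negative_one_to_one,  /* GL_NEGATIVE_ONE_TO_ONE clip depth? */
               float scale[3], float translate[3])
{
   const float half_width  = 0.5f * width;
   const float half_height = 0.5f * height;

   scale[0]     = half_width;
   translate[0] = half_width + x;
   scale[1]     = half_height;          /* negate for an inverted Y origin */
   translate[1] = half_height + y;

   if (negative_one_to_one) {
      /* NDC depth in [-1, 1] maps to [near, far] */
      scale[2]     = 0.5 * (farval - nearval);
      translate[2] = 0.5 * (nearval + farval);
   } else {
      /* NDC depth in [0, 1] (GL_ZERO_TO_ONE) */
      scale[2]     = farval - nearval;
      translate[2] = nearval;
   }
}
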
diff --git a/src/mesa/main/viewport.h b/src/mesa/main/viewport.h index 899dc2d0bcc..b0675db1096 100644 --- a/src/mesa/main/viewport.h +++ b/src/mesa/main/viewport.h @@ -73,6 +73,6 @@ _mesa_ClipControl(GLenum origin, GLenum depth); extern void _mesa_get_viewport_xform(struct gl_context *ctx, unsigned i, - double scale[3], double translate[3]); + float scale[3], float translate[3]); #endif diff --git a/src/mesa/math/m_clip_tmp.h b/src/mesa/math/m_clip_tmp.h index e289be7b302..60c00043725 100644 --- a/src/mesa/math/m_clip_tmp.h +++ b/src/mesa/math/m_clip_tmp.h @@ -194,13 +194,13 @@ static GLvector4f * TAG(cliptest_points3)( GLvector4f *clip_vec, STRIDE_LOOP { const GLfloat cx = from[0], cy = from[1], cz = from[2]; GLubyte mask = 0; - if (cx > 1.0) mask |= CLIP_RIGHT_BIT; - else if (cx < -1.0) mask |= CLIP_LEFT_BIT; - if (cy > 1.0) mask |= CLIP_TOP_BIT; - else if (cy < -1.0) mask |= CLIP_BOTTOM_BIT; + if (cx > 1.0F) mask |= CLIP_RIGHT_BIT; + else if (cx < -1.0F) mask |= CLIP_LEFT_BIT; + if (cy > 1.0F) mask |= CLIP_TOP_BIT; + else if (cy < -1.0F) mask |= CLIP_BOTTOM_BIT; if (viewport_z_clip) { - if (cz > 1.0) mask |= CLIP_FAR_BIT; - else if (cz < -1.0) mask |= CLIP_NEAR_BIT; + if (cz > 1.0F) mask |= CLIP_FAR_BIT; + else if (cz < -1.0F) mask |= CLIP_NEAR_BIT; } clipMask[i] = mask; tmpOrMask |= mask; @@ -230,10 +230,10 @@ static GLvector4f * TAG(cliptest_points2)( GLvector4f *clip_vec, STRIDE_LOOP { const GLfloat cx = from[0], cy = from[1]; GLubyte mask = 0; - if (cx > 1.0) mask |= CLIP_RIGHT_BIT; - else if (cx < -1.0) mask |= CLIP_LEFT_BIT; - if (cy > 1.0) mask |= CLIP_TOP_BIT; - else if (cy < -1.0) mask |= CLIP_BOTTOM_BIT; + if (cx > 1.0F) mask |= CLIP_RIGHT_BIT; + else if (cx < -1.0F) mask |= CLIP_LEFT_BIT; + if (cy > 1.0F) mask |= CLIP_TOP_BIT; + else if (cy < -1.0F) mask |= CLIP_BOTTOM_BIT; clipMask[i] = mask; tmpOrMask |= mask; tmpAndMask &= mask; diff --git a/src/mesa/math/m_matrix.c b/src/mesa/math/m_matrix.c index ecf564c0089..6522200b345 100644 --- a/src/mesa/math/m_matrix.c +++ b/src/mesa/math/m_matrix.c @@ -380,7 +380,7 @@ static GLboolean invert_matrix_general( GLmatrix *mat ) if (fabsf(r3[0])>fabsf(r2[0])) SWAP_ROWS(r3, r2); if (fabsf(r2[0])>fabsf(r1[0])) SWAP_ROWS(r2, r1); if (fabsf(r1[0])>fabsf(r0[0])) SWAP_ROWS(r1, r0); - if (0.0 == r0[0]) return GL_FALSE; + if (0.0F == r0[0]) return GL_FALSE; /* eliminate first variable */ m1 = r1[0]/r0[0]; m2 = r2[0]/r0[0]; m3 = r3[0]/r0[0]; @@ -388,31 +388,31 @@ static GLboolean invert_matrix_general( GLmatrix *mat ) s = r0[2]; r1[2] -= m1 * s; r2[2] -= m2 * s; r3[2] -= m3 * s; s = r0[3]; r1[3] -= m1 * s; r2[3] -= m2 * s; r3[3] -= m3 * s; s = r0[4]; - if (s != 0.0) { r1[4] -= m1 * s; r2[4] -= m2 * s; r3[4] -= m3 * s; } + if (s != 0.0F) { r1[4] -= m1 * s; r2[4] -= m2 * s; r3[4] -= m3 * s; } s = r0[5]; - if (s != 0.0) { r1[5] -= m1 * s; r2[5] -= m2 * s; r3[5] -= m3 * s; } + if (s != 0.0F) { r1[5] -= m1 * s; r2[5] -= m2 * s; r3[5] -= m3 * s; } s = r0[6]; - if (s != 0.0) { r1[6] -= m1 * s; r2[6] -= m2 * s; r3[6] -= m3 * s; } + if (s != 0.0F) { r1[6] -= m1 * s; r2[6] -= m2 * s; r3[6] -= m3 * s; } s = r0[7]; - if (s != 0.0) { r1[7] -= m1 * s; r2[7] -= m2 * s; r3[7] -= m3 * s; } + if (s != 0.0F) { r1[7] -= m1 * s; r2[7] -= m2 * s; r3[7] -= m3 * s; } /* choose pivot - or die */ if (fabsf(r3[1])>fabsf(r2[1])) SWAP_ROWS(r3, r2); if (fabsf(r2[1])>fabsf(r1[1])) SWAP_ROWS(r2, r1); - if (0.0 == r1[1]) return GL_FALSE; + if (0.0F == r1[1]) return GL_FALSE; /* eliminate second variable */ m2 = r2[1]/r1[1]; m3 = r3[1]/r1[1]; r2[2] -= m2 * r1[2]; r3[2] -= m3 * 
r1[2]; r2[3] -= m2 * r1[3]; r3[3] -= m3 * r1[3]; - s = r1[4]; if (0.0 != s) { r2[4] -= m2 * s; r3[4] -= m3 * s; } - s = r1[5]; if (0.0 != s) { r2[5] -= m2 * s; r3[5] -= m3 * s; } - s = r1[6]; if (0.0 != s) { r2[6] -= m2 * s; r3[6] -= m3 * s; } - s = r1[7]; if (0.0 != s) { r2[7] -= m2 * s; r3[7] -= m3 * s; } + s = r1[4]; if (0.0F != s) { r2[4] -= m2 * s; r3[4] -= m3 * s; } + s = r1[5]; if (0.0F != s) { r2[5] -= m2 * s; r3[5] -= m3 * s; } + s = r1[6]; if (0.0F != s) { r2[6] -= m2 * s; r3[6] -= m3 * s; } + s = r1[7]; if (0.0F != s) { r2[7] -= m2 * s; r3[7] -= m3 * s; } /* choose pivot - or die */ if (fabsf(r3[2])>fabsf(r2[2])) SWAP_ROWS(r3, r2); - if (0.0 == r2[2]) return GL_FALSE; + if (0.0F == r2[2]) return GL_FALSE; /* eliminate third variable */ m3 = r3[2]/r2[2]; @@ -421,7 +421,7 @@ static GLboolean invert_matrix_general( GLmatrix *mat ) r3[7] -= m3 * r2[7]; /* last check */ - if (0.0 == r3[3]) return GL_FALSE; + if (0.0F == r3[3]) return GL_FALSE; s = 1.0F/r3[3]; /* now back substitute row 3 */ r3[4] *= s; r3[5] *= s; r3[6] *= s; r3[7] *= s; @@ -490,26 +490,26 @@ static GLboolean invert_matrix_3d_general( GLmatrix *mat ) */ pos = neg = 0.0; t = MAT(in,0,0) * MAT(in,1,1) * MAT(in,2,2); - if (t >= 0.0) pos += t; else neg += t; + if (t >= 0.0F) pos += t; else neg += t; t = MAT(in,1,0) * MAT(in,2,1) * MAT(in,0,2); - if (t >= 0.0) pos += t; else neg += t; + if (t >= 0.0F) pos += t; else neg += t; t = MAT(in,2,0) * MAT(in,0,1) * MAT(in,1,2); - if (t >= 0.0) pos += t; else neg += t; + if (t >= 0.0F) pos += t; else neg += t; t = -MAT(in,2,0) * MAT(in,1,1) * MAT(in,0,2); - if (t >= 0.0) pos += t; else neg += t; + if (t >= 0.0F) pos += t; else neg += t; t = -MAT(in,1,0) * MAT(in,0,1) * MAT(in,2,2); - if (t >= 0.0) pos += t; else neg += t; + if (t >= 0.0F) pos += t; else neg += t; t = -MAT(in,0,0) * MAT(in,2,1) * MAT(in,1,2); - if (t >= 0.0) pos += t; else neg += t; + if (t >= 0.0F) pos += t; else neg += t; det = pos + neg; - if (fabsf(det) < 1e-25) + if (fabsf(det) < 1e-25F) return GL_FALSE; det = 1.0F / det; @@ -564,7 +564,7 @@ static GLboolean invert_matrix_3d( GLmatrix *mat ) MAT(in,0,1) * MAT(in,0,1) + MAT(in,0,2) * MAT(in,0,2)); - if (scale == 0.0) + if (scale == 0.0F) return GL_FALSE; scale = 1.0F / scale; @@ -799,8 +799,8 @@ _math_matrix_rotate( GLmatrix *mat, GLfloat m[16]; GLboolean optimized; - s = (GLfloat) sin( angle * M_PI / 180.0 ); - c = (GLfloat) cos( angle * M_PI / 180.0 ); + s = sinf( angle * M_PI / 180.0 ); + c = cosf( angle * M_PI / 180.0 ); memcpy(m, Identity, sizeof(GLfloat)*16); optimized = GL_FALSE; @@ -859,7 +859,7 @@ _math_matrix_rotate( GLmatrix *mat, if (!optimized) { const GLfloat mag = sqrtf(x * x + y * y + z * z); - if (mag <= 1.0e-4) { + if (mag <= 1.0e-4F) { /* no rotation, leave mat as-is */ return; } @@ -1070,7 +1070,7 @@ _math_matrix_scale( GLmatrix *mat, GLfloat x, GLfloat y, GLfloat z ) m[2] *= x; m[6] *= y; m[10] *= z; m[3] *= x; m[7] *= y; m[11] *= z; - if (fabsf(x - y) < 1e-8 && fabsf(x - z) < 1e-8) + if (fabsf(x - y) < 1e-8F && fabsf(x - z) < 1e-8F) mat->flags |= MAT_FLAG_UNIFORM_SCALE; else mat->flags |= MAT_FLAG_GENERAL_SCALE; @@ -1111,8 +1111,8 @@ _math_matrix_translate( GLmatrix *mat, GLfloat x, GLfloat y, GLfloat z ) * Transforms Normalized Device Coords to window/Z values. 
*/ void -_math_matrix_viewport(GLmatrix *m, const double scale[3], - const double translate[3], double depthMax) +_math_matrix_viewport(GLmatrix *m, const float scale[3], + const float translate[3], double depthMax) { m->m[MAT_SX] = scale[0]; m->m[MAT_TX] = translate[0]; @@ -1206,7 +1206,7 @@ static void analyse_from_scratch( GLmatrix *mat ) GLuint i; for (i = 0 ; i < 16 ; i++) { - if (m[i] == 0.0) mask |= (1<<i); + if (m[i] == 0.0F) mask |= (1<<i); } if (m[0] == 1.0F) mask |= (1<<16); @@ -1240,12 +1240,12 @@ static void analyse_from_scratch( GLmatrix *mat ) mat->type = MATRIX_2D; /* Check for scale */ - if (SQ(mm-1) > SQ(1e-6) || - SQ(m4m4-1) > SQ(1e-6)) + if (SQ(mm-1) > SQ(1e-6F) || + SQ(m4m4-1) > SQ(1e-6F)) mat->flags |= MAT_FLAG_GENERAL_SCALE; /* Check for rotation */ - if (SQ(mm4) > SQ(1e-6)) + if (SQ(mm4) > SQ(1e-6F)) mat->flags |= MAT_FLAG_GENERAL_3D; else mat->flags |= MAT_FLAG_ROTATION; @@ -1255,9 +1255,9 @@ static void analyse_from_scratch( GLmatrix *mat ) mat->type = MATRIX_3D_NO_ROT; /* Check for scale */ - if (SQ(m[0]-m[5]) < SQ(1e-6) && - SQ(m[0]-m[10]) < SQ(1e-6)) { - if (SQ(m[0]-1.0) > SQ(1e-6)) { + if (SQ(m[0]-m[5]) < SQ(1e-6F) && + SQ(m[0]-m[10]) < SQ(1e-6F)) { + if (SQ(m[0]-1.0F) > SQ(1e-6F)) { mat->flags |= MAT_FLAG_UNIFORM_SCALE; } } @@ -1275,8 +1275,8 @@ static void analyse_from_scratch( GLmatrix *mat ) mat->type = MATRIX_3D; /* Check for scale */ - if (SQ(c1-c2) < SQ(1e-6) && SQ(c1-c3) < SQ(1e-6)) { - if (SQ(c1-1.0) > SQ(1e-6)) + if (SQ(c1-c2) < SQ(1e-6F) && SQ(c1-c3) < SQ(1e-6F)) { + if (SQ(c1-1.0F) > SQ(1e-6F)) mat->flags |= MAT_FLAG_UNIFORM_SCALE; /* else no scale at all */ } @@ -1285,10 +1285,10 @@ static void analyse_from_scratch( GLmatrix *mat ) } /* Check for rotation */ - if (SQ(d1) < SQ(1e-6)) { + if (SQ(d1) < SQ(1e-6F)) { CROSS3( cp, m, m+4 ); SUB_3V( cp, cp, (m+8) ); - if (LEN_SQUARED_3FV(cp) < SQ(1e-6)) + if (LEN_SQUARED_3FV(cp) < SQ(1e-6F)) mat->flags |= MAT_FLAG_ROTATION; else mat->flags |= MAT_FLAG_GENERAL_3D; diff --git a/src/mesa/math/m_matrix.h b/src/mesa/math/m_matrix.h index 778d716dce7..c34d9e3022f 100644 --- a/src/mesa/math/m_matrix.h +++ b/src/mesa/math/m_matrix.h @@ -122,8 +122,8 @@ _math_matrix_frustum( GLmatrix *mat, GLfloat nearval, GLfloat farval ); extern void -_math_matrix_viewport( GLmatrix *m, const double scale[3], - const double translate[3], double depthMax ); +_math_matrix_viewport( GLmatrix *m, const float scale[3], + const float translate[3], double depthMax ); extern void _math_matrix_set_identity( GLmatrix *dest ); diff --git a/src/mesa/math/m_norm_tmp.h b/src/mesa/math/m_norm_tmp.h index d3ec1c22ecd..6f1db8d0bd0 100644 --- a/src/mesa/math/m_norm_tmp.h +++ b/src/mesa/math/m_norm_tmp.h @@ -80,7 +80,7 @@ TAG(transform_normalize_normals)( const GLmatrix *mat, } } else { - if (scale != 1.0) { + if (scale != 1.0f) { m0 *= scale, m4 *= scale, m8 *= scale; m1 *= scale, m5 *= scale, m9 *= scale; m2 *= scale, m6 *= scale, m10 *= scale; diff --git a/src/mesa/math/m_vector.h b/src/mesa/math/m_vector.h index 8551ee7520e..5bd76b8987d 100644 --- a/src/mesa/math/m_vector.h +++ b/src/mesa/math/m_vector.h @@ -51,7 +51,7 @@ /** * Wrap all the information about vectors up in a struct. Has - * additional fields compared to the other vectors to help us track of + * additional fields compared to the other vectors to help us track * different vertex sizes, and whether we need to clean columns out * because they contain non-(0,0,0,1) values. 
* @@ -61,7 +61,7 @@ */ typedef struct { GLfloat (*data)[4]; /**< may be malloc'd or point to client data */ - GLfloat *start; /**< points somewhere inside of <data> */ + GLfloat *start; /**< points somewhere inside of GLvector4f::data */ GLuint count; /**< size of the vector (in elements) */ GLuint stride; /**< stride from one element to the next (in bytes) */ GLuint size; /**< 2-4 for vertices and 1-4 for texcoords */ diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp index 3bffe90ff1f..b8b082e2a59 100644 --- a/src/mesa/program/ir_to_mesa.cpp +++ b/src/mesa/program/ir_to_mesa.cpp @@ -534,6 +534,7 @@ type_size(const struct glsl_type *type) return size; case GLSL_TYPE_SAMPLER: case GLSL_TYPE_IMAGE: + case GLSL_TYPE_SUBROUTINE: /* Samplers take up one slot in UNIFORMS[], but they're baked in * at link time. */ @@ -1343,6 +1344,7 @@ ir_to_mesa_visitor::visit(ir_expression *ir) case ir_unop_dFdx_fine: case ir_unop_dFdy_coarse: case ir_unop_dFdy_fine: + case ir_unop_subroutine_to_int: assert(!"not supported"); break; @@ -2385,7 +2387,7 @@ _mesa_generate_parameters_list_for_uniforms(struct gl_shader_program ir_variable *var = node->as_variable(); if ((var == NULL) || (var->data.mode != ir_var_uniform) - || var->is_in_uniform_block() || (strncmp(var->name, "gl_", 3) == 0)) + || var->is_in_buffer_block() || (strncmp(var->name, "gl_", 3) == 0)) continue; add.process(var); @@ -2452,6 +2454,7 @@ _mesa_associate_uniform_storage(struct gl_context *ctx, break; case GLSL_TYPE_SAMPLER: case GLSL_TYPE_IMAGE: + case GLSL_TYPE_SUBROUTINE: format = uniform_native; columns = 1; break; @@ -2912,7 +2915,7 @@ _mesa_ir_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) if (options->EmitNoIndirectInput || options->EmitNoIndirectOutput || options->EmitNoIndirectTemp || options->EmitNoIndirectUniform) progress = - lower_variable_index_to_cond_assign(ir, + lower_variable_index_to_cond_assign(prog->_LinkedShaders[i]->Stage, ir, options->EmitNoIndirectInput, options->EmitNoIndirectOutput, options->EmitNoIndirectTemp, @@ -2977,6 +2980,8 @@ _mesa_glsl_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) if (prog->LinkStatus) { if (!ctx->Driver.LinkShader(ctx, prog)) { prog->LinkStatus = GL_FALSE; + } else { + build_program_resource_list(ctx, prog); } } diff --git a/src/mesa/program/prog_execute.c b/src/mesa/program/prog_execute.c index 46260b54882..2c52d0db508 100644 --- a/src/mesa/program/prog_execute.c +++ b/src/mesa/program/prog_execute.c @@ -623,7 +623,7 @@ _mesa_execute_program(struct gl_context * ctx, GLfloat a[4], result[4]; fetch_vector1(&inst->SrcReg[0], machine, a); result[0] = result[1] = result[2] = result[3] - = (GLfloat) cos(a[0]); + = cosf(a[0]); store_vector4(inst, machine, result); } break; @@ -723,7 +723,7 @@ _mesa_execute_program(struct gl_context * ctx, * result.z = result.x * APPX(result.y) * We do what the ARB extension says. 
*/ - q[2] = (GLfloat) pow(2.0, t[0]); + q[2] = exp2f(t[0]); } q[1] = t[0] - floor_t0; q[3] = 1.0F; @@ -734,7 +734,7 @@ _mesa_execute_program(struct gl_context * ctx, { GLfloat a[4], result[4], val; fetch_vector1(&inst->SrcReg[0], machine, a); - val = (GLfloat) pow(2.0, a[0]); + val = exp2f(a[0]); /* if (IS_INF_OR_NAN(val)) val = 1.0e10; @@ -776,7 +776,7 @@ _mesa_execute_program(struct gl_context * ctx, if (inst->SrcReg[0].File != PROGRAM_UNDEFINED) { GLfloat a[4]; fetch_vector1(&inst->SrcReg[0], machine, a); - cond = (a[0] != 0.0); + cond = (a[0] != 0.0F); } else { cond = eval_condition(machine, inst); @@ -834,7 +834,7 @@ _mesa_execute_program(struct gl_context * ctx, val = -FLT_MAX; } else { - val = (float)(log(a[0]) * 1.442695F); + val = logf(a[0]) * 1.442695F; } result[0] = result[1] = result[2] = result[3] = val; store_vector4(inst, machine, result); @@ -853,10 +853,10 @@ _mesa_execute_program(struct gl_context * ctx, result[1] = a[0]; /* XXX we could probably just use pow() here */ if (a[0] > 0.0F) { - if (a[1] == 0.0 && a[3] == 0.0) + if (a[1] == 0.0F && a[3] == 0.0F) result[2] = 1.0F; else - result[2] = (GLfloat) pow(a[1], a[3]); + result[2] = powf(a[1], a[3]); } else { result[2] = 0.0F; @@ -886,12 +886,12 @@ _mesa_execute_program(struct gl_context * ctx, int exponent; GLfloat mantissa = frexpf(t[0], &exponent); q[0] = (GLfloat) (exponent - 1); - q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */ + q[1] = 2.0F * mantissa; /* map [.5, 1) -> [1, 2) */ /* The fast LOG2 macro doesn't meet the precision * requirements. */ - q[2] = (float)(log(t[0]) * 1.442695F); + q[2] = logf(t[0]) * 1.442695F; } } else { @@ -1051,7 +1051,7 @@ _mesa_execute_program(struct gl_context * ctx, fetch_vector1(&inst->SrcReg[0], machine, a); fetch_vector1(&inst->SrcReg[1], machine, b); result[0] = result[1] = result[2] = result[3] - = (GLfloat) pow(a[0], b[0]); + = powf(a[0], b[0]); store_vector4(inst, machine, result); } break; @@ -1095,10 +1095,10 @@ _mesa_execute_program(struct gl_context * ctx, { GLfloat a[4], result[4]; fetch_vector1(&inst->SrcReg[0], machine, a); - result[0] = (GLfloat) cos(a[0]); - result[1] = (GLfloat) sin(a[0]); - result[2] = 0.0; /* undefined! */ - result[3] = 0.0; /* undefined! */ + result[0] = cosf(a[0]); + result[1] = sinf(a[0]); + result[2] = 0.0F; /* undefined! */ + result[3] = 0.0F; /* undefined! */ store_vector4(inst, machine, result); } break; @@ -1161,7 +1161,7 @@ _mesa_execute_program(struct gl_context * ctx, GLfloat a[4], result[4]; fetch_vector1(&inst->SrcReg[0], machine, a); result[0] = result[1] = result[2] = result[3] - = (GLfloat) sin(a[0]); + = sinf(a[0]); store_vector4(inst, machine, result); } break; @@ -1360,7 +1360,7 @@ _mesa_execute_program(struct gl_context * ctx, * zero, we'd probably be fine except for an assert in * IROUND_POS() which gets triggered by the inf values created. 
*/ - if (texcoord[3] != 0.0) { + if (texcoord[3] != 0.0F) { texcoord[0] /= texcoord[3]; texcoord[1] /= texcoord[3]; texcoord[2] /= texcoord[3]; @@ -1380,7 +1380,7 @@ _mesa_execute_program(struct gl_context * ctx, fetch_vector4(&inst->SrcReg[0], machine, texcoord); if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX && - texcoord[3] != 0.0) { + texcoord[3] != 0.0F) { texcoord[0] /= texcoord[3]; texcoord[1] /= texcoord[3]; texcoord[2] /= texcoord[3]; diff --git a/src/mesa/program/prog_opt_constant_fold.c b/src/mesa/program/prog_opt_constant_fold.c index 3811c0d8aa6..e2518e660e6 100644 --- a/src/mesa/program/prog_opt_constant_fold.c +++ b/src/mesa/program/prog_opt_constant_fold.c @@ -38,6 +38,8 @@ src_regs_are_constant(const struct prog_instruction *inst, unsigned num_srcs) for (i = 0; i < num_srcs; i++) { if (inst->SrcReg[i].File != PROGRAM_CONSTANT) return false; + if (inst->SrcReg[i].RelAddr) + return false; } return true; diff --git a/src/mesa/program/prog_print.c b/src/mesa/program/prog_print.c index e4faa63c06f..bb7c2c6e527 100644 --- a/src/mesa/program/prog_print.c +++ b/src/mesa/program/prog_print.c @@ -147,6 +147,8 @@ arb_input_attrib_string(GLuint index, GLenum progType) "fragment.(twenty-one)", /* VARYING_SLOT_VIEWPORT */ "fragment.(twenty-two)", /* VARYING_SLOT_FACE */ "fragment.(twenty-three)", /* VARYING_SLOT_PNTC */ + "fragment.(twenty-four)", /* VARYING_SLOT_TESS_LEVEL_OUTER */ + "fragment.(twenty-five)", /* VARYING_SLOT_TESS_LEVEL_INNER */ "fragment.varying[0]", "fragment.varying[1]", "fragment.varying[2]", @@ -272,6 +274,8 @@ arb_output_attrib_string(GLuint index, GLenum progType) "result.(twenty-one)", /* VARYING_SLOT_VIEWPORT */ "result.(twenty-two)", /* VARYING_SLOT_FACE */ "result.(twenty-three)", /* VARYING_SLOT_PNTC */ + "result.(twenty-four)", /* VARYING_SLOT_TESS_LEVEL_OUTER */ + "result.(twenty-five)", /* VARYING_SLOT_TESS_LEVEL_INNER */ "result.varying[0]", "result.varying[1]", "result.varying[2]", @@ -1015,6 +1019,12 @@ _mesa_write_shader_to_file(const struct gl_shader *shader) case MESA_SHADER_FRAGMENT: type = "frag"; break; + case MESA_SHADER_TESS_CTRL: + type = "tesc"; + break; + case MESA_SHADER_TESS_EVAL: + type = "tese"; + break; case MESA_SHADER_VERTEX: type = "vert"; break; diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c index c13e61b1630..2d03bba3d12 100644 --- a/src/mesa/program/program.c +++ b/src/mesa/program/program.c @@ -286,6 +286,38 @@ _mesa_init_compute_program(struct gl_context *ctx, /** + * Initialize a new tessellation control program object. + */ +struct gl_program * +_mesa_init_tess_ctrl_program(struct gl_context *ctx, + struct gl_tess_ctrl_program *prog, + GLenum target, GLuint id) +{ + if (prog) { + init_program_struct(&prog->Base, target, id); + return &prog->Base; + } + return NULL; +} + + +/** + * Initialize a new tessellation evaluation program object. + */ +struct gl_program * +_mesa_init_tess_eval_program(struct gl_context *ctx, + struct gl_tess_eval_program *prog, + GLenum target, GLuint id) +{ + if (prog) { + init_program_struct(&prog->Base, target, id); + return &prog->Base; + } + return NULL; +} + + +/** * Initialize a new geometry program object. 
*/ struct gl_program * @@ -333,6 +365,16 @@ _mesa_new_program(struct gl_context *ctx, GLenum target, GLuint id) CALLOC_STRUCT(gl_geometry_program), target, id); break; + case GL_TESS_CONTROL_PROGRAM_NV: + prog = _mesa_init_tess_ctrl_program(ctx, + CALLOC_STRUCT(gl_tess_ctrl_program), + target, id); + break; + case GL_TESS_EVALUATION_PROGRAM_NV: + prog = _mesa_init_tess_eval_program(ctx, + CALLOC_STRUCT(gl_tess_eval_program), + target, id); + break; case GL_COMPUTE_PROGRAM_NV: prog = _mesa_init_compute_program(ctx, CALLOC_STRUCT(gl_compute_program), @@ -554,6 +596,23 @@ _mesa_clone_program(struct gl_context *ctx, const struct gl_program *prog) gpc->UsesStreams = gp->UsesStreams; } break; + case GL_TESS_CONTROL_PROGRAM_NV: + { + const struct gl_tess_ctrl_program *tcp = gl_tess_ctrl_program_const(prog); + struct gl_tess_ctrl_program *tcpc = gl_tess_ctrl_program(clone); + tcpc->VerticesOut = tcp->VerticesOut; + } + break; + case GL_TESS_EVALUATION_PROGRAM_NV: + { + const struct gl_tess_eval_program *tep = gl_tess_eval_program_const(prog); + struct gl_tess_eval_program *tepc = gl_tess_eval_program(clone); + tepc->PrimitiveMode = tep->PrimitiveMode; + tepc->Spacing = tep->Spacing; + tepc->VertexOrder = tep->VertexOrder; + tepc->PointMode = tep->PointMode; + } + break; default: _mesa_problem(NULL, "Unexpected target in _mesa_clone_program"); } diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h index 2d92ab2f118..a894147cafd 100644 --- a/src/mesa/program/program.h +++ b/src/mesa/program/program.h @@ -79,6 +79,16 @@ _mesa_init_fragment_program(struct gl_context *ctx, GLenum target, GLuint id); extern struct gl_program * +_mesa_init_tess_ctrl_program(struct gl_context *ctx, + struct gl_tess_ctrl_program *prog, + GLenum target, GLuint id); + +extern struct gl_program * +_mesa_init_tess_eval_program(struct gl_context *ctx, + struct gl_tess_eval_program *prog, + GLenum target, GLuint id); + +extern struct gl_program * _mesa_init_geometry_program(struct gl_context *ctx, struct gl_geometry_program *prog, GLenum target, GLuint id); @@ -147,6 +157,25 @@ _mesa_reference_compprog(struct gl_context *ctx, (struct gl_program *) prog); } + +static inline void +_mesa_reference_tesscprog(struct gl_context *ctx, + struct gl_tess_ctrl_program **ptr, + struct gl_tess_ctrl_program *prog) +{ + _mesa_reference_program(ctx, (struct gl_program **) ptr, + (struct gl_program *) prog); +} + +static inline void +_mesa_reference_tesseprog(struct gl_context *ctx, + struct gl_tess_eval_program **ptr, + struct gl_tess_eval_program *prog) +{ + _mesa_reference_program(ctx, (struct gl_program **) ptr, + (struct gl_program *) prog); +} + extern struct gl_program * _mesa_clone_program(struct gl_context *ctx, const struct gl_program *prog); @@ -157,6 +186,20 @@ _mesa_clone_vertex_program(struct gl_context *ctx, return (struct gl_vertex_program *) _mesa_clone_program(ctx, &prog->Base); } +static inline struct gl_tess_ctrl_program * +_mesa_clone_tess_ctrl_program(struct gl_context *ctx, + const struct gl_tess_ctrl_program *prog) +{ + return (struct gl_tess_ctrl_program *) _mesa_clone_program(ctx, &prog->Base); +} + +static inline struct gl_tess_eval_program * +_mesa_clone_tess_eval_program(struct gl_context *ctx, + const struct gl_tess_eval_program *prog) +{ + return (struct gl_tess_eval_program *) _mesa_clone_program(ctx, &prog->Base); +} + static inline struct gl_geometry_program * _mesa_clone_geometry_program(struct gl_context *ctx, const struct gl_geometry_program *prog) @@ -216,6 +259,10 @@ 
_mesa_program_enum_to_shader_stage(GLenum v) return MESA_SHADER_FRAGMENT; case GL_GEOMETRY_PROGRAM_NV: return MESA_SHADER_GEOMETRY; + case GL_TESS_CONTROL_PROGRAM_NV: + return MESA_SHADER_TESS_CTRL; + case GL_TESS_EVALUATION_PROGRAM_NV: + return MESA_SHADER_TESS_EVAL; case GL_COMPUTE_PROGRAM_NV: return MESA_SHADER_COMPUTE; default: @@ -235,6 +282,10 @@ _mesa_shader_stage_to_program(unsigned stage) return GL_FRAGMENT_PROGRAM_ARB; case MESA_SHADER_GEOMETRY: return GL_GEOMETRY_PROGRAM_NV; + case MESA_SHADER_TESS_CTRL: + return GL_TESS_CONTROL_PROGRAM_NV; + case MESA_SHADER_TESS_EVAL: + return GL_TESS_EVALUATION_PROGRAM_NV; case MESA_SHADER_COMPUTE: return GL_COMPUTE_PROGRAM_NV; } @@ -244,7 +295,9 @@ _mesa_shader_stage_to_program(unsigned stage) } -/* Cast wrappers from gl_program to gl_vertex/geometry/fragment_program */ +/* Cast wrappers from gl_program to derived program types. + * (e.g. gl_vertex_program) + */ static inline struct gl_fragment_program * gl_fragment_program(struct gl_program *prog) @@ -297,6 +350,31 @@ gl_compute_program_const(const struct gl_program *prog) return (const struct gl_compute_program *) prog; } +static inline struct gl_tess_ctrl_program * +gl_tess_ctrl_program(struct gl_program *prog) +{ + return (struct gl_tess_ctrl_program *) prog; +} + +static inline const struct gl_tess_ctrl_program * +gl_tess_ctrl_program_const(const struct gl_program *prog) +{ + return (const struct gl_tess_ctrl_program *) prog; +} + + +static inline struct gl_tess_eval_program * +gl_tess_eval_program(struct gl_program *prog) +{ + return (struct gl_tess_eval_program *) prog; +} + +static inline const struct gl_tess_eval_program * +gl_tess_eval_program_const(const struct gl_program *prog) +{ + return (const struct gl_tess_eval_program *) prog; +} + #ifdef __cplusplus } /* extern "C" */ diff --git a/src/mesa/program/program_parse_extra.c b/src/mesa/program/program_parse_extra.c index 32b54afc57b..71f86d13ace 100644 --- a/src/mesa/program/program_parse_extra.c +++ b/src/mesa/program/program_parse_extra.c @@ -163,6 +163,8 @@ _mesa_ARBvp_parse_option(struct asm_parser_state *state, const char *option) int _mesa_ARBfp_parse_option(struct asm_parser_state *state, const char *option) { + unsigned fog_option; + /* All of the options currently supported start with "ARB_". The code is * currently structured with nested if-statements because eventually options * that start with "NV_" will be supported. 
This structure will result in @@ -177,20 +179,42 @@ _mesa_ARBfp_parse_option(struct asm_parser_state *state, const char *option) if (strncmp(option, "fog_", 4) == 0) { option += 4; - if (state->option.Fog == OPTION_NONE) { - if (strcmp(option, "exp") == 0) { - state->option.Fog = OPTION_FOG_EXP; - return 1; - } else if (strcmp(option, "exp2") == 0) { - state->option.Fog = OPTION_FOG_EXP2; - return 1; - } else if (strcmp(option, "linear") == 0) { - state->option.Fog = OPTION_FOG_LINEAR; - return 1; - } - } + if (strcmp(option, "exp") == 0) { + fog_option = OPTION_FOG_EXP; + } else if (strcmp(option, "exp2") == 0) { + fog_option = OPTION_FOG_EXP2; + } else if (strcmp(option, "linear") == 0) { + fog_option = OPTION_FOG_LINEAR; + } else { + /* invalid option */ + return 0; + } - return 0; + if (state->option.Fog == OPTION_NONE) { + state->option.Fog = fog_option; + return 1; + } + + /* The ARB_fragment_program specification instructs us to handle + * redundant options in two seemingly contradictory ways: + * + * Section 3.11.4.5.1 says: + * "Only one fog application option may be specified by any given + * fragment program. A fragment program that specifies more than one + * of the program options "ARB_fog_exp", "ARB_fog_exp2", and + * "ARB_fog_linear", will fail to load." + * + * Issue 27 says: + * "The three mandatory options are ARB_fog_exp, ARB_fog_exp2, and + * ARB_fog_linear. As these options are mutually exclusive by + * nature, specifying more than one is not useful. If more than one + * is specified, the last one encountered in the <optionSequence> + * will be the one to actually modify the execution environment." + * + * We choose to allow programs to specify the same OPTION redundantly, + * but fail to load programs that specify contradictory options. + */ + return state->option.Fog == fog_option ? 
1 : 0; } else if (strncmp(option, "precision_hint_", 15) == 0) { option += 15; diff --git a/src/mesa/state_tracker/st_atom.c b/src/mesa/state_tracker/st_atom.c index 428f2d9d7d7..43dbadd4a7e 100644 --- a/src/mesa/state_tracker/st_atom.c +++ b/src/mesa/state_tracker/st_atom.c @@ -46,9 +46,10 @@ static const struct st_tracked_state *atoms[] = &st_update_depth_stencil_alpha, &st_update_clip, - &st_finalize_textures, &st_update_fp, &st_update_gp, + &st_update_tep, + &st_update_tcp, &st_update_vp, &st_update_rasterizer, @@ -59,17 +60,24 @@ static const struct st_tracked_state *atoms[] = &st_update_vertex_texture, &st_update_fragment_texture, &st_update_geometry_texture, + &st_update_tessctrl_texture, + &st_update_tesseval_texture, &st_update_sampler, /* depends on update_*_texture for swizzle */ &st_update_framebuffer, &st_update_msaa, &st_update_sample_shading, &st_update_vs_constants, + &st_update_tcs_constants, + &st_update_tes_constants, &st_update_gs_constants, &st_update_fs_constants, &st_bind_vs_ubos, + &st_bind_tcs_ubos, + &st_bind_tes_ubos, &st_bind_fs_ubos, &st_bind_gs_ubos, &st_update_pixel_transfer, + &st_update_tess, /* this must be done after the vertex program update */ &st_update_array diff --git a/src/mesa/state_tracker/st_atom.h b/src/mesa/state_tracker/st_atom.h index c50111d501f..a24842baa4f 100644 --- a/src/mesa/state_tracker/st_atom.h +++ b/src/mesa/state_tracker/st_atom.h @@ -52,6 +52,8 @@ extern const struct st_tracked_state st_update_clip; extern const struct st_tracked_state st_update_depth_stencil_alpha; extern const struct st_tracked_state st_update_fp; extern const struct st_tracked_state st_update_gp; +extern const struct st_tracked_state st_update_tep; +extern const struct st_tracked_state st_update_tcp; extern const struct st_tracked_state st_update_vp; extern const struct st_tracked_state st_update_rasterizer; extern const struct st_tracked_state st_update_polygon_stipple; @@ -64,14 +66,20 @@ extern const struct st_tracked_state st_update_sampler; extern const struct st_tracked_state st_update_fragment_texture; extern const struct st_tracked_state st_update_vertex_texture; extern const struct st_tracked_state st_update_geometry_texture; -extern const struct st_tracked_state st_finalize_textures; +extern const struct st_tracked_state st_update_tessctrl_texture; +extern const struct st_tracked_state st_update_tesseval_texture; extern const struct st_tracked_state st_update_fs_constants; extern const struct st_tracked_state st_update_gs_constants; +extern const struct st_tracked_state st_update_tes_constants; +extern const struct st_tracked_state st_update_tcs_constants; extern const struct st_tracked_state st_update_vs_constants; extern const struct st_tracked_state st_bind_fs_ubos; extern const struct st_tracked_state st_bind_vs_ubos; extern const struct st_tracked_state st_bind_gs_ubos; +extern const struct st_tracked_state st_bind_tcs_ubos; +extern const struct st_tracked_state st_bind_tes_ubos; extern const struct st_tracked_state st_update_pixel_transfer; +extern const struct st_tracked_state st_update_tess; GLuint st_compare_func_to_pipe(GLenum func); diff --git a/src/mesa/state_tracker/st_atom_clip.c b/src/mesa/state_tracker/st_atom_clip.c index f82c1332afc..506a770499f 100644 --- a/src/mesa/state_tracker/st_atom_clip.c +++ b/src/mesa/state_tracker/st_atom_clip.c @@ -59,8 +59,11 @@ static void update_clip( struct st_context *st ) memcpy(clip.ucp, use_eye ? 
ctx->Transform.EyeUserPlane : ctx->Transform._ClipUserPlane, sizeof(clip.ucp)); - st->state.clip = clip; - cso_set_clip(st->cso_context, &clip); + + if (memcmp(&st->state.clip, &clip, sizeof(clip)) != 0) { + st->state.clip = clip; + st->pipe->set_clip_state(st->pipe, &clip); + } } diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c index a54e0d9dbf5..6affb4d84d5 100644 --- a/src/mesa/state_tracker/st_atom_constbuf.c +++ b/src/mesa/state_tracker/st_atom_constbuf.c @@ -59,7 +59,9 @@ void st_upload_constants( struct st_context *st, { assert(shader_type == PIPE_SHADER_VERTEX || shader_type == PIPE_SHADER_FRAGMENT || - shader_type == PIPE_SHADER_GEOMETRY); + shader_type == PIPE_SHADER_GEOMETRY || + shader_type == PIPE_SHADER_TESS_CTRL || + shader_type == PIPE_SHADER_TESS_EVAL); /* update constants */ if (params && params->NumParameters) { @@ -178,6 +180,50 @@ const struct st_tracked_state st_update_gs_constants = { update_gs_constants /* update */ }; +/* Tessellation control shader: + */ +static void update_tcs_constants(struct st_context *st ) +{ + struct st_tessctrl_program *tcp = st->tcp; + struct gl_program_parameter_list *params; + + if (tcp) { + params = tcp->Base.Base.Parameters; + st_upload_constants( st, params, PIPE_SHADER_TESS_CTRL ); + } +} + +const struct st_tracked_state st_update_tcs_constants = { + "st_update_tcs_constants", /* name */ + { /* dirty */ + _NEW_PROGRAM_CONSTANTS, /* mesa */ + ST_NEW_TESSCTRL_PROGRAM, /* st */ + }, + update_tcs_constants /* update */ +}; + +/* Tessellation evaluation shader: + */ +static void update_tes_constants(struct st_context *st ) +{ + struct st_tesseval_program *tep = st->tep; + struct gl_program_parameter_list *params; + + if (tep) { + params = tep->Base.Base.Parameters; + st_upload_constants( st, params, PIPE_SHADER_TESS_EVAL ); + } +} + +const struct st_tracked_state st_update_tes_constants = { + "st_update_tes_constants", /* name */ + { /* dirty */ + _NEW_PROGRAM_CONSTANTS, /* mesa */ + ST_NEW_TESSEVAL_PROGRAM, /* st */ + }, + update_tes_constants /* update */ +}; + static void st_bind_ubos(struct st_context *st, struct gl_shader *shader, unsigned shader_type) @@ -275,3 +321,43 @@ const struct st_tracked_state st_bind_gs_ubos = { }, bind_gs_ubos }; + +static void bind_tcs_ubos(struct st_context *st) +{ + struct gl_shader_program *prog = + st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL]; + + if (!prog) + return; + + st_bind_ubos(st, prog->_LinkedShaders[MESA_SHADER_TESS_CTRL], PIPE_SHADER_TESS_CTRL); +} + +const struct st_tracked_state st_bind_tcs_ubos = { + "st_bind_tcs_ubos", + { + 0, + ST_NEW_TESSCTRL_PROGRAM | ST_NEW_UNIFORM_BUFFER, + }, + bind_tcs_ubos +}; + +static void bind_tes_ubos(struct st_context *st) +{ + struct gl_shader_program *prog = + st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL]; + + if (!prog) + return; + + st_bind_ubos(st, prog->_LinkedShaders[MESA_SHADER_TESS_EVAL], PIPE_SHADER_TESS_EVAL); +} + +const struct st_tracked_state st_bind_tes_ubos = { + "st_bind_tes_ubos", + { + 0, + ST_NEW_TESSEVAL_PROGRAM | ST_NEW_UNIFORM_BUFFER, + }, + bind_tes_ubos +}; diff --git a/src/mesa/state_tracker/st_atom_depth.c b/src/mesa/state_tracker/st_atom_depth.c index c4bca8d09b5..d9cc97029fb 100644 --- a/src/mesa/state_tracker/st_atom_depth.c +++ b/src/mesa/state_tracker/st_atom_depth.c @@ -105,10 +105,17 @@ update_depth_stencil_alpha(struct st_context *st) memset(dsa, 0, sizeof(*dsa)); memset(&sr, 0, sizeof(sr)); - if (ctx->Depth.Test && ctx->DrawBuffer->Visual.depthBits > 
0) { - dsa->depth.enabled = 1; - dsa->depth.writemask = ctx->Depth.Mask; - dsa->depth.func = st_compare_func_to_pipe(ctx->Depth.Func); + if (ctx->DrawBuffer->Visual.depthBits > 0) { + if (ctx->Depth.Test) { + dsa->depth.enabled = 1; + dsa->depth.writemask = ctx->Depth.Mask; + dsa->depth.func = st_compare_func_to_pipe(ctx->Depth.Func); + } + if (ctx->Depth.BoundsTest) { + dsa->depth.bounds_test = 1; + dsa->depth.bounds_min = ctx->Depth.BoundsMin; + dsa->depth.bounds_max = ctx->Depth.BoundsMax; + } } if (ctx->Stencil.Enabled && ctx->DrawBuffer->Visual.stencilBits > 0) { diff --git a/src/mesa/state_tracker/st_atom_sampler.c b/src/mesa/state_tracker/st_atom_sampler.c index b68eb16d7be..4252c27962e 100644 --- a/src/mesa/state_tracker/st_atom_sampler.c +++ b/src/mesa/state_tracker/st_atom_sampler.c @@ -245,6 +245,7 @@ update_shader_samplers(struct st_context *st, GLuint unit; GLbitfield samplers_used; const GLuint old_max = *num_samplers; + const struct pipe_sampler_state *states[PIPE_MAX_SAMPLERS]; samplers_used = prog->SamplersUsed; @@ -261,13 +262,11 @@ update_shader_samplers(struct st_context *st, const GLuint texUnit = prog->SamplerUnits[unit]; convert_sampler(st, sampler, texUnit); - + states[unit] = sampler; *num_samplers = unit + 1; - - cso_single_sampler(st->cso_context, shader_stage, unit, sampler); } else if (samplers_used != 0 || unit < old_max) { - cso_single_sampler(st->cso_context, shader_stage, unit, NULL); + states[unit] = NULL; } else { /* if we've reset all the old samplers and we have no more new ones */ @@ -275,7 +274,7 @@ update_shader_samplers(struct st_context *st, } } - cso_single_sampler_done(st->cso_context, shader_stage); + cso_set_samplers(st->cso_context, shader_stage, *num_samplers, states); } @@ -306,6 +305,22 @@ update_samplers(struct st_context *st) st->state.samplers[PIPE_SHADER_GEOMETRY], &st->state.num_samplers[PIPE_SHADER_GEOMETRY]); } + if (ctx->TessCtrlProgram._Current) { + update_shader_samplers(st, + PIPE_SHADER_TESS_CTRL, + &ctx->TessCtrlProgram._Current->Base, + ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits, + st->state.samplers[PIPE_SHADER_TESS_CTRL], + &st->state.num_samplers[PIPE_SHADER_TESS_CTRL]); + } + if (ctx->TessEvalProgram._Current) { + update_shader_samplers(st, + PIPE_SHADER_TESS_EVAL, + &ctx->TessEvalProgram._Current->Base, + ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits, + st->state.samplers[PIPE_SHADER_TESS_EVAL], + &st->state.num_samplers[PIPE_SHADER_TESS_EVAL]); + } } diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c index ad8d2624fc9..fee15a980f3 100644 --- a/src/mesa/state_tracker/st_atom_shader.c +++ b/src/mesa/state_tracker/st_atom_shader.c @@ -50,24 +50,6 @@ /** - * Return pointer to a pass-through fragment shader. - * This shader is used when a texture is missing/incomplete. - */ -static void * -get_passthrough_fs(struct st_context *st) -{ - if (!st->passthrough_fs) { - st->passthrough_fs = - util_make_fragment_passthrough_shader(st->pipe, TGSI_SEMANTIC_COLOR, - TGSI_INTERPOLATE_PERSPECTIVE, - TRUE); - } - - return st->passthrough_fs; -} - - -/** * Update fragment program state/atom. This involves translating the * Mesa fragment program into a gallium fragment program and binding it. 
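A minimal, stand-alone sketch of the gather-then-commit pattern the st_atom_sampler.c hunk above switches to (one cso_set_samplers() call instead of per-unit cso_single_sampler()/cso_single_sampler_done()); the sampler_state type, the set_samplers() helper and the usage bitmask below are simplified stand-ins, not the real gallium/CSO interfaces:

#include <stdio.h>

#define MAX_SAMPLERS 16

struct sampler_state { int filter; };          /* stand-in for pipe_sampler_state */

static void
set_samplers(unsigned count, const struct sampler_state **states)
{
   /* one call covering the whole range; NULL entries unbind */
   for (unsigned i = 0; i < count; i++)
      printf("unit %u -> %s\n", i, states[i] ? "bound" : "unbound");
}

int
main(void)
{
   struct sampler_state linear = { 1 };
   const struct sampler_state *states[MAX_SAMPLERS];
   unsigned used_mask = 0x5;                    /* units 0 and 2 in use */
   unsigned num = 0;

   /* collect all states first, in the spirit of update_shader_samplers() */
   for (unsigned unit = 0; used_mask; unit++, used_mask >>= 1) {
      if (used_mask & 1) {
         states[unit] = &linear;
         num = unit + 1;
      } else {
         states[unit] = NULL;                   /* hole between used units */
      }
   }

   set_samplers(num, states);                   /* commit in one go */
   return 0;
}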
*/ @@ -96,15 +78,8 @@ update_fp( struct st_context *st ) st_reference_fragprog(st, &st->fp, stfp); - if (st->missing_textures) { - /* use a pass-through frag shader that uses no textures */ - void *fs = get_passthrough_fs(st); - cso_set_fragment_shader_handle(st->cso_context, fs); - } - else { - cso_set_fragment_shader_handle(st->cso_context, - st->fp_variant->driver_shader); - } + cso_set_fragment_shader_handle(st->cso_context, + st->fp_variant->driver_shader); } @@ -210,3 +185,75 @@ const struct st_tracked_state st_update_gp = { }, update_gp /* update */ }; + + + +static void +update_tcp( struct st_context *st ) +{ + struct st_tessctrl_program *sttcp; + struct st_tcp_variant_key key; + + if (!st->ctx->TessCtrlProgram._Current) { + cso_set_tessctrl_shader_handle(st->cso_context, NULL); + return; + } + + sttcp = st_tessctrl_program(st->ctx->TessCtrlProgram._Current); + assert(sttcp->Base.Base.Target == GL_TESS_CONTROL_PROGRAM_NV); + + memset(&key, 0, sizeof(key)); + key.st = st; + + st->tcp_variant = st_get_tcp_variant(st, sttcp, &key); + + st_reference_tesscprog(st, &st->tcp, sttcp); + + cso_set_tessctrl_shader_handle(st->cso_context, + st->tcp_variant->driver_shader); +} + +const struct st_tracked_state st_update_tcp = { + "st_update_tcp", /* name */ + { /* dirty */ + 0, /* mesa */ + ST_NEW_TESSCTRL_PROGRAM /* st */ + }, + update_tcp /* update */ +}; + + + +static void +update_tep( struct st_context *st ) +{ + struct st_tesseval_program *sttep; + struct st_tep_variant_key key; + + if (!st->ctx->TessEvalProgram._Current) { + cso_set_tesseval_shader_handle(st->cso_context, NULL); + return; + } + + sttep = st_tesseval_program(st->ctx->TessEvalProgram._Current); + assert(sttep->Base.Base.Target == GL_TESS_EVALUATION_PROGRAM_NV); + + memset(&key, 0, sizeof(key)); + key.st = st; + + st->tep_variant = st_get_tep_variant(st, sttep, &key); + + st_reference_tesseprog(st, &st->tep, sttep); + + cso_set_tesseval_shader_handle(st->cso_context, + st->tep_variant->driver_shader); +} + +const struct st_tracked_state st_update_tep = { + "st_update_tep", /* name */ + { /* dirty */ + 0, /* mesa */ + ST_NEW_TESSEVAL_PROGRAM /* st */ + }, + update_tep /* update */ +}; diff --git a/src/mesa/state_tracker/st_atom_tess.c b/src/mesa/state_tracker/st_atom_tess.c new file mode 100644 index 00000000000..8e6287a900c --- /dev/null +++ b/src/mesa/state_tracker/st_atom_tess.c @@ -0,0 +1,62 @@ +/************************************************************************** + * + * Copyright 2015 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Authors: + * Marek Olšák <[email protected]> + */ + + +#include "main/macros.h" +#include "st_context.h" +#include "pipe/p_context.h" +#include "st_atom.h" + + +static void +update_tess(struct st_context *st) +{ + const struct gl_context *ctx = st->ctx; + struct pipe_context *pipe = st->pipe; + + if (!pipe->set_tess_state) + return; + + pipe->set_tess_state(pipe, + ctx->TessCtrlProgram.patch_default_outer_level, + ctx->TessCtrlProgram.patch_default_inner_level); +} + + +const struct st_tracked_state st_update_tess = { + "update_tess", /* name */ + { /* dirty */ + 0, /* mesa */ + ST_NEW_TESS_STATE, /* st */ + }, + update_tess /* update */ +}; diff --git a/src/mesa/state_tracker/st_atom_texture.c b/src/mesa/state_tracker/st_atom_texture.c index 04ba86448fc..31e0f6ba06c 100644 --- a/src/mesa/state_tracker/st_atom_texture.c +++ b/src/mesa/state_tracker/st_atom_texture.c @@ -103,7 +103,8 @@ swizzle_swizzle(unsigned swizzle1, unsigned swizzle2) */ static unsigned compute_texture_format_swizzle(GLenum baseFormat, GLenum depthMode, - enum pipe_format actualFormat) + enum pipe_format actualFormat, + unsigned glsl_version) { switch (baseFormat) { case GL_RGBA: @@ -157,8 +158,26 @@ compute_texture_format_swizzle(GLenum baseFormat, GLenum depthMode, case GL_INTENSITY: return MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X); case GL_ALPHA: - return MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO, - SWIZZLE_ZERO, SWIZZLE_X); + /* The texture(sampler*Shadow) functions from GLSL 1.30 ignore + * the depth mode and return float, while older shadow* functions + * and ARB_fp instructions return vec4 according to the depth mode. + * + * The problem with the GLSL 1.30 functions is that GL_ALPHA forces + * them to return 0, breaking them completely. + * + * A proper fix would increase code complexity and that's not worth + * it for a rarely used feature such as the GL_ALPHA depth mode + * in GL3. Therefore, change GL_ALPHA to GL_INTENSITY for all + * shaders that use GLSL 1.30 or later. + * + * BTW, it's required that sampler views are updated when + * shaders change (check_sampler_swizzle takes care of that). 
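The GL_ALPHA depth-mode workaround explained in the comment above reduces to a GLSL-version check when the swizzle is computed (the hunk continues below). A self-contained sketch of that decision; the SWZ_* tokens are local stand-ins for Mesa's SWIZZLE_* macros, and version 0 stands for fixed-function/ARB programs:

#include <stdio.h>

enum swz { SWZ_X, SWZ_ZERO };

static void
alpha_depth_swizzle(unsigned glsl_version, enum swz out[4])
{
   if (glsl_version >= 130) {
      /* GLSL 1.30+: replicate X so texture(sampler*Shadow) keeps working */
      out[0] = out[1] = out[2] = out[3] = SWZ_X;
   } else {
      /* older shaders: classic GL_ALPHA behaviour, (0, 0, 0, A) */
      out[0] = out[1] = out[2] = SWZ_ZERO;
      out[3] = SWZ_X;
   }
}

int
main(void)
{
   enum swz s[4];
   alpha_depth_swizzle(130, s);
   printf("GLSL 1.30: %d %d %d %d\n", s[0], s[1], s[2], s[3]);
   alpha_depth_swizzle(0, s);
   printf("ARB/fixed: %d %d %d %d\n", s[0], s[1], s[2], s[3]);
   return 0;
}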
+ */ + if (glsl_version && glsl_version >= 130) + return SWIZZLE_XXXX; + else + return MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO, + SWIZZLE_ZERO, SWIZZLE_X); case GL_RED: return MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ONE); @@ -174,7 +193,8 @@ compute_texture_format_swizzle(GLenum baseFormat, GLenum depthMode, static unsigned -get_texture_format_swizzle(const struct st_texture_object *stObj) +get_texture_format_swizzle(const struct st_texture_object *stObj, + unsigned glsl_version) { GLenum baseFormat = _mesa_texture_base_format(&stObj->base); unsigned tex_swizzle; @@ -182,7 +202,8 @@ get_texture_format_swizzle(const struct st_texture_object *stObj) if (baseFormat != GL_NONE) { tex_swizzle = compute_texture_format_swizzle(baseFormat, stObj->base.DepthMode, - stObj->pt->format); + stObj->pt->format, + glsl_version); } else { tex_swizzle = SWIZZLE_XYZW; @@ -201,9 +222,9 @@ get_texture_format_swizzle(const struct st_texture_object *stObj) */ static boolean check_sampler_swizzle(const struct st_texture_object *stObj, - struct pipe_sampler_view *sv) + struct pipe_sampler_view *sv, unsigned glsl_version) { - unsigned swizzle = get_texture_format_swizzle(stObj); + unsigned swizzle = get_texture_format_swizzle(stObj, glsl_version); return ((sv->swizzle_r != GET_SWZ(swizzle, 0)) || (sv->swizzle_g != GET_SWZ(swizzle, 1)) || @@ -232,11 +253,11 @@ static unsigned last_layer(struct st_texture_object *stObj) static struct pipe_sampler_view * st_create_texture_sampler_view_from_stobj(struct pipe_context *pipe, struct st_texture_object *stObj, - const struct gl_sampler_object *samp, - enum pipe_format format) + enum pipe_format format, + unsigned glsl_version) { struct pipe_sampler_view templ; - unsigned swizzle = get_texture_format_swizzle(stObj); + unsigned swizzle = get_texture_format_swizzle(stObj, glsl_version); u_sampler_view_default_template(&templ, stObj->pt, @@ -283,8 +304,8 @@ st_create_texture_sampler_view_from_stobj(struct pipe_context *pipe, static struct pipe_sampler_view * st_get_texture_sampler_view_from_stobj(struct st_context *st, struct st_texture_object *stObj, - const struct gl_sampler_object *samp, - enum pipe_format format) + enum pipe_format format, + unsigned glsl_version) { struct pipe_sampler_view **sv; const struct st_texture_image *firstImage; @@ -306,7 +327,7 @@ st_get_texture_sampler_view_from_stobj(struct st_context *st, /* if sampler view has changed dereference it */ if (*sv) { - if (check_sampler_swizzle(stObj, *sv) || + if (check_sampler_swizzle(stObj, *sv, glsl_version) || (format != (*sv)->format) || gl_target_to_pipe(stObj->base.Target) != (*sv)->target || stObj->base.MinLevel + stObj->base.BaseLevel != (*sv)->u.tex.first_level || @@ -318,7 +339,8 @@ st_get_texture_sampler_view_from_stobj(struct st_context *st, } if (!*sv) { - *sv = st_create_texture_sampler_view_from_stobj(st->pipe, stObj, samp, format); + *sv = st_create_texture_sampler_view_from_stobj(st->pipe, stObj, + format, glsl_version); } else if ((*sv)->context != st->pipe) { /* Recreate view in correct context, use existing view as template */ @@ -334,7 +356,7 @@ st_get_texture_sampler_view_from_stobj(struct st_context *st, static GLboolean update_single_texture(struct st_context *st, struct pipe_sampler_view **sampler_view, - GLuint texUnit) + GLuint texUnit, unsigned glsl_version) { struct gl_context *ctx = st->ctx; const struct gl_sampler_object *samp; @@ -374,8 +396,9 @@ update_single_texture(struct st_context *st, } } - *sampler_view = st_get_texture_sampler_view_from_stobj(st, 
stObj, samp, - view_format); + *sampler_view = + st_get_texture_sampler_view_from_stobj(st, stObj, view_format, + glsl_version); return GL_TRUE; } @@ -383,7 +406,7 @@ update_single_texture(struct st_context *st, static void update_textures(struct st_context *st, - unsigned shader_stage, + gl_shader_stage mesa_shader, const struct gl_program *prog, unsigned max_units, struct pipe_sampler_view **sampler_views, @@ -392,6 +415,10 @@ update_textures(struct st_context *st, const GLuint old_max = *num_textures; GLbitfield samplers_used = prog->SamplersUsed; GLuint unit; + struct gl_shader_program *shader = + st->ctx->_Shader->CurrentProgram[mesa_shader]; + unsigned glsl_version = shader ? shader->Version : 0; + unsigned shader_stage = st_shader_stage_to_ptarget(mesa_shader); if (samplers_used == 0x0 && old_max == 0) return; @@ -406,7 +433,8 @@ update_textures(struct st_context *st, const GLuint texUnit = prog->SamplerUnits[unit]; GLboolean retval; - retval = update_single_texture(st, &sampler_view, texUnit); + retval = update_single_texture(st, &sampler_view, texUnit, + glsl_version); if (retval == GL_FALSE) continue; @@ -435,7 +463,7 @@ update_vertex_textures(struct st_context *st) if (ctx->Const.Program[MESA_SHADER_VERTEX].MaxTextureImageUnits > 0) { update_textures(st, - PIPE_SHADER_VERTEX, + MESA_SHADER_VERTEX, &ctx->VertexProgram._Current->Base, ctx->Const.Program[MESA_SHADER_VERTEX].MaxTextureImageUnits, st->state.sampler_views[PIPE_SHADER_VERTEX], @@ -450,7 +478,7 @@ update_fragment_textures(struct st_context *st) const struct gl_context *ctx = st->ctx; update_textures(st, - PIPE_SHADER_FRAGMENT, + MESA_SHADER_FRAGMENT, &ctx->FragmentProgram._Current->Base, ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits, st->state.sampler_views[PIPE_SHADER_FRAGMENT], @@ -465,7 +493,7 @@ update_geometry_textures(struct st_context *st) if (ctx->GeometryProgram._Current) { update_textures(st, - PIPE_SHADER_GEOMETRY, + MESA_SHADER_GEOMETRY, &ctx->GeometryProgram._Current->Base, ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxTextureImageUnits, st->state.sampler_views[PIPE_SHADER_GEOMETRY], @@ -474,11 +502,43 @@ update_geometry_textures(struct st_context *st) } +static void +update_tessctrl_textures(struct st_context *st) +{ + const struct gl_context *ctx = st->ctx; + + if (ctx->TessCtrlProgram._Current) { + update_textures(st, + MESA_SHADER_TESS_CTRL, + &ctx->TessCtrlProgram._Current->Base, + ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits, + st->state.sampler_views[PIPE_SHADER_TESS_CTRL], + &st->state.num_sampler_views[PIPE_SHADER_TESS_CTRL]); + } +} + + +static void +update_tesseval_textures(struct st_context *st) +{ + const struct gl_context *ctx = st->ctx; + + if (ctx->TessEvalProgram._Current) { + update_textures(st, + MESA_SHADER_TESS_EVAL, + &ctx->TessEvalProgram._Current->Base, + ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits, + st->state.sampler_views[PIPE_SHADER_TESS_EVAL], + &st->state.num_sampler_views[PIPE_SHADER_TESS_EVAL]); + } +} + + const struct st_tracked_state st_update_fragment_texture = { "st_update_texture", /* name */ { /* dirty */ _NEW_TEXTURE, /* mesa */ - ST_NEW_FRAGMENT_PROGRAM, /* st */ + ST_NEW_FRAGMENT_PROGRAM | ST_NEW_SAMPLER_VIEWS, /* st */ }, update_fragment_textures /* update */ }; @@ -488,7 +548,7 @@ const struct st_tracked_state st_update_vertex_texture = { "st_update_vertex_texture", /* name */ { /* dirty */ _NEW_TEXTURE, /* mesa */ - ST_NEW_VERTEX_PROGRAM, /* st */ + ST_NEW_VERTEX_PROGRAM | ST_NEW_SAMPLER_VIEWS, /* st */ }, 
update_vertex_textures /* update */ }; @@ -498,52 +558,27 @@ const struct st_tracked_state st_update_geometry_texture = { "st_update_geometry_texture", /* name */ { /* dirty */ _NEW_TEXTURE, /* mesa */ - ST_NEW_GEOMETRY_PROGRAM, /* st */ + ST_NEW_GEOMETRY_PROGRAM | ST_NEW_SAMPLER_VIEWS, /* st */ }, update_geometry_textures /* update */ }; - -static void -finalize_textures(struct st_context *st) -{ - struct gl_context *ctx = st->ctx; - struct gl_fragment_program *fprog = ctx->FragmentProgram._Current; - const GLboolean prev_missing_textures = st->missing_textures; - GLuint su; - - st->missing_textures = GL_FALSE; - - for (su = 0; su < ctx->Const.MaxTextureCoordUnits; su++) { - if (fprog->Base.SamplersUsed & (1 << su)) { - const GLuint texUnit = fprog->Base.SamplerUnits[su]; - struct gl_texture_object *texObj - = ctx->Texture.Unit[texUnit]._Current; - - if (texObj) { - GLboolean retval; - - retval = st_finalize_texture(ctx, st->pipe, texObj); - if (!retval) { - /* out of mem */ - st->missing_textures = GL_TRUE; - continue; - } - } - } - } - - if (prev_missing_textures != st->missing_textures) - st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM; -} +const struct st_tracked_state st_update_tessctrl_texture = { + "st_update_tessctrl_texture", /* name */ + { /* dirty */ + _NEW_TEXTURE, /* mesa */ + ST_NEW_TESSCTRL_PROGRAM | ST_NEW_SAMPLER_VIEWS, /* st */ + }, + update_tessctrl_textures /* update */ +}; -const struct st_tracked_state st_finalize_textures = { - "st_finalize_textures", /* name */ - { /* dirty */ - _NEW_TEXTURE, /* mesa */ - 0, /* st */ +const struct st_tracked_state st_update_tesseval_texture = { + "st_update_tesseval_texture", /* name */ + { /* dirty */ + _NEW_TEXTURE, /* mesa */ + ST_NEW_TESSEVAL_PROGRAM | ST_NEW_SAMPLER_VIEWS, /* st */ }, - finalize_textures /* update */ + update_tesseval_textures /* update */ }; diff --git a/src/mesa/state_tracker/st_atom_viewport.c b/src/mesa/state_tracker/st_atom_viewport.c index 2f62590c4f1..9a692cecade 100644 --- a/src/mesa/state_tracker/st_atom_viewport.c +++ b/src/mesa/state_tracker/st_atom_viewport.c @@ -64,7 +64,7 @@ update_viewport( struct st_context *st ) */ for (i = 0; i < ctx->Const.MaxViewports; i++) { - double scale[3], translate[3]; + float scale[3], translate[3]; _mesa_get_viewport_xform(ctx, i, scale, translate); st->state.viewport[i].scale[0] = scale[0]; diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c index c881e194f70..01a96c18264 100644 --- a/src/mesa/state_tracker/st_cb_bitmap.c +++ b/src/mesa/state_tracker/st_cb_bitmap.c @@ -446,8 +446,8 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, assert(height <= (GLsizei)maxSize); cso_save_rasterizer(cso); - cso_save_samplers(cso, PIPE_SHADER_FRAGMENT); - cso_save_sampler_views(cso, PIPE_SHADER_FRAGMENT); + cso_save_fragment_samplers(cso); + cso_save_fragment_sampler_views(cso); cso_save_viewport(cso); cso_save_fragment_shader(cso); cso_save_stream_outputs(cso); @@ -535,8 +535,8 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, /* restore state */ cso_restore_rasterizer(cso); - cso_restore_samplers(cso, PIPE_SHADER_FRAGMENT); - cso_restore_sampler_views(cso, PIPE_SHADER_FRAGMENT); + cso_restore_fragment_samplers(cso); + cso_restore_fragment_sampler_views(cso); cso_restore_viewport(cso); cso_restore_fragment_shader(cso); cso_restore_vertex_shader(cso); diff --git a/src/mesa/state_tracker/st_cb_blit.c b/src/mesa/state_tracker/st_cb_blit.c index 6d9371852c5..139690615d6 100644 --- 
a/src/mesa/state_tracker/st_cb_blit.c +++ b/src/mesa/state_tracker/st_cb_blit.c @@ -39,7 +39,7 @@ #include "st_cb_bitmap.h" #include "st_cb_blit.h" #include "st_cb_fbo.h" -#include "st_atom.h" +#include "st_manager.h" #include "util/u_format.h" @@ -92,7 +92,7 @@ st_BlitFramebuffer(struct gl_context *ctx, } clip; struct pipe_blit_info blit; - st_validate_state(st); + st_manager_validate_framebuffers(st); /* Make sure bitmap rendering has landed in the framebuffers */ st_flush_bitmap_cache(st); diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c index a6a98c83aa6..b372697026b 100644 --- a/src/mesa/state_tracker/st_cb_drawpixels.c +++ b/src/mesa/state_tracker/st_cb_drawpixels.c @@ -33,6 +33,7 @@ #include "main/imports.h" #include "main/image.h" #include "main/bufferobj.h" +#include "main/blit.h" #include "main/format_pack.h" #include "main/macros.h" #include "main/mtypes.h" @@ -688,8 +689,8 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, cso_save_rasterizer(cso); cso_save_viewport(cso); - cso_save_samplers(cso, PIPE_SHADER_FRAGMENT); - cso_save_sampler_views(cso, PIPE_SHADER_FRAGMENT); + cso_save_fragment_samplers(cso); + cso_save_fragment_sampler_views(cso); cso_save_fragment_shader(cso); cso_save_stream_outputs(cso); cso_save_vertex_shader(cso); @@ -756,6 +757,8 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, /* texture sampling state: */ { struct pipe_sampler_state sampler; + const struct pipe_sampler_state *states[2] = {&sampler, &sampler}; + memset(&sampler, 0, sizeof(sampler)); sampler.wrap_s = PIPE_TEX_WRAP_CLAMP; sampler.wrap_t = PIPE_TEX_WRAP_CLAMP; @@ -765,11 +768,8 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST; sampler.normalized_coords = normalized; - cso_single_sampler(cso, PIPE_SHADER_FRAGMENT, 0, &sampler); - if (num_sampler_view > 1) { - cso_single_sampler(cso, PIPE_SHADER_FRAGMENT, 1, &sampler); - } - cso_single_sampler_done(cso, PIPE_SHADER_FRAGMENT); + cso_set_samplers(cso, PIPE_SHADER_FRAGMENT, + num_sampler_view > 1 ? 2 : 1, states); } /* viewport state: viewport matching window dims */ @@ -816,8 +816,8 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, /* restore state */ cso_restore_rasterizer(cso); cso_restore_viewport(cso); - cso_restore_samplers(cso, PIPE_SHADER_FRAGMENT); - cso_restore_sampler_views(cso, PIPE_SHADER_FRAGMENT); + cso_restore_fragment_samplers(cso); + cso_restore_fragment_sampler_views(cso); cso_restore_fragment_shader(cso); cso_restore_vertex_shader(cso); cso_restore_tessctrl_shader(cso); @@ -1313,31 +1313,6 @@ st_get_color_read_renderbuffer(struct gl_context *ctx) /** - * \return TRUE if two regions overlap, FALSE otherwise - */ -static boolean -regions_overlap(int srcX0, int srcY0, - int srcX1, int srcY1, - int dstX0, int dstY0, - int dstX1, int dstY1) -{ - if (MAX2(srcX0, srcX1) < MIN2(dstX0, dstX1)) - return FALSE; /* src completely left of dst */ - - if (MAX2(dstX0, dstX1) < MIN2(srcX0, srcX1)) - return FALSE; /* dst completely left of src */ - - if (MAX2(srcY0, srcY1) < MIN2(dstY0, dstY1)) - return FALSE; /* src completely above dst */ - - if (MAX2(dstY0, dstY1) < MIN2(srcY0, srcY1)) - return FALSE; /* dst completely above src */ - - return TRUE; /* some overlap */ -} - - -/** * Try to do a glCopyPixels for simple cases with a blit by calling * pipe->blit(). 
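The local regions_overlap() helper deleted above is replaced by the shared _mesa_regions_overlap() from main/blit.h at the blit_copy_pixels call site in the hunk that follows. For reference, a stand-alone version of the same separating-axis test, runnable as-is:

#include <stdbool.h>
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define MAX2(a, b) ((a) > (b) ? (a) : (b))

static bool
regions_overlap(int sx0, int sy0, int sx1, int sy1,
                int dx0, int dy0, int dx1, int dy1)
{
   /* corners may be flipped, hence the MIN/MAX on each axis */
   if (MAX2(sx0, sx1) < MIN2(dx0, dx1) || MAX2(dx0, dx1) < MIN2(sx0, sx1))
      return false;                     /* completely disjoint in X */
   if (MAX2(sy0, sy1) < MIN2(dy0, dy1) || MAX2(dy0, dy1) < MIN2(sy0, sy1))
      return false;                     /* completely disjoint in Y */
   return true;                         /* some overlap */
}

int
main(void)
{
   printf("%d\n", regions_overlap(0, 0, 10, 10, 5, 5, 20, 20));   /* 1 */
   printf("%d\n", regions_overlap(0, 0, 10, 10, 11, 0, 20, 10));  /* 0 */
   return 0;
}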
* @@ -1420,8 +1395,8 @@ blit_copy_pixels(struct gl_context *ctx, GLint srcx, GLint srcy, } if (rbRead != rbDraw || - !regions_overlap(readX, readY, readX + readW, readY + readH, - drawX, drawY, drawX + drawW, drawY + drawH)) { + !_mesa_regions_overlap(readX, readY, readX + readW, readY + readH, + drawX, drawY, drawX + drawW, drawY + drawH)) { struct pipe_blit_info blit; memset(&blit, 0, sizeof(blit)); diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c index 0399eef7204..57075904450 100644 --- a/src/mesa/state_tracker/st_cb_fbo.c +++ b/src/mesa/state_tracker/st_cb_fbo.c @@ -511,8 +511,6 @@ st_render_texture(struct gl_context *ctx, strb->rtt_layered = att->Layered; pipe_resource_reference(&strb->texture, pt); - pipe_surface_release(pipe, &strb->surface); - st_update_renderbuffer_surface(st, strb); strb->Base.Format = st_pipe_format_to_mesa_format(pt->format); diff --git a/src/mesa/state_tracker/st_cb_perfmon.h b/src/mesa/state_tracker/st_cb_perfmon.h index 13d3627de5d..0b195de47fe 100644 --- a/src/mesa/state_tracker/st_cb_perfmon.h +++ b/src/mesa/state_tracker/st_cb_perfmon.h @@ -46,7 +46,7 @@ struct st_perf_counter_object /** * Cast wrapper */ -static INLINE struct st_perf_monitor_object * +static inline struct st_perf_monitor_object * st_perf_monitor_object(struct gl_perf_monitor_object *q) { return (struct st_perf_monitor_object *)q; diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c index 6aa7d5796d9..3029909d12d 100644 --- a/src/mesa/state_tracker/st_cb_program.c +++ b/src/mesa/state_tracker/st_cb_program.c @@ -68,6 +68,12 @@ st_bind_program(struct gl_context *ctx, GLenum target, struct gl_program *prog) case GL_GEOMETRY_PROGRAM_NV: st->dirty.st |= ST_NEW_GEOMETRY_PROGRAM; break; + case GL_TESS_CONTROL_PROGRAM_NV: + st->dirty.st |= ST_NEW_TESSCTRL_PROGRAM; + break; + case GL_TESS_EVALUATION_PROGRAM_NV: + st->dirty.st |= ST_NEW_TESSEVAL_PROGRAM; + break; } } @@ -84,6 +90,8 @@ st_use_program(struct gl_context *ctx, struct gl_shader_program *shProg) st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM; st->dirty.st |= ST_NEW_VERTEX_PROGRAM; st->dirty.st |= ST_NEW_GEOMETRY_PROGRAM; + st->dirty.st |= ST_NEW_TESSCTRL_PROGRAM; + st->dirty.st |= ST_NEW_TESSEVAL_PROGRAM; } @@ -110,6 +118,16 @@ st_new_program(struct gl_context *ctx, GLenum target, GLuint id) return _mesa_init_geometry_program(ctx, &prog->Base, target, id); } + case GL_TESS_CONTROL_PROGRAM_NV: { + struct st_tessctrl_program *prog = ST_CALLOC_STRUCT(st_tessctrl_program); + return _mesa_init_tess_ctrl_program(ctx, &prog->Base, target, id); + } + + case GL_TESS_EVALUATION_PROGRAM_NV: { + struct st_tesseval_program *prog = ST_CALLOC_STRUCT(st_tesseval_program); + return _mesa_init_tess_eval_program(ctx, &prog->Base, target, id); + } + default: assert(0); return NULL; @@ -157,6 +175,28 @@ st_delete_program(struct gl_context *ctx, struct gl_program *prog) free_glsl_to_tgsi_visitor(stfp->glsl_to_tgsi); } break; + case GL_TESS_CONTROL_PROGRAM_NV: + { + struct st_tessctrl_program *sttcp = + (struct st_tessctrl_program *) prog; + + st_release_tcp_variants(st, sttcp); + + if (sttcp->glsl_to_tgsi) + free_glsl_to_tgsi_visitor(sttcp->glsl_to_tgsi); + } + break; + case GL_TESS_EVALUATION_PROGRAM_NV: + { + struct st_tesseval_program *sttep = + (struct st_tesseval_program *) prog; + + st_release_tep_variants(st, sttep); + + if (sttep->glsl_to_tgsi) + free_glsl_to_tgsi_visitor(sttep->glsl_to_tgsi); + } + break; default: assert(0); /* problem */ } @@ -214,6 +254,24 @@ 
st_program_string_notify( struct gl_context *ctx, if (st->vp == stvp) st->dirty.st |= ST_NEW_VERTEX_PROGRAM; } + else if (target == GL_TESS_CONTROL_PROGRAM_NV) { + struct st_tessctrl_program *sttcp = + (struct st_tessctrl_program *) prog; + + st_release_tcp_variants(st, sttcp); + + if (st->tcp == sttcp) + st->dirty.st |= ST_NEW_TESSCTRL_PROGRAM; + } + else if (target == GL_TESS_EVALUATION_PROGRAM_NV) { + struct st_tesseval_program *sttep = + (struct st_tesseval_program *) prog; + + st_release_tep_variants(st, sttep); + + if (st->tep == sttep) + st->dirty.st |= ST_NEW_TESSEVAL_PROGRAM; + } if (ST_DEBUG & DEBUG_PRECOMPILE) st_precompile_shader_variant(st, prog); diff --git a/src/mesa/state_tracker/st_cb_rasterpos.c b/src/mesa/state_tracker/st_cb_rasterpos.c index 272cbb91d52..b9997dacfd2 100644 --- a/src/mesa/state_tracker/st_cb_rasterpos.c +++ b/src/mesa/state_tracker/st_cb_rasterpos.c @@ -254,7 +254,7 @@ st_RasterPos(struct gl_context *ctx, const GLfloat v[4]) * st_feedback_draw_vbo doesn't check for that flag. */ ctx->Array._DrawArrays = rs->arrays; st_feedback_draw_vbo(ctx, &rs->prim, 1, NULL, GL_TRUE, 0, 1, - NULL, NULL); + NULL, 0, NULL); ctx->Array._DrawArrays = saved_arrays; /* restore draw's rasterization stage depending on rendermode */ diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c index d95a608d32e..18ea43fa71a 100644 --- a/src/mesa/state_tracker/st_cb_readpixels.c +++ b/src/mesa/state_tracker/st_cb_readpixels.c @@ -43,6 +43,30 @@ #include "state_tracker/st_format.h" #include "state_tracker/st_texture.h" +static boolean +needs_integer_signed_unsigned_conversion(const struct gl_context *ctx, + GLenum format, GLenum type) +{ + struct gl_renderbuffer *rb = + _mesa_get_read_renderbuffer_for_format(ctx, format); + + assert(rb); + + GLenum srcType = _mesa_get_format_datatype(rb->Format); + + if ((srcType == GL_INT && + (type == GL_UNSIGNED_INT || + type == GL_UNSIGNED_SHORT || + type == GL_UNSIGNED_BYTE)) || + (srcType == GL_UNSIGNED_INT && + (type == GL_INT || + type == GL_SHORT || + type == GL_BYTE))) { + return TRUE; + } + + return FALSE; +} /** * This uses a blit to copy the read buffer to a texture format which matches @@ -123,6 +147,10 @@ st_readpixels(struct gl_context *ctx, GLint x, GLint y, goto fallback; } + if (needs_integer_signed_unsigned_conversion(ctx, format, type)) { + goto fallback; + } + /* Convert the source format to what is expected by ReadPixels * and see if it's supported. */ src_format = util_format_linear(src->format); diff --git a/src/mesa/state_tracker/st_cb_syncobj.c b/src/mesa/state_tracker/st_cb_syncobj.c index 6d875b851a2..ec2687fba53 100644 --- a/src/mesa/state_tracker/st_cb_syncobj.c +++ b/src/mesa/state_tracker/st_cb_syncobj.c @@ -81,7 +81,13 @@ static void st_check_sync(struct gl_context *ctx, struct gl_sync_object *obj) struct pipe_screen *screen = st_context(ctx)->pipe->screen; struct st_sync_object *so = (struct st_sync_object*)obj; - if (so->fence && screen->fence_signalled(screen, so->fence)) { + /* If the fence doesn't exist, assume it's signalled. */ + if (!so->fence) { + so->b.StatusFlag = GL_TRUE; + return; + } + + if (screen->fence_finish(screen, so->fence, 0)) { screen->fence_reference(screen, &so->fence, NULL); so->b.StatusFlag = GL_TRUE; } @@ -94,6 +100,12 @@ static void st_client_wait_sync(struct gl_context *ctx, struct pipe_screen *screen = st_context(ctx)->pipe->screen; struct st_sync_object *so = (struct st_sync_object*)obj; + /* If the fence doesn't exist, assume it's signalled. 
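The needs_integer_signed_unsigned_conversion() helper added to st_cb_readpixels.c above routes signed/unsigned integer read-backs to the software fallback instead of the blit fast path. A stand-alone sketch of that classification; the GLenum defines use the standard GL token values, and the function name here is local to the example:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int GLenum;
#define GL_BYTE            0x1400
#define GL_UNSIGNED_BYTE   0x1401
#define GL_SHORT           0x1402
#define GL_UNSIGNED_SHORT  0x1403
#define GL_INT             0x1404
#define GL_UNSIGNED_INT    0x1405

static bool
needs_signed_unsigned_conversion(GLenum src_type, GLenum dst_type)
{
   /* signed source read back as an unsigned type */
   if (src_type == GL_INT)
      return dst_type == GL_UNSIGNED_INT ||
             dst_type == GL_UNSIGNED_SHORT ||
             dst_type == GL_UNSIGNED_BYTE;
   /* unsigned source read back as a signed type */
   if (src_type == GL_UNSIGNED_INT)
      return dst_type == GL_INT ||
             dst_type == GL_SHORT ||
             dst_type == GL_BYTE;
   return false;
}

int
main(void)
{
   printf("%d\n", needs_signed_unsigned_conversion(GL_INT, GL_UNSIGNED_BYTE));        /* 1 */
   printf("%d\n", needs_signed_unsigned_conversion(GL_UNSIGNED_INT, GL_UNSIGNED_INT)); /* 0 */
   return 0;
}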
*/ + if (!so->fence) { + so->b.StatusFlag = GL_TRUE; + return; + } + /* We don't care about GL_SYNC_FLUSH_COMMANDS_BIT, because flush is * already called when creating a fence. */ diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c index 7ea3846fff1..715d69c0c68 100644 --- a/src/mesa/state_tracker/st_cb_texture.c +++ b/src/mesa/state_tracker/st_cb_texture.c @@ -896,7 +896,7 @@ st_CompressedTexImage(struct gl_context *ctx, GLuint dims, /** - * Called via ctx->Driver.GetTexImage() + * Called via ctx->Driver.GetTexSubImage() * * This uses a blit to copy the texture to a texture format which matches * the format and type combo and then a fast read-back is done using memcpy. @@ -910,16 +910,15 @@ st_CompressedTexImage(struct gl_context *ctx, GLuint dims, * we do here should be free in such cases. */ static void -st_GetTexImage(struct gl_context * ctx, - GLenum format, GLenum type, GLvoid * pixels, - struct gl_texture_image *texImage) +st_GetTexSubImage(struct gl_context * ctx, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLint depth, + GLenum format, GLenum type, GLvoid * pixels, + struct gl_texture_image *texImage) { struct st_context *st = st_context(ctx); struct pipe_context *pipe = st->pipe; struct pipe_screen *screen = pipe->screen; - GLuint width = texImage->Width; - GLuint height = texImage->Height; - GLuint depth = texImage->Depth; struct st_texture_image *stImage = st_texture_image(texImage); struct st_texture_object *stObj = st_texture_object(texImage->TexObject); struct pipe_resource *src = stObj->pt; @@ -1054,7 +1053,7 @@ st_GetTexImage(struct gl_context * ctx, } } - /* create the destination texture */ + /* create the destination texture of size (width X height X depth) */ memset(&dst_templ, 0, sizeof(dst_templ)); dst_templ.target = pipe_target; dst_templ.format = dst_format; @@ -1076,6 +1075,10 @@ st_GetTexImage(struct gl_context * ctx, height = 1; } + assert(texImage->Face == 0 || + texImage->TexObject->MinLayer == 0 || + zoffset == 0); + memset(&blit, 0, sizeof(blit)); blit.src.resource = src; blit.src.level = texImage->Level + texImage->TexObject->MinLevel; @@ -1083,9 +1086,11 @@ st_GetTexImage(struct gl_context * ctx, blit.dst.resource = dst; blit.dst.level = 0; blit.dst.format = dst->format; - blit.src.box.x = blit.dst.box.x = 0; - blit.src.box.y = blit.dst.box.y = 0; - blit.src.box.z = texImage->Face + texImage->TexObject->MinLayer; + blit.src.box.x = xoffset; + blit.dst.box.x = 0; + blit.src.box.y = yoffset; + blit.dst.box.y = 0; + blit.src.box.z = texImage->Face + texImage->TexObject->MinLayer + zoffset; blit.dst.box.z = 0; blit.src.box.width = blit.dst.box.width = width; blit.src.box.height = blit.dst.box.height = height; @@ -1206,7 +1211,9 @@ end: fallback: if (!done) { - _mesa_GetTexImage_sw(ctx, format, type, pixels, texImage); + _mesa_GetTexSubImage_sw(ctx, xoffset, yoffset, zoffset, + width, height, depth, + format, type, pixels, texImage); } } @@ -1876,11 +1883,11 @@ st_init_texture_functions(struct dd_function_table *functions) functions->CopyTexSubImage = st_CopyTexSubImage; functions->GenerateMipmap = st_generate_mipmap; - functions->GetTexImage = st_GetTexImage; + functions->GetTexSubImage = st_GetTexSubImage; /* compressed texture functions */ functions->CompressedTexImage = st_CompressedTexImage; - functions->GetCompressedTexImage = _mesa_GetCompressedTexImage_sw; + functions->GetCompressedTexSubImage = _mesa_GetCompressedTexSubImage_sw; functions->NewTextureObject = 
st_NewTextureObject; functions->NewTextureImage = st_NewTextureImage; diff --git a/src/mesa/state_tracker/st_cb_xformfb.c b/src/mesa/state_tracker/st_cb_xformfb.c index 07c118e227b..0c01cd5ab78 100644 --- a/src/mesa/state_tracker/st_cb_xformfb.c +++ b/src/mesa/state_tracker/st_cb_xformfb.c @@ -54,9 +54,9 @@ struct st_transform_feedback_object { struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS]; /* This encapsulates the count that can be used as a source for draw_vbo. - * It contains a stream output target from the last call of - * EndTransformFeedback. */ - struct pipe_stream_output_target *draw_count; + * It contains stream output targets from the last call of + * EndTransformFeedback for each stream. */ + struct pipe_stream_output_target *draw_count[MAX_VERTEX_STREAMS]; }; static inline struct st_transform_feedback_object * @@ -88,7 +88,8 @@ st_delete_transform_feedback(struct gl_context *ctx, st_transform_feedback_object(obj); unsigned i; - pipe_so_target_reference(&sobj->draw_count, NULL); + for (i = 0; i < ARRAY_SIZE(sobj->draw_count); i++) + pipe_so_target_reference(&sobj->draw_count[i], NULL); /* Unreference targets. */ for (i = 0; i < sobj->num_targets; i++) { @@ -123,9 +124,12 @@ st_begin_transform_feedback(struct gl_context *ctx, GLenum mode, struct st_buffer_object *bo = st_buffer_object(sobj->base.Buffers[i]); if (bo && bo->buffer) { + unsigned stream = + obj->shader_program->LinkedTransformFeedback.BufferStream[i]; + /* Check whether we need to recreate the target. */ if (!sobj->targets[i] || - sobj->targets[i] == sobj->draw_count || + sobj->targets[i] == sobj->draw_count[stream] || sobj->targets[i]->buffer != bo->buffer || sobj->targets[i]->buffer_offset != sobj->base.Offset[i] || sobj->targets[i]->buffer_size != sobj->base.Size[i]) { @@ -178,24 +182,6 @@ st_resume_transform_feedback(struct gl_context *ctx, } -static struct pipe_stream_output_target * -st_transform_feedback_get_draw_target(struct gl_transform_feedback_object *obj) -{ - struct st_transform_feedback_object *sobj = - st_transform_feedback_object(obj); - unsigned i; - - for (i = 0; i < ARRAY_SIZE(sobj->targets); i++) { - if (sobj->targets[i]) { - return sobj->targets[i]; - } - } - - assert(0); - return NULL; -} - - static void st_end_transform_feedback(struct gl_context *ctx, struct gl_transform_feedback_object *obj) @@ -203,22 +189,41 @@ st_end_transform_feedback(struct gl_context *ctx, struct st_context *st = st_context(ctx); struct st_transform_feedback_object *sobj = st_transform_feedback_object(obj); + unsigned i; cso_set_stream_outputs(st->cso_context, 0, NULL, NULL); - pipe_so_target_reference(&sobj->draw_count, - st_transform_feedback_get_draw_target(obj)); + /* The next call to glDrawTransformFeedbackStream should use the vertex + * count from the last call to glEndTransformFeedback. + * Therefore, save the targets for each stream. + * + * NULL means the vertex counter is 0 (initial state). + */ + for (i = 0; i < ARRAY_SIZE(sobj->draw_count); i++) + pipe_so_target_reference(&sobj->draw_count[i], NULL); + + for (i = 0; i < ARRAY_SIZE(sobj->targets); i++) { + unsigned stream = + obj->shader_program->LinkedTransformFeedback.BufferStream[i]; + + /* Is it not bound or already set for this stream? 
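The st_cb_xformfb.c rework above (continued in the loop below) keeps one stream-output target per vertex stream instead of a single draw_count, so a later glDrawTransformFeedbackStream can source its vertex count from the stream it names. A minimal model of that bookkeeping; the so_target struct and the buffer-to-stream table are stand-ins for the pipe/st structures:

#include <stdio.h>

#define MAX_BUFFERS 4
#define MAX_STREAMS 4

struct so_target { int id; };          /* stand-in for pipe_stream_output_target */

int
main(void)
{
   struct so_target bufs[MAX_BUFFERS] = { {10}, {11}, {12}, {13} };
   struct so_target *targets[MAX_BUFFERS] = { &bufs[0], &bufs[1], NULL, &bufs[3] };
   unsigned buffer_stream[MAX_BUFFERS] = { 0, 1, 1, 1 };   /* per-buffer stream table */
   struct so_target *draw_count[MAX_STREAMS] = { NULL };

   /* EndTransformFeedback: remember the first bound target of each stream */
   for (unsigned i = 0; i < MAX_BUFFERS; i++) {
      unsigned stream = buffer_stream[i];
      if (!targets[i] || draw_count[stream])
         continue;
      draw_count[stream] = targets[i];
   }

   /* DrawTransformFeedbackStream(stream = 1) would read its count from: */
   printf("stream 1 counter comes from target %d\n",
          draw_count[1] ? draw_count[1]->id : -1);          /* 11 */
   return 0;
}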
*/ + if (!sobj->targets[i] || sobj->draw_count[stream]) + continue; + + pipe_so_target_reference(&sobj->draw_count[stream], sobj->targets[i]); + } } -void +bool st_transform_feedback_draw_init(struct gl_transform_feedback_object *obj, - struct pipe_draw_info *out) + unsigned stream, struct pipe_draw_info *out) { struct st_transform_feedback_object *sobj = st_transform_feedback_object(obj); - out->count_from_stream_output = sobj->draw_count; + out->count_from_stream_output = sobj->draw_count[stream]; + return out->count_from_stream_output != NULL; } diff --git a/src/mesa/state_tracker/st_cb_xformfb.h b/src/mesa/state_tracker/st_cb_xformfb.h index 998c418257b..444d11842c5 100644 --- a/src/mesa/state_tracker/st_cb_xformfb.h +++ b/src/mesa/state_tracker/st_cb_xformfb.h @@ -38,9 +38,9 @@ struct pipe_draw_info; extern void st_init_xformfb_functions(struct dd_function_table *functions); -extern void +extern bool st_transform_feedback_draw_init(struct gl_transform_feedback_object *obj, - struct pipe_draw_info *out); + unsigned stream, struct pipe_draw_info *out); #endif /* ST_CB_XFORMFB_H */ diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c index ed9ed0f1b6c..72c23cad4bc 100644 --- a/src/mesa/state_tracker/st_context.c +++ b/src/mesa/state_tracker/st_context.c @@ -287,6 +287,11 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe, /* For vertex shaders, make sure not to emit saturate when SM 3.0 is not supported */ ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoSat = !st->has_shader_model3; + if (!ctx->Extensions.ARB_gpu_shader5) { + for (i = 0; i < MESA_SHADER_STAGES; i++) + ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectSampler = true; + } + _mesa_compute_version(ctx); if (ctx->Version == 0) { @@ -308,6 +313,8 @@ static void st_init_driver_flags(struct gl_driver_flags *f) f->NewArray = ST_NEW_VERTEX_ARRAYS; f->NewRasterizerDiscard = ST_NEW_RASTERIZER; f->NewUniformBuffer = ST_NEW_UNIFORM_BUFFER; + f->NewDefaultTessLevels = ST_NEW_TESS_STATE; + f->NewTextureBuffer = ST_NEW_SAMPLER_VIEWS; } struct st_context *st_create_context(gl_api api, struct pipe_context *pipe, @@ -369,6 +376,8 @@ void st_destroy_context( struct st_context *st ) st_reference_fragprog(st, &st->fp, NULL); st_reference_geomprog(st, &st->gp, NULL); st_reference_vertprog(st, &st->vp, NULL); + st_reference_tesscprog(st, &st->tcp, NULL); + st_reference_tesseprog(st, &st->tep, NULL); /* release framebuffer surfaces */ for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h index dac5a4b9006..81d5480431a 100644 --- a/src/mesa/state_tracker/st_context.h +++ b/src/mesa/state_tracker/st_context.h @@ -53,11 +53,14 @@ struct u_upload_mgr; #define ST_NEW_FRAGMENT_PROGRAM (1 << 1) #define ST_NEW_VERTEX_PROGRAM (1 << 2) #define ST_NEW_FRAMEBUFFER (1 << 3) -/* gap, re-use it */ +#define ST_NEW_TESS_STATE (1 << 4) #define ST_NEW_GEOMETRY_PROGRAM (1 << 5) #define ST_NEW_VERTEX_ARRAYS (1 << 6) #define ST_NEW_RASTERIZER (1 << 7) #define ST_NEW_UNIFORM_BUFFER (1 << 8) +#define ST_NEW_TESSCTRL_PROGRAM (1 << 9) +#define ST_NEW_TESSEVAL_PROGRAM (1 << 10) +#define ST_NEW_SAMPLER_VIEWS (1 << 11) struct st_state_flags { @@ -137,7 +140,6 @@ struct st_context struct st_state_flags dirty; - GLboolean missing_textures; GLboolean vertdata_edgeflags; GLboolean edgeflag_culls_prims; @@ -147,10 +149,14 @@ struct st_context struct st_vertex_program *vp; /**< Currently bound vertex program */ struct 
st_fragment_program *fp; /**< Currently bound fragment program */ struct st_geometry_program *gp; /**< Currently bound geometry program */ + struct st_tessctrl_program *tcp; /**< Currently bound tess control program */ + struct st_tesseval_program *tep; /**< Currently bound tess eval program */ struct st_vp_variant *vp_variant; struct st_fp_variant *fp_variant; struct st_gp_variant *gp_variant; + struct st_tcp_variant *tcp_variant; + struct st_tep_variant *tep_variant; struct gl_texture_object *default_texture; @@ -272,6 +278,29 @@ st_fb_orientation(const struct gl_framebuffer *fb) } +static inline unsigned +st_shader_stage_to_ptarget(gl_shader_stage stage) +{ + switch (stage) { + case MESA_SHADER_VERTEX: + return PIPE_SHADER_VERTEX; + case MESA_SHADER_FRAGMENT: + return PIPE_SHADER_FRAGMENT; + case MESA_SHADER_GEOMETRY: + return PIPE_SHADER_GEOMETRY; + case MESA_SHADER_TESS_CTRL: + return PIPE_SHADER_TESS_CTRL; + case MESA_SHADER_TESS_EVAL: + return PIPE_SHADER_TESS_EVAL; + case MESA_SHADER_COMPUTE: + return PIPE_SHADER_COMPUTE; + } + + assert(!"should not be reached"); + return PIPE_SHADER_VERTEX; +} + + /** clear-alloc a struct-sized object, with casting */ #define ST_CALLOC_STRUCT(T) (struct T *) calloc(1, sizeof(struct T)) diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c index 8b43582c14b..957fcfd410e 100644 --- a/src/mesa/state_tracker/st_draw.c +++ b/src/mesa/state_tracker/st_draw.c @@ -164,6 +164,7 @@ translate_prim(const struct gl_context *ctx, unsigned prim) STATIC_ASSERT(GL_POINTS == PIPE_PRIM_POINTS); STATIC_ASSERT(GL_QUADS == PIPE_PRIM_QUADS); STATIC_ASSERT(GL_TRIANGLE_STRIP_ADJACENCY == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY); + STATIC_ASSERT(GL_PATCHES == PIPE_PRIM_PATCHES); return prim; } @@ -183,6 +184,7 @@ st_draw_vbo(struct gl_context *ctx, GLuint min_index, GLuint max_index, struct gl_transform_feedback_object *tfb_vertcount, + unsigned stream, struct gl_buffer_object *indirect) { struct st_context *st = st_context(ctx); @@ -241,7 +243,8 @@ st_draw_vbo(struct gl_context *ctx, /* Transform feedback drawing is always non-indexed. */ /* Set info.count_from_stream_output. 
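The new st_shader_stage_to_ptarget() helper in st_context.h above lets update_textures() take a gl_shader_stage and derive both the GLSL-version lookup and the pipe shader target from it. A self-contained illustration of that one-to-one mapping; the enum names are local stand-ins for MESA_SHADER_* and PIPE_SHADER_*:

#include <stdio.h>

enum stage  { SH_VERTEX, SH_TESS_CTRL, SH_TESS_EVAL, SH_GEOMETRY, SH_FRAGMENT, SH_COMPUTE };
enum target { PT_VERTEX, PT_FRAGMENT, PT_GEOMETRY, PT_TESS_CTRL, PT_TESS_EVAL, PT_COMPUTE };

static enum target
stage_to_ptarget(enum stage s)
{
   switch (s) {
   case SH_VERTEX:    return PT_VERTEX;
   case SH_TESS_CTRL: return PT_TESS_CTRL;
   case SH_TESS_EVAL: return PT_TESS_EVAL;
   case SH_GEOMETRY:  return PT_GEOMETRY;
   case SH_FRAGMENT:  return PT_FRAGMENT;
   case SH_COMPUTE:   return PT_COMPUTE;
   }
   return PT_VERTEX;                    /* unreachable for valid input */
}

int
main(void)
{
   printf("tess eval stage -> pipe target %d\n", stage_to_ptarget(SH_TESS_EVAL));
   return 0;
}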
*/ if (tfb_vertcount) { - st_transform_feedback_draw_init(tfb_vertcount, &info); + if (!st_transform_feedback_draw_init(tfb_vertcount, stream, &info)) + return; } } @@ -260,6 +263,7 @@ st_draw_vbo(struct gl_context *ctx, info.count = prims[i].count; info.start_instance = prims[i].base_instance; info.instance_count = prims[i].num_instances; + info.vertices_per_patch = ctx->TessCtrlProgram.patch_vertices; info.index_bias = prims[i].basevertex; if (!ib) { info.min_index = info.start; diff --git a/src/mesa/state_tracker/st_draw.h b/src/mesa/state_tracker/st_draw.h index 780d4bde713..a973c8a4a5d 100644 --- a/src/mesa/state_tracker/st_draw.h +++ b/src/mesa/state_tracker/st_draw.h @@ -56,6 +56,7 @@ st_draw_vbo(struct gl_context *ctx, GLuint min_index, GLuint max_index, struct gl_transform_feedback_object *tfb_vertcount, + unsigned stream, struct gl_buffer_object *indirect); extern void @@ -67,6 +68,7 @@ st_feedback_draw_vbo(struct gl_context *ctx, GLuint min_index, GLuint max_index, struct gl_transform_feedback_object *tfb_vertcount, + unsigned stream, struct gl_buffer_object *indirect); /** diff --git a/src/mesa/state_tracker/st_draw_feedback.c b/src/mesa/state_tracker/st_draw_feedback.c index 177f6b5aefa..88c10a8f150 100644 --- a/src/mesa/state_tracker/st_draw_feedback.c +++ b/src/mesa/state_tracker/st_draw_feedback.c @@ -117,6 +117,7 @@ st_feedback_draw_vbo(struct gl_context *ctx, GLuint min_index, GLuint max_index, struct gl_transform_feedback_object *tfb_vertcount, + unsigned stream, struct gl_buffer_object *indirect) { struct st_context *st = st_context(ctx); diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c index b1057f3eadd..17f572f80fb 100644 --- a/src/mesa/state_tracker/st_extensions.c +++ b/src/mesa/state_tracker/st_extensions.c @@ -165,6 +165,14 @@ void st_init_limits(struct pipe_screen *screen, pc = &c->Program[MESA_SHADER_GEOMETRY]; options = &c->ShaderCompilerOptions[MESA_SHADER_GEOMETRY]; break; + case PIPE_SHADER_TESS_CTRL: + pc = &c->Program[MESA_SHADER_TESS_CTRL]; + options = &c->ShaderCompilerOptions[MESA_SHADER_TESS_CTRL]; + break; + case PIPE_SHADER_TESS_EVAL: + pc = &c->Program[MESA_SHADER_TESS_EVAL]; + options = &c->ShaderCompilerOptions[MESA_SHADER_TESS_EVAL]; + break; default: /* compute shader, etc. 
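The info.vertices_per_patch assignment added to st_draw_vbo() above feeds the patch size to the driver for GL_PATCHES draws: the vertex count is consumed in groups of ctx->TessCtrlProgram.patch_vertices. A quick arithmetic illustration, plain C with made-up numbers:

#include <stdio.h>

int
main(void)
{
   unsigned patch_vertices = 4;    /* glPatchParameteri(GL_PATCH_VERTICES, 4) */
   unsigned draw_count     = 20;   /* glDrawArrays(GL_PATCHES, 0, 20)         */

   printf("patches launched: %u\n", draw_count / patch_vertices);   /* 5 */
   return 0;
}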
*/ continue; @@ -245,8 +253,12 @@ void st_init_limits(struct pipe_screen *screen, options->LowerClipDistance = true; } + c->LowerTessLevel = true; + c->MaxCombinedTextureImageUnits = _min(c->Program[MESA_SHADER_VERTEX].MaxTextureImageUnits + + c->Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits + + c->Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits + c->Program[MESA_SHADER_GEOMETRY].MaxTextureImageUnits + c->Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits, MAX_COMBINED_TEXTURE_IMAGE_UNITS); @@ -266,6 +278,9 @@ void st_init_limits(struct pipe_screen *screen, c->MaxVarying = MIN2(c->MaxVarying, MAX_VARYING); c->MaxGeometryOutputVertices = screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES); c->MaxGeometryTotalOutputComponents = screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS); + c->MaxTessPatchComponents = + MAX2(screen->get_param(screen, PIPE_CAP_MAX_SHADER_PATCH_VARYINGS), + MAX_VARYING) * 4; c->MinProgramTexelOffset = screen->get_param(screen, PIPE_CAP_MIN_TEXEL_OFFSET); c->MaxProgramTexelOffset = screen->get_param(screen, PIPE_CAP_MAX_TEXEL_OFFSET); @@ -301,6 +316,8 @@ void st_init_limits(struct pipe_screen *screen, screen->get_param(screen, PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT); c->MaxCombinedUniformBlocks = c->MaxUniformBufferBindings = c->Program[MESA_SHADER_VERTEX].MaxUniformBlocks + + c->Program[MESA_SHADER_TESS_CTRL].MaxUniformBlocks + + c->Program[MESA_SHADER_TESS_EVAL].MaxUniformBlocks + c->Program[MESA_SHADER_GEOMETRY].MaxUniformBlocks + c->Program[MESA_SHADER_FRAGMENT].MaxUniformBlocks; assert(c->MaxCombinedUniformBlocks <= MAX_COMBINED_UNIFORM_BUFFERS); @@ -417,12 +434,14 @@ void st_init_extensions(struct pipe_screen *screen, static const struct st_extension_cap_mapping cap_mapping[] = { { o(ARB_base_instance), PIPE_CAP_START_INSTANCE }, - { o(ARB_buffer_storage), PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT }, + { o(ARB_buffer_storage), PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT }, + { o(ARB_color_buffer_float), PIPE_CAP_VERTEX_COLOR_UNCLAMPED }, { o(ARB_depth_clamp), PIPE_CAP_DEPTH_CLIP_DISABLE }, { o(ARB_depth_texture), PIPE_CAP_TEXTURE_SHADOW_MAP }, { o(ARB_draw_buffers_blend), PIPE_CAP_INDEP_BLEND_FUNC }, { o(ARB_draw_instanced), PIPE_CAP_TGSI_INSTANCEID }, { o(ARB_fragment_program_shadow), PIPE_CAP_TEXTURE_SHADOW_MAP }, + { o(ARB_framebuffer_object), PIPE_CAP_MIXED_FRAMEBUFFER_SIZES }, { o(ARB_instanced_arrays), PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR }, { o(ARB_occlusion_query), PIPE_CAP_OCCLUSION_QUERY }, { o(ARB_occlusion_query2), PIPE_CAP_OCCLUSION_QUERY }, @@ -432,6 +451,8 @@ void st_init_extensions(struct pipe_screen *screen, { o(ARB_shader_stencil_export), PIPE_CAP_SHADER_STENCIL_EXPORT }, { o(ARB_shader_texture_lod), PIPE_CAP_SM3 }, { o(ARB_shadow), PIPE_CAP_TEXTURE_SHADOW_MAP }, + { o(ARB_texture_buffer_object), PIPE_CAP_TEXTURE_BUFFER_OBJECTS }, + { o(ARB_texture_gather), PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS }, { o(ARB_texture_mirror_clamp_to_edge), PIPE_CAP_TEXTURE_MIRROR_CLAMP }, { o(ARB_texture_non_power_of_two), PIPE_CAP_NPOT_TEXTURES }, { o(ARB_timer_query), PIPE_CAP_QUERY_TIMESTAMP }, @@ -452,11 +473,14 @@ void st_init_extensions(struct pipe_screen *screen, { o(ATI_separate_stencil), PIPE_CAP_TWO_SIDED_STENCIL }, { o(ATI_texture_mirror_once), PIPE_CAP_TEXTURE_MIRROR_CLAMP }, { o(NV_conditional_render), PIPE_CAP_CONDITIONAL_RENDER }, + { o(NV_primitive_restart), PIPE_CAP_PRIMITIVE_RESTART }, { o(NV_texture_barrier), PIPE_CAP_TEXTURE_BARRIER }, /* GL_NV_point_sprite is not supported by gallium because we don't 
* support the GL_POINT_SPRITE_R_MODE_NV option. */ { o(OES_standard_derivatives), PIPE_CAP_SM3 }, + { o(OES_texture_float_linear), PIPE_CAP_TEXTURE_FLOAT_LINEAR }, + { o(OES_texture_half_float_linear), PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR }, { o(ARB_texture_cube_map_array), PIPE_CAP_CUBE_MAP_ARRAY }, { o(ARB_texture_multisample), PIPE_CAP_TEXTURE_MULTISAMPLE }, { o(ARB_texture_query_lod), PIPE_CAP_TEXTURE_QUERY_LOD }, @@ -467,6 +491,7 @@ void st_init_extensions(struct pipe_screen *screen, { o(ARB_texture_view), PIPE_CAP_SAMPLER_VIEW_TARGET }, { o(ARB_clip_control), PIPE_CAP_CLIP_HALFZ }, { o(EXT_polygon_offset_clamp), PIPE_CAP_POLYGON_OFFSET_CLAMP }, + { o(EXT_depth_bounds_test), PIPE_CAP_DEPTH_BOUNDS_TEST }, }; /* Required: render target and sampler support */ @@ -475,6 +500,12 @@ void st_init_extensions(struct pipe_screen *screen, { PIPE_FORMAT_R32G32B32A32_FLOAT, PIPE_FORMAT_R16G16B16A16_FLOAT } }, + { { o(OES_texture_float) }, + { PIPE_FORMAT_R32G32B32A32_FLOAT } }, + + { { o(OES_texture_half_float) }, + { PIPE_FORMAT_R16G16B16A16_FLOAT } }, + { { o(ARB_texture_rgb10_a2ui) }, { PIPE_FORMAT_R10G10B10A2_UINT, PIPE_FORMAT_B10G10R10A2_UINT }, @@ -556,7 +587,8 @@ void st_init_extensions(struct pipe_screen *screen, PIPE_FORMAT_R8G8B8A8_UNORM }, GL_TRUE }, /* at least one format must be supported */ - { { o(ARB_stencil_texturing) }, + { { o(ARB_stencil_texturing), + o(ARB_texture_stencil8) }, { PIPE_FORMAT_X24S8_UINT, PIPE_FORMAT_S8X24_UINT }, GL_TRUE }, /* at least one format must be supported */ @@ -650,9 +682,6 @@ void st_init_extensions(struct pipe_screen *screen, ARRAY_SIZE(vertex_mapping), PIPE_BUFFER, PIPE_BIND_VERTEX_BUFFER); - if (extensions->ARB_stencil_texturing) - extensions->ARB_texture_stencil8 = GL_TRUE; - /* Figure out GLSL support. */ glsl_feature_level = screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL); @@ -693,6 +722,7 @@ void st_init_extensions(struct pipe_screen *screen, extensions->OES_depth_texture_cube_map = GL_TRUE; extensions->ARB_shading_language_420pack = GL_TRUE; extensions->ARB_texture_query_levels = GL_TRUE; + extensions->ARB_shader_subroutine = GL_TRUE; if (!options->disable_shader_bit_encoding) { extensions->ARB_shader_bit_encoding = GL_TRUE; @@ -723,20 +753,9 @@ void st_init_extensions(struct pipe_screen *screen, extensions->ANGLE_texture_compression_dxt = GL_FALSE; } - if (screen->get_shader_param(screen, PIPE_SHADER_GEOMETRY, + if (screen->get_shader_param(screen, PIPE_SHADER_TESS_CTRL, PIPE_SHADER_CAP_MAX_INSTRUCTIONS) > 0) { -#if 0 /* XXX re-enable when GLSL compiler again supports geometry shaders */ - extensions->ARB_geometry_shader4 = GL_TRUE; -#endif - } - - if (screen->get_param(screen, PIPE_CAP_PRIMITIVE_RESTART)) { - extensions->NV_primitive_restart = GL_TRUE; - } - - /* ARB_color_buffer_float. 
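Several extensions that used to be enabled by hand-written get_param checks (ARB_color_buffer_float, ARB_framebuffer_object, ARB_texture_buffer_object, ARB_texture_gather, NV_primitive_restart) move into the cap_mapping table above. A minimal sketch of that table-driven idiom, assuming a stand-in query callback and a trimmed-down extensions struct rather than the gallium screen interface:

   /* Illustrative only: the struct, cap IDs and query() are stand-ins. */
   #include <stdbool.h>
   #include <stddef.h>

   struct extensions { bool buffer_storage, depth_clamp, primitive_restart; };

   struct cap_mapping {
      size_t ext_offset;   /* offsetof() into struct extensions */
      int    cap;          /* capability to query from the driver */
   };

   void
   enable_from_caps(struct extensions *ext,
                    const struct cap_mapping *map, size_t n,
                    int (*query)(int cap))
   {
      for (size_t i = 0; i < n; i++) {
         if (query(map[i].cap) > 0)
            *(bool *)((char *)ext + map[i].ext_offset) = true;
      }
   }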
*/ - if (screen->get_param(screen, PIPE_CAP_VERTEX_COLOR_UNCLAMPED)) { - extensions->ARB_color_buffer_float = GL_TRUE; + extensions->ARB_tessellation_shader = GL_TRUE; } if (screen->fence_finish) { @@ -823,9 +842,7 @@ void st_init_extensions(struct pipe_screen *screen, consts->MinMapBufferAlignment = screen->get_param(screen, PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT); - if (screen->get_param(screen, PIPE_CAP_TEXTURE_BUFFER_OBJECTS)) { - extensions->ARB_texture_buffer_object = GL_TRUE; - + if (extensions->ARB_texture_buffer_object) { consts->MaxTextureBufferSize = _min(screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE), (1u << 31) - 1); @@ -840,10 +857,6 @@ void st_init_extensions(struct pipe_screen *screen, PIPE_BIND_SAMPLER_VIEW); } - if (screen->get_param(screen, PIPE_CAP_MIXED_FRAMEBUFFER_SIZES)) { - extensions->ARB_framebuffer_object = GL_TRUE; - } - /* Unpacking a varying in the fragment shader costs 1 texture indirection. * If the number of available texture indirections is very limited, then we * prefer to disable varying packing rather than run the risk of varying @@ -868,9 +881,6 @@ void st_init_extensions(struct pipe_screen *screen, extensions->AMD_vertex_shader_viewport_index = GL_TRUE; } - if (consts->MaxProgramTextureGatherComponents > 0) - extensions->ARB_texture_gather = GL_TRUE; - /* GL_ARB_ES3_compatibility. * * Assume that ES3 is supported if GLSL 3.30 is supported. diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index 25e30c7deb2..6f007273c73 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -158,9 +158,12 @@ public: { this->file = file; this->index = index; + this->index2D = 0; this->writemask = writemask; this->cond_mask = COND_TR; this->reladdr = NULL; + this->reladdr2 = NULL; + this->has_index2 = false; this->type = type; this->array_id = 0; } @@ -169,9 +172,12 @@ public: { this->file = file; this->index = 0; + this->index2D = 0; this->writemask = writemask; this->cond_mask = COND_TR; this->reladdr = NULL; + this->reladdr2 = NULL; + this->has_index2 = false; this->type = type; this->array_id = 0; } @@ -181,9 +187,12 @@ public: this->type = GLSL_TYPE_ERROR; this->file = PROGRAM_UNDEFINED; this->index = 0; + this->index2D = 0; this->writemask = 0; this->cond_mask = COND_TR; this->reladdr = NULL; + this->reladdr2 = NULL; + this->has_index2 = false; this->array_id = 0; } @@ -191,11 +200,14 @@ public: gl_register_file file; /**< PROGRAM_* from Mesa */ int index; /**< temporary index, VERT_ATTRIB_*, VARYING_SLOT_*, etc. */ + int index2D; int writemask; /**< Bitfield of WRITEMASK_[XYZW] */ GLuint cond_mask:4; int type; /** GLSL_TYPE_* from GLSL IR (enum glsl_base_type) */ /** Register index should be offset by the integer in this reg. 
*/ st_src_reg *reladdr; + st_src_reg *reladdr2; + bool has_index2; unsigned array_id; }; @@ -207,9 +219,9 @@ st_src_reg::st_src_reg(st_dst_reg reg) this->swizzle = SWIZZLE_XYZW; this->negate = 0; this->reladdr = reg.reladdr; - this->index2D = 0; - this->reladdr2 = NULL; - this->has_index2 = false; + this->index2D = reg.index2D; + this->reladdr2 = reg.reladdr2; + this->has_index2 = reg.has_index2; this->double_reg2 = false; this->array_id = reg.array_id; } @@ -222,6 +234,9 @@ st_dst_reg::st_dst_reg(st_src_reg reg) this->writemask = WRITEMASK_XYZW; this->cond_mask = COND_TR; this->reladdr = reg.reladdr; + this->index2D = reg.index2D; + this->reladdr2 = reg.reladdr2; + this->has_index2 = reg.has_index2; this->array_id = reg.array_id; } @@ -551,8 +566,8 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op, * reg directly for one of the regs, and preload the other reladdr * sources into temps. */ - num_reladdr += dst.reladdr != NULL; - num_reladdr += dst1.reladdr != NULL; + num_reladdr += dst.reladdr != NULL || dst.reladdr2; + num_reladdr += dst1.reladdr != NULL || dst1.reladdr2; num_reladdr += src0.reladdr != NULL || src0.reladdr2 != NULL; num_reladdr += src1.reladdr != NULL || src1.reladdr2 != NULL; num_reladdr += src2.reladdr != NULL || src2.reladdr2 != NULL; @@ -563,8 +578,11 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op, reladdr_to_temp(ir, &src1, &num_reladdr); reladdr_to_temp(ir, &src0, &num_reladdr); - if (dst.reladdr) { - emit_arl(ir, address_reg, *dst.reladdr); + if (dst.reladdr || dst.reladdr2) { + if (dst.reladdr) + emit_arl(ir, address_reg, *dst.reladdr); + if (dst.reladdr2) + emit_arl(ir, address_reg2, *dst.reladdr2); num_reladdr--; } if (dst1.reladdr) { @@ -590,7 +608,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op, inst->function = NULL; /* Update indirect addressing status used by TGSI */ - if (dst.reladdr) { + if (dst.reladdr || dst.reladdr2) { switch(dst.file) { case PROGRAM_STATE_VAR: case PROGRAM_CONSTANT: @@ -797,7 +815,7 @@ glsl_to_tgsi_visitor::get_opcode(ir_instruction *ir, unsigned op, case TGSI_OPCODE_##c: \ if (type == GLSL_TYPE_DOUBLE) \ op = TGSI_OPCODE_##d; \ - else if (type == GLSL_TYPE_INT) \ + else if (type == GLSL_TYPE_INT || type == GLSL_TYPE_SUBROUTINE) \ op = TGSI_OPCODE_##i; \ else if (type == GLSL_TYPE_UINT) \ op = TGSI_OPCODE_##u; \ @@ -1090,6 +1108,7 @@ type_size(const struct glsl_type *type) return size; case GLSL_TYPE_SAMPLER: case GLSL_TYPE_IMAGE: + case GLSL_TYPE_SUBROUTINE: /* Samplers take up one slot in UNIFORMS[], but they're baked in * at link time. 
*/ @@ -1470,6 +1489,9 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir) result_src = op[0]; } break; + case ir_unop_subroutine_to_int: + emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]); + break; case ir_unop_abs: emit_asm(ir, TGSI_OPCODE_ABS, result_dst, op[0]); break; @@ -2243,7 +2265,10 @@ is_inout_array(unsigned stage, ir_variable *var, bool *is_2d) *is_2d = false; - if (stage == MESA_SHADER_GEOMETRY && var->data.mode == ir_var_shader_in) { + if (((stage == MESA_SHADER_GEOMETRY && var->data.mode == ir_var_shader_in) || + (stage == MESA_SHADER_TESS_EVAL && var->data.mode == ir_var_shader_in) || + stage == MESA_SHADER_TESS_CTRL) && + !var->data.patch) { if (!var->type->is_array()) return false; /* a system value probably */ @@ -2355,7 +2380,8 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir) static void shrink_array_declarations(struct array_decl *arrays, unsigned count, - GLbitfield64 usage_mask) + GLbitfield64 usage_mask, + GLbitfield patch_usage_mask) { unsigned i, j; @@ -2367,8 +2393,15 @@ shrink_array_declarations(struct array_decl *arrays, unsigned count, /* Shrink the beginning. */ for (j = 0; j < decl->array_size; j++) { - if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j)) - break; + if (decl->mesa_index >= VARYING_SLOT_PATCH0) { + if (patch_usage_mask & + BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j)) + break; + } + else { + if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j)) + break; + } decl->mesa_index++; decl->array_size--; @@ -2377,8 +2410,15 @@ shrink_array_declarations(struct array_decl *arrays, unsigned count, /* Shrink the end. */ for (j = decl->array_size-1; j >= 0; j--) { - if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j)) - break; + if (decl->mesa_index >= VARYING_SLOT_PATCH0) { + if (patch_usage_mask & + BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j)) + break; + } + else { + if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j)) + break; + } decl->array_size--; } @@ -2391,22 +2431,34 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir) ir_constant *index; st_src_reg src; int element_size = type_size(ir->type); - bool is_2D_input; + bool is_2D = false; index = ir->array_index->constant_expression_value(); ir->array->accept(this); src = this->result; - is_2D_input = this->prog->Target == GL_GEOMETRY_PROGRAM_NV && - src.file == PROGRAM_INPUT && - ir->array->ir_type != ir_type_dereference_array; + if (ir->array->ir_type != ir_type_dereference_array) { + switch (this->prog->Target) { + case GL_TESS_CONTROL_PROGRAM_NV: + is_2D = (src.file == PROGRAM_INPUT || src.file == PROGRAM_OUTPUT) && + !ir->variable_referenced()->data.patch; + break; + case GL_TESS_EVALUATION_PROGRAM_NV: + is_2D = src.file == PROGRAM_INPUT && + !ir->variable_referenced()->data.patch; + break; + case GL_GEOMETRY_PROGRAM_NV: + is_2D = src.file == PROGRAM_INPUT; + break; + } + } - if (is_2D_input) + if (is_2D) element_size = 1; if (index) { - if (is_2D_input) { + if (is_2D) { src.index2D = index->value.i[0]; src.has_index2 = true; } else @@ -2433,7 +2485,7 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir) /* If there was already a relative address register involved, add the * new and the old together to get the new offset. */ - if (!is_2D_input && src.reladdr != NULL) { + if (!is_2D && src.reladdr != NULL) { st_src_reg accum_reg = get_temp(native_integers ? 
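shrink_array_declarations above now takes a second bitmask so that declarations covering per-patch slots (at or above VARYING_SLOT_PATCH0) are trimmed against the patch usage masks instead of the ordinary one. A small self-contained sketch of that trim-both-ends logic, with an invented PATCH_BASE and declaration struct:

   /* Illustrative only: PATCH_BASE and the decl struct are invented. */
   #include <stdint.h>

   #define PATCH_BASE 64u

   struct decl_sketch { unsigned first_slot; unsigned size; };

   static int
   slot_used(unsigned slot, uint64_t usage, uint32_t patch_usage)
   {
      if (slot >= PATCH_BASE)
         return slot - PATCH_BASE < 32 &&
                ((patch_usage >> (slot - PATCH_BASE)) & 1);
      return (usage >> slot) & 1;
   }

   void
   shrink_decl(struct decl_sketch *d, uint64_t usage, uint32_t patch_usage)
   {
      /* Shrink the beginning. */
      while (d->size && !slot_used(d->first_slot, usage, patch_usage)) {
         d->first_slot++;
         d->size--;
      }
      /* Shrink the end. */
      while (d->size &&
             !slot_used(d->first_slot + d->size - 1, usage, patch_usage))
         d->size--;
   }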
glsl_type::int_type : glsl_type::float_type); @@ -2443,7 +2495,7 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir) index_reg = accum_reg; } - if (is_2D_input) { + if (is_2D) { src.reladdr2 = ralloc(mem_ctx, st_src_reg); memcpy(src.reladdr2, &index_reg, sizeof(index_reg)); src.index2D = 0; @@ -3430,7 +3482,10 @@ glsl_to_tgsi_visitor::visit(ir_end_primitive *ir) void glsl_to_tgsi_visitor::visit(ir_barrier *ir) { - unreachable("Not implemented!"); + assert(this->prog->Target == GL_TESS_CONTROL_PROGRAM_NV || + this->prog->Target == GL_COMPUTE_PROGRAM_NV); + + emit_asm(ir, TGSI_OPCODE_BARRIER); } glsl_to_tgsi_visitor::glsl_to_tgsi_visitor() @@ -3553,7 +3608,7 @@ glsl_to_tgsi_visitor::simplify_cmp(void) { int tempWritesSize = 0; unsigned *tempWrites = NULL; - unsigned outputWrites[MAX_PROGRAM_OUTPUTS]; + unsigned outputWrites[VARYING_SLOT_TESS_MAX]; memset(outputWrites, 0, sizeof(outputWrites)); @@ -3561,8 +3616,8 @@ glsl_to_tgsi_visitor::simplify_cmp(void) unsigned prevWriteMask = 0; /* Give up if we encounter relative addressing or flow control. */ - if (inst->dst[0].reladdr || - inst->dst[1].reladdr || + if (inst->dst[0].reladdr || inst->dst[0].reladdr2 || + inst->dst[1].reladdr || inst->dst[1].reladdr2 || tgsi_get_opcode_info(inst->op)->is_branch || inst->op == TGSI_OPCODE_BGNSUB || inst->op == TGSI_OPCODE_CONT || @@ -3573,7 +3628,7 @@ glsl_to_tgsi_visitor::simplify_cmp(void) } if (inst->dst[0].file == PROGRAM_OUTPUT) { - assert(inst->dst[0].index < MAX_PROGRAM_OUTPUTS); + assert(inst->dst[0].index < (signed)ARRAY_SIZE(outputWrites)); prevWriteMask = outputWrites[inst->dst[0].index]; outputWrites[inst->dst[0].index] |= inst->dst[0].writemask; } else if (inst->dst[0].file == PROGRAM_TEMPORARY) { @@ -3940,6 +3995,7 @@ glsl_to_tgsi_visitor::copy_propagate(void) !(inst->dst[0].file == inst->src[0].file && inst->dst[0].index == inst->src[0].index) && !inst->dst[0].reladdr && + !inst->dst[0].reladdr2 && !inst->saturate && inst->src[0].file != PROGRAM_ARRAY && !inst->src[0].reladdr && @@ -4527,6 +4583,14 @@ const unsigned _mesa_sysval_to_semantic[SYSTEM_VALUE_MAX] = { TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS, TGSI_SEMANTIC_SAMPLEMASK, + + /* Tessellation shaders + */ + TGSI_SEMANTIC_TESSCOORD, + TGSI_SEMANTIC_VERTICESIN, + TGSI_SEMANTIC_PRIMID, + TGSI_SEMANTIC_TESSOUTER, + TGSI_SEMANTIC_TESSINNER, }; /** @@ -4651,6 +4715,9 @@ dst_register(struct st_translate *t, gl_register_file file, unsigned index, if (!array_id) { if (t->procType == TGSI_PROCESSOR_FRAGMENT) assert(index < FRAG_RESULT_MAX); + else if (t->procType == TGSI_PROCESSOR_TESS_CTRL || + t->procType == TGSI_PROCESSOR_TESS_EVAL) + assert(index < VARYING_SLOT_TESS_MAX); else assert(index < VARYING_SLOT_MAX); @@ -4790,6 +4857,14 @@ translate_dst(struct st_translate *t, dst = ureg_dst_indirect(dst, ureg_src(t->address[0])); } + if (dst_reg->has_index2) { + if (dst_reg->reladdr2) + dst = ureg_dst_dimension_indirect(dst, ureg_src(t->address[1]), + dst_reg->index2D); + else + dst = ureg_dst_dimension(dst, dst_reg->index2D); + } + return dst; } @@ -5271,6 +5346,8 @@ st_translate_program( TGSI_SEMANTIC_VERTEXID_NOBASE); assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_BASE_VERTEX] == TGSI_SEMANTIC_BASEVERTEX); + assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_TESS_COORD] == + TGSI_SEMANTIC_TESSCOORD); t = CALLOC_STRUCT(st_translate); if (!t) { @@ -5313,6 +5390,8 @@ st_translate_program( } break; case TGSI_PROCESSOR_GEOMETRY: + case TGSI_PROCESSOR_TESS_EVAL: + case TGSI_PROCESSOR_TESS_CTRL: for (i = 0; i < numInputs; i++) { unsigned array_id 
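The glsl_to_tgsi changes above thread a second register dimension (index2D / reladdr2 / has_index2) through sources, destinations and the CMP and copy-propagation passes, so per-vertex tessellation and geometry I/O can be addressed as input[vertex][attribute], with either dimension optionally offset by an address register. A toy model of that two-level addressing, with invented types:

   /* Illustrative only: the dim struct and resolve() are not the Mesa types. */
   #include <stdio.h>

   struct dim { int index; const int *reladdr; /* NULL when absolute */ };

   static int
   resolve(struct dim d)
   {
      return d.index + (d.reladdr ? *d.reladdr : 0);
   }

   int main(void)
   {
      float input[3][8] = {{0}};         /* input[vertex][attribute] */
      int vtx = 2;
      struct dim vertex = { 0, &vtx };   /* index2D + reladdr2 analogue */
      struct dim attrib = { 5, NULL };   /* index + reladdr analogue */

      input[resolve(vertex)][resolve(attrib)] = 1.0f;
      printf("%g\n", input[2][5]);       /* prints 1 */
      return 0;
   }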
= 0; unsigned array_size; @@ -5347,6 +5426,8 @@ st_translate_program( case TGSI_PROCESSOR_FRAGMENT: break; case TGSI_PROCESSOR_GEOMETRY: + case TGSI_PROCESSOR_TESS_EVAL: + case TGSI_PROCESSOR_TESS_CTRL: case TGSI_PROCESSOR_VERTEX: for (i = 0; i < numOutputs; i++) { unsigned array_id = 0; @@ -5461,6 +5542,7 @@ st_translate_program( struct pipe_screen *pscreen = st->pipe->screen; assert(procType == TGSI_PROCESSOR_VERTEX); assert(pscreen->get_shader_param(pscreen, PIPE_SHADER_VERTEX, PIPE_SHADER_CAP_INTEGERS)); + (void) pscreen; if (!ctx->Const.NativeIntegers) { struct ureg_dst temp = ureg_DECL_local_temporary(t->ureg); ureg_U2F( t->ureg, ureg_writemask(temp, TGSI_WRITEMASK_X), t->systemValues[i]); @@ -5611,25 +5693,6 @@ out: /* ----------------------------- End TGSI code ------------------------------ */ -static unsigned -shader_stage_to_ptarget(gl_shader_stage stage) -{ - switch (stage) { - case MESA_SHADER_VERTEX: - return PIPE_SHADER_VERTEX; - case MESA_SHADER_FRAGMENT: - return PIPE_SHADER_FRAGMENT; - case MESA_SHADER_GEOMETRY: - return PIPE_SHADER_GEOMETRY; - case MESA_SHADER_COMPUTE: - return PIPE_SHADER_COMPUTE; - } - - assert(!"should not be reached"); - return PIPE_SHADER_VERTEX; -} - - /** * Convert a shader's GLSL IR into a Mesa gl_program, although without * generating Mesa IR. @@ -5646,7 +5709,7 @@ get_mesa_program(struct gl_context *ctx, struct gl_shader_compiler_options *options = &ctx->Const.ShaderCompilerOptions[_mesa_shader_enum_to_shader_stage(shader->Type)]; struct pipe_screen *pscreen = ctx->st->pipe->screen; - unsigned ptarget = shader_stage_to_ptarget(shader->Stage); + unsigned ptarget = st_shader_stage_to_ptarget(shader->Stage); validate_ir_tree(shader->ir); @@ -5673,7 +5736,7 @@ get_mesa_program(struct gl_context *ctx, prog->Parameters); /* Remove reads from output registers. */ - lower_output_reads(shader->ir); + lower_output_reads(shader->Stage, shader->ir); /* Emit intermediate IR for main(). */ visit_exec_list(shader->ir, v); @@ -5721,7 +5784,11 @@ get_mesa_program(struct gl_context *ctx, /* Perform optimizations on the instructions in the glsl_to_tgsi_visitor. */ v->simplify_cmp(); - v->copy_propagate(); + + if (shader->Type != GL_TESS_CONTROL_SHADER && + shader->Type != GL_TESS_EVALUATION_SHADER) + v->copy_propagate(); + while (v->eliminate_dead_code()); v->merge_two_dsts(); @@ -5745,9 +5812,9 @@ get_mesa_program(struct gl_context *ctx, do_set_program_inouts(shader->ir, prog, shader->Stage); shrink_array_declarations(v->input_arrays, v->num_input_arrays, - prog->InputsRead); + prog->InputsRead, prog->PatchInputsRead); shrink_array_declarations(v->output_arrays, v->num_output_arrays, - prog->OutputsWritten); + prog->OutputsWritten, prog->PatchOutputsWritten); count_resources(v, prog); /* This must be done before the uniform storage is associated. 
*/ @@ -5776,6 +5843,8 @@ get_mesa_program(struct gl_context *ctx, struct st_vertex_program *stvp; struct st_fragment_program *stfp; struct st_geometry_program *stgp; + struct st_tessctrl_program *sttcp; + struct st_tesseval_program *sttep; switch (shader->Type) { case GL_VERTEX_SHADER: @@ -5790,6 +5859,14 @@ get_mesa_program(struct gl_context *ctx, stgp = (struct st_geometry_program *)prog; stgp->glsl_to_tgsi = v; break; + case GL_TESS_CONTROL_SHADER: + sttcp = (struct st_tessctrl_program *)prog; + sttcp->glsl_to_tgsi = v; + break; + case GL_TESS_EVALUATION_SHADER: + sttep = (struct st_tesseval_program *)prog; + sttep->glsl_to_tgsi = v; + break; default: assert(!"should not be reached"); return NULL; @@ -5800,6 +5877,71 @@ get_mesa_program(struct gl_context *ctx, extern "C" { +static void +st_dump_program_for_shader_db(struct gl_context *ctx, + struct gl_shader_program *prog) +{ + /* Dump only successfully compiled and linked shaders to the specified + * file. This is for shader-db. + * + * These options allow some pre-processing of shaders while dumping, + * because some apps have ill-formed shaders. + */ + const char *dump_filename = os_get_option("ST_DUMP_SHADERS"); + const char *insert_directives = os_get_option("ST_DUMP_INSERT"); + + if (dump_filename && prog->Name != 0) { + FILE *f = fopen(dump_filename, "a"); + + if (f) { + for (unsigned i = 0; i < prog->NumShaders; i++) { + const struct gl_shader *sh = prog->Shaders[i]; + const char *source; + bool skip_version = false; + + if (!sh) + continue; + + source = sh->Source; + + /* This string mustn't be changed. shader-db uses it to find + * where the shader begins. + */ + fprintf(f, "GLSL %s shader %d source for linked program %d:\n", + _mesa_shader_stage_to_string(sh->Stage), + i, prog->Name); + + /* Dump the forced version if set. */ + if (ctx->Const.ForceGLSLVersion) { + fprintf(f, "#version %i\n", ctx->Const.ForceGLSLVersion); + skip_version = true; + } + + /* Insert directives (optional). */ + if (insert_directives) { + if (!ctx->Const.ForceGLSLVersion && prog->Version) + fprintf(f, "#version %i\n", prog->Version); + fprintf(f, "%s\n", insert_directives); + skip_version = true; + } + + if (skip_version && strncmp(source, "#version ", 9) == 0) { + const char *next_line = strstr(source, "\n"); + + if (next_line) + source = next_line + 1; + else + continue; + } + + fprintf(f, "%s", source); + fprintf(f, "\n"); + } + fclose(f); + } + } +} + /** * Link a shader. 
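st_dump_program_for_shader_db above appends the GLSL source of every successfully linked shader to the file named by ST_DUMP_SHADERS, optionally forcing a #version line (and skipping the shader's own) so that shader-db can recompile ill-formed sources. A simplified stand-in for that append-and-skip-version logic, using getenv rather than the os_get_option helper:

   /* Illustrative only: a simplified stand-in, not the state-tracker code. */
   #include <stdio.h>
   #include <stdlib.h>
   #include <string.h>

   void dump_source(const char *source, int forced_version)
   {
      const char *path = getenv("ST_DUMP_SHADERS");
      FILE *f = path ? fopen(path, "a") : NULL;
      if (!f)
         return;

      if (forced_version) {
         fprintf(f, "#version %d\n", forced_version);
         if (strncmp(source, "#version ", 9) == 0) {
            const char *next = strchr(source, '\n');
            source = next ? next + 1 : "";
         }
      }
      fprintf(f, "%s\n", source);
      fclose(f);
   }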
* Called via ctx->Driver.LinkShader() @@ -5821,7 +5963,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) gl_shader_stage stage = _mesa_shader_enum_to_shader_stage(prog->_LinkedShaders[i]->Type); const struct gl_shader_compiler_options *options = &ctx->Const.ShaderCompilerOptions[stage]; - unsigned ptarget = shader_stage_to_ptarget(stage); + unsigned ptarget = st_shader_stage_to_ptarget(stage); bool have_dround = pscreen->get_shader_param(pscreen, ptarget, PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED); bool have_dfrexp = pscreen->get_shader_param(pscreen, ptarget, @@ -5832,7 +5974,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) */ if (options->EmitNoIndirectInput || options->EmitNoIndirectOutput || options->EmitNoIndirectTemp || options->EmitNoIndirectUniform) { - lower_variable_index_to_cond_assign(ir, + lower_variable_index_to_cond_assign(prog->_LinkedShaders[i]->Stage, ir, options->EmitNoIndirectInput, options->EmitNoIndirectOutput, options->EmitNoIndirectTemp, @@ -5920,6 +6062,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) _mesa_reference_program(ctx, &linked_prog, NULL); } + st_dump_program_for_shader_db(ctx, prog); return GL_TRUE; } diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c index a2dee6298fa..2e2c8ffaed9 100644 --- a/src/mesa/state_tracker/st_manager.c +++ b/src/mesa/state_tracker/st_manager.c @@ -368,6 +368,7 @@ st_visual_to_context_mode(const struct st_visual *visual, mode->rgbBits = mode->redBits + mode->greenBits + mode->blueBits + mode->alphaBits; + mode->sRGBCapable = util_format_is_srgb(visual->color_format); } if (visual->depth_stencil_format != PIPE_FORMAT_NONE) { diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index fa792bc349b..e62dd7aab80 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -163,6 +163,68 @@ st_release_gp_variants(struct st_context *st, struct st_geometry_program *stgp) } +/** + * Delete a tessellation control program variant. Note the caller must unlink + * the variant from the linked list. + */ +static void +delete_tcp_variant(struct st_context *st, struct st_tcp_variant *tcpv) +{ + if (tcpv->driver_shader) + cso_delete_tessctrl_shader(st->cso_context, tcpv->driver_shader); + + free(tcpv); +} + + +/** + * Free all variants of a tessellation control program. + */ +void +st_release_tcp_variants(struct st_context *st, struct st_tessctrl_program *sttcp) +{ + struct st_tcp_variant *tcpv; + + for (tcpv = sttcp->variants; tcpv; ) { + struct st_tcp_variant *next = tcpv->next; + delete_tcp_variant(st, tcpv); + tcpv = next; + } + + sttcp->variants = NULL; +} + + +/** + * Delete a tessellation evaluation program variant. Note the caller must + * unlink the variant from the linked list. + */ +static void +delete_tep_variant(struct st_context *st, struct st_tep_variant *tepv) +{ + if (tepv->driver_shader) + cso_delete_tesseval_shader(st->cso_context, tepv->driver_shader); + + free(tepv); +} + + +/** + * Free all variants of a tessellation evaluation program. 
+ */ +void +st_release_tep_variants(struct st_context *st, struct st_tesseval_program *sttep) +{ + struct st_tep_variant *tepv; + + for (tepv = sttep->variants; tepv; ) { + struct st_tep_variant *next = tepv->next; + delete_tep_variant(st, tepv); + tepv = next; + } + + sttep->variants = NULL; +} /** @@ -870,61 +932,52 @@ st_get_fp_variant(struct st_context *st, /** - * Translate a geometry program to create a new variant. + * Translate a program. This is common code for geometry and tessellation + * shaders. */ -static struct st_gp_variant * -st_translate_geometry_program(struct st_context *st, - struct st_geometry_program *stgp, - const struct st_gp_variant_key *key) +static void +st_translate_program_common(struct st_context *st, + struct gl_program *prog, + struct glsl_to_tgsi_visitor *glsl_to_tgsi, + struct ureg_program *ureg, + unsigned tgsi_processor, + struct pipe_shader_state *out_state) { - GLuint inputSlotToAttr[VARYING_SLOT_MAX]; - GLuint inputMapping[VARYING_SLOT_MAX]; - GLuint outputSlotToAttr[VARYING_SLOT_MAX]; - GLuint outputMapping[VARYING_SLOT_MAX]; - struct pipe_context *pipe = st->pipe; + GLuint inputSlotToAttr[VARYING_SLOT_TESS_MAX]; + GLuint inputMapping[VARYING_SLOT_TESS_MAX]; + GLuint outputSlotToAttr[VARYING_SLOT_TESS_MAX]; + GLuint outputMapping[VARYING_SLOT_TESS_MAX]; GLuint attr; - uint gs_num_inputs = 0; - ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS]; + uint num_inputs = 0; - ubyte gs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; - ubyte gs_output_semantic_index[PIPE_MAX_SHADER_OUTPUTS]; - uint gs_num_outputs = 0; + ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; + ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS]; + uint num_outputs = 0; GLint i; - struct ureg_program *ureg; - struct pipe_shader_state state = {0}; - struct st_gp_variant *gpv; - - gpv = CALLOC_STRUCT(st_gp_variant); - if (!gpv) - return NULL; - - ureg = ureg_create_with_screen(TGSI_PROCESSOR_GEOMETRY, st->pipe->screen); - if (ureg == NULL) { - free(gpv); - return NULL; - } memset(inputSlotToAttr, 0, sizeof(inputSlotToAttr)); memset(inputMapping, 0, sizeof(inputMapping)); memset(outputSlotToAttr, 0, sizeof(outputSlotToAttr)); memset(outputMapping, 0, sizeof(outputMapping)); + memset(out_state, 0, sizeof(*out_state)); /* * Convert Mesa program inputs to TGSI input register semantics. */ for (attr = 0; attr < VARYING_SLOT_MAX; attr++) { - if ((stgp->Base.Base.InputsRead & BITFIELD64_BIT(attr)) != 0) { - const GLuint slot = gs_num_inputs++; + if ((prog->InputsRead & BITFIELD64_BIT(attr)) != 0) { + const GLuint slot = num_inputs++; inputMapping[attr] = slot; inputSlotToAttr[slot] = attr; switch (attr) { case VARYING_SLOT_PRIMITIVE_ID: + assert(tgsi_processor == TGSI_PROCESSOR_GEOMETRY); input_semantic_name[slot] = TGSI_SEMANTIC_PRIMID; input_semantic_index[slot] = 0; break; @@ -976,19 +1029,33 @@ st_translate_geometry_program(struct st_context *st, /* fall through */ case VARYING_SLOT_VAR0: default: - assert(attr >= VARYING_SLOT_VAR0 && attr < VARYING_SLOT_MAX); + assert(attr >= VARYING_SLOT_VAR0 || + (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7)); input_semantic_name[slot] = TGSI_SEMANTIC_GENERIC; input_semantic_index[slot] = st_get_generic_varying_index(st, attr); - break; + break; } } } + /* Also add patch inputs. 
*/ + for (attr = 0; attr < 32; attr++) { + if (prog->PatchInputsRead & (1 << attr)) { + GLuint slot = num_inputs++; + GLuint patch_attr = VARYING_SLOT_PATCH0 + attr; + + inputMapping[patch_attr] = slot; + inputSlotToAttr[slot] = patch_attr; + input_semantic_name[slot] = TGSI_SEMANTIC_PATCH; + input_semantic_index[slot] = attr; + } + } + /* initialize output semantics to defaults */ for (i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) { - gs_output_semantic_name[i] = TGSI_SEMANTIC_GENERIC; - gs_output_semantic_index[i] = 0; + output_semantic_name[i] = TGSI_SEMANTIC_GENERIC; + output_semantic_index[i] = 0; } /* @@ -996,8 +1063,8 @@ st_translate_geometry_program(struct st_context *st, * mapping and the semantic information for each output. */ for (attr = 0; attr < VARYING_SLOT_MAX; attr++) { - if (stgp->Base.Base.OutputsWritten & BITFIELD64_BIT(attr)) { - GLuint slot = gs_num_outputs++; + if (prog->OutputsWritten & BITFIELD64_BIT(attr)) { + GLuint slot = num_outputs++; outputMapping[attr] = slot; outputSlotToAttr[slot] = attr; @@ -1005,56 +1072,64 @@ st_translate_geometry_program(struct st_context *st, switch (attr) { case VARYING_SLOT_POS: assert(slot == 0); - gs_output_semantic_name[slot] = TGSI_SEMANTIC_POSITION; - gs_output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_POSITION; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_COL0: - gs_output_semantic_name[slot] = TGSI_SEMANTIC_COLOR; - gs_output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_COLOR; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_COL1: - gs_output_semantic_name[slot] = TGSI_SEMANTIC_COLOR; - gs_output_semantic_index[slot] = 1; + output_semantic_name[slot] = TGSI_SEMANTIC_COLOR; + output_semantic_index[slot] = 1; break; case VARYING_SLOT_BFC0: - gs_output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR; - gs_output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_BFC1: - gs_output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR; - gs_output_semantic_index[slot] = 1; + output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR; + output_semantic_index[slot] = 1; break; case VARYING_SLOT_FOGC: - gs_output_semantic_name[slot] = TGSI_SEMANTIC_FOG; - gs_output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_FOG; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_PSIZ: - gs_output_semantic_name[slot] = TGSI_SEMANTIC_PSIZE; - gs_output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_PSIZE; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_CLIP_VERTEX: - gs_output_semantic_name[slot] = TGSI_SEMANTIC_CLIPVERTEX; - gs_output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_CLIPVERTEX; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_CLIP_DIST0: - gs_output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST; - gs_output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_CLIP_DIST1: - gs_output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST; - gs_output_semantic_index[slot] = 1; + output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST; + output_semantic_index[slot] = 1; break; case VARYING_SLOT_LAYER: - gs_output_semantic_name[slot] = TGSI_SEMANTIC_LAYER; - gs_output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_LAYER; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_PRIMITIVE_ID: - 
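st_translate_program_common above packs every varying slot set in InputsRead into consecutive TGSI input slots, records both directions of the mapping, and then appends per-patch inputs from the separate 32-bit PatchInputsRead mask using TGSI_SEMANTIC_PATCH. A compact sketch of that packing, with invented sizes and a stand-in "PATCH0" at bit 64:

   /* Illustrative only: sizes and the PATCH0 offset are invented. */
   #include <stdint.h>

   #define MAX_SLOTS 96

   struct slot_map {
      unsigned num_inputs;
      unsigned attr_to_slot[MAX_SLOTS];
      unsigned slot_to_attr[MAX_SLOTS];
   };

   void
   build_input_mapping(struct slot_map *m,
                       uint64_t inputs_read, uint32_t patch_inputs_read)
   {
      m->num_inputs = 0;

      for (unsigned attr = 0; attr < 64; attr++) {
         if (inputs_read & ((uint64_t)1 << attr)) {
            unsigned slot = m->num_inputs++;
            m->attr_to_slot[attr] = slot;
            m->slot_to_attr[slot] = attr;
         }
      }
      for (unsigned p = 0; p < 32; p++) {      /* per-patch varyings */
         if (patch_inputs_read & (1u << p)) {
            unsigned slot = m->num_inputs++;
            m->attr_to_slot[64 + p] = slot;    /* "PATCH0 + p" analogue */
            m->slot_to_attr[slot] = 64 + p;
         }
      }
   }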
gs_output_semantic_name[slot] = TGSI_SEMANTIC_PRIMID; - gs_output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_PRIMID; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_VIEWPORT: - gs_output_semantic_name[slot] = TGSI_SEMANTIC_VIEWPORT_INDEX; - gs_output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_VIEWPORT_INDEX; + output_semantic_index[slot] = 0; + break; + case VARYING_SLOT_TESS_LEVEL_OUTER: + output_semantic_name[slot] = TGSI_SEMANTIC_TESSOUTER; + output_semantic_index[slot] = 0; + break; + case VARYING_SLOT_TESS_LEVEL_INNER: + output_semantic_name[slot] = TGSI_SEMANTIC_TESSINNER; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_TEX0: case VARYING_SLOT_TEX1: @@ -1065,36 +1140,44 @@ st_translate_geometry_program(struct st_context *st, case VARYING_SLOT_TEX6: case VARYING_SLOT_TEX7: if (st->needs_texcoord_semantic) { - gs_output_semantic_name[slot] = TGSI_SEMANTIC_TEXCOORD; - gs_output_semantic_index[slot] = attr - VARYING_SLOT_TEX0; + output_semantic_name[slot] = TGSI_SEMANTIC_TEXCOORD; + output_semantic_index[slot] = attr - VARYING_SLOT_TEX0; break; } /* fall through */ case VARYING_SLOT_VAR0: default: - assert(slot < ARRAY_SIZE(gs_output_semantic_name)); - assert(attr >= VARYING_SLOT_VAR0); - gs_output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC; - gs_output_semantic_index[slot] = + assert(slot < ARRAY_SIZE(output_semantic_name)); + assert(attr >= VARYING_SLOT_VAR0 || + (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7)); + output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC; + output_semantic_index[slot] = st_get_generic_varying_index(st, attr); - break; + break; } } } - ureg_property(ureg, TGSI_PROPERTY_GS_INPUT_PRIM, stgp->Base.InputType); - ureg_property(ureg, TGSI_PROPERTY_GS_OUTPUT_PRIM, stgp->Base.OutputType); - ureg_property(ureg, TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES, - stgp->Base.VerticesOut); - ureg_property(ureg, TGSI_PROPERTY_GS_INVOCATIONS, stgp->Base.Invocations); + /* Also add patch outputs. */ + for (attr = 0; attr < 32; attr++) { + if (prog->PatchOutputsWritten & (1 << attr)) { + GLuint slot = num_outputs++; + GLuint patch_attr = VARYING_SLOT_PATCH0 + attr; + + outputMapping[patch_attr] = slot; + outputSlotToAttr[slot] = patch_attr; + output_semantic_name[slot] = TGSI_SEMANTIC_PATCH; + output_semantic_index[slot] = attr; + } + } st_translate_program(st->ctx, - TGSI_PROCESSOR_GEOMETRY, + tgsi_processor, ureg, - stgp->glsl_to_tgsi, - &stgp->Base.Base, + glsl_to_tgsi, + prog, /* inputs */ - gs_num_inputs, + num_inputs, inputMapping, inputSlotToAttr, input_semantic_name, @@ -1102,30 +1185,64 @@ st_translate_geometry_program(struct st_context *st, NULL, NULL, /* outputs */ - gs_num_outputs, + num_outputs, outputMapping, outputSlotToAttr, - gs_output_semantic_name, - gs_output_semantic_index, + output_semantic_name, + output_semantic_index, FALSE, FALSE); - state.tokens = ureg_get_tokens(ureg, NULL); + out_state->tokens = ureg_get_tokens(ureg, NULL); ureg_destroy(ureg); - st_translate_stream_output_info(stgp->glsl_to_tgsi, + st_translate_stream_output_info(glsl_to_tgsi, outputMapping, - &state.stream_output); + &out_state->stream_output); if ((ST_DEBUG & DEBUG_TGSI) && (ST_DEBUG & DEBUG_MESA)) { - _mesa_print_program(&stgp->Base.Base); + _mesa_print_program(prog); debug_printf("\n"); } if (ST_DEBUG & DEBUG_TGSI) { - tgsi_dump(state.tokens, 0); + tgsi_dump(out_state->tokens, 0); debug_printf("\n"); } +} + + +/** + * Translate a geometry program to create a new variant. 
+ */ +static struct st_gp_variant * +st_translate_geometry_program(struct st_context *st, + struct st_geometry_program *stgp, + const struct st_gp_variant_key *key) +{ + struct pipe_context *pipe = st->pipe; + struct ureg_program *ureg; + struct st_gp_variant *gpv; + struct pipe_shader_state state; + + ureg = ureg_create_with_screen(TGSI_PROCESSOR_GEOMETRY, st->pipe->screen); + if (ureg == NULL) + return NULL; + + ureg_property(ureg, TGSI_PROPERTY_GS_INPUT_PRIM, stgp->Base.InputType); + ureg_property(ureg, TGSI_PROPERTY_GS_OUTPUT_PRIM, stgp->Base.OutputType); + ureg_property(ureg, TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES, + stgp->Base.VerticesOut); + ureg_property(ureg, TGSI_PROPERTY_GS_INVOCATIONS, stgp->Base.Invocations); + + st_translate_program_common(st, &stgp->Base.Base, stgp->glsl_to_tgsi, ureg, + TGSI_PROCESSOR_GEOMETRY, &state); + + gpv = CALLOC_STRUCT(st_gp_variant); + if (!gpv) { + ureg_free_tokens(state.tokens); + return NULL; + } /* fill in new variant */ gpv->driver_shader = pipe->create_gs_state(pipe, &state); @@ -1168,6 +1285,168 @@ st_get_gp_variant(struct st_context *st, /** + * Translate a tessellation control program to create a new variant. + */ +static struct st_tcp_variant * +st_translate_tessctrl_program(struct st_context *st, + struct st_tessctrl_program *sttcp, + const struct st_tcp_variant_key *key) +{ + struct pipe_context *pipe = st->pipe; + struct ureg_program *ureg; + struct st_tcp_variant *tcpv; + struct pipe_shader_state state; + + ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_CTRL, pipe->screen); + if (ureg == NULL) { + return NULL; + } + + ureg_property(ureg, TGSI_PROPERTY_TCS_VERTICES_OUT, + sttcp->Base.VerticesOut); + + st_translate_program_common(st, &sttcp->Base.Base, sttcp->glsl_to_tgsi, + ureg, TGSI_PROCESSOR_TESS_CTRL, &state); + + tcpv = CALLOC_STRUCT(st_tcp_variant); + if (!tcpv) { + ureg_free_tokens(state.tokens); + return NULL; + } + + /* fill in new variant */ + tcpv->driver_shader = pipe->create_tcs_state(pipe, &state); + tcpv->key = *key; + + ureg_free_tokens(state.tokens); + return tcpv; +} + + +/** + * Get/create tessellation control program variant. + */ +struct st_tcp_variant * +st_get_tcp_variant(struct st_context *st, + struct st_tessctrl_program *sttcp, + const struct st_tcp_variant_key *key) +{ + struct st_tcp_variant *tcpv; + + /* Search for existing variant */ + for (tcpv = sttcp->variants; tcpv; tcpv = tcpv->next) { + if (memcmp(&tcpv->key, key, sizeof(*key)) == 0) { + break; + } + } + + if (!tcpv) { + /* create new */ + tcpv = st_translate_tessctrl_program(st, sttcp, key); + if (tcpv) { + /* insert into list */ + tcpv->next = sttcp->variants; + sttcp->variants = tcpv; + } + } + + return tcpv; +} + + +/** + * Translate a tessellation evaluation program to create a new variant. 
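st_get_tcp_variant (and its geometry and tess-eval siblings) follow the usual state-tracker get-or-create pattern: linear-search the program's variant list for a key match, otherwise translate a new variant and push it onto the head of the list. A generic sketch of that pattern, with a placeholder key type and creation callback:

   /* Illustrative only: key and create() are placeholders. */
   #include <string.h>

   struct key { const void *ctx; };

   struct variant {
      struct key key;
      void *driver_shader;
      struct variant *next;
   };

   struct variant *
   get_variant(struct variant **list, const struct key *key,
               struct variant *(*create)(const struct key *))
   {
      struct variant *v;

      /* Search for an existing variant. */
      for (v = *list; v; v = v->next)
         if (memcmp(&v->key, key, sizeof(*key)) == 0)
            return v;

      /* Create a new one and insert it at the head of the list. */
      v = create(key);
      if (v) {
         v->next = *list;
         *list = v;
      }
      return v;
   }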
+ */ +static struct st_tep_variant * +st_translate_tesseval_program(struct st_context *st, + struct st_tesseval_program *sttep, + const struct st_tep_variant_key *key) +{ + struct pipe_context *pipe = st->pipe; + struct ureg_program *ureg; + struct st_tep_variant *tepv; + struct pipe_shader_state state; + + ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_EVAL, pipe->screen); + if (ureg == NULL) { + return NULL; + } + + if (sttep->Base.PrimitiveMode == GL_ISOLINES) + ureg_property(ureg, TGSI_PROPERTY_TES_PRIM_MODE, GL_LINES); + else + ureg_property(ureg, TGSI_PROPERTY_TES_PRIM_MODE, sttep->Base.PrimitiveMode); + + switch (sttep->Base.Spacing) { + case GL_EQUAL: + ureg_property(ureg, TGSI_PROPERTY_TES_SPACING, PIPE_TESS_SPACING_EQUAL); + break; + case GL_FRACTIONAL_EVEN: + ureg_property(ureg, TGSI_PROPERTY_TES_SPACING, + PIPE_TESS_SPACING_FRACTIONAL_EVEN); + break; + case GL_FRACTIONAL_ODD: + ureg_property(ureg, TGSI_PROPERTY_TES_SPACING, + PIPE_TESS_SPACING_FRACTIONAL_ODD); + break; + default: + assert(0); + } + + ureg_property(ureg, TGSI_PROPERTY_TES_VERTEX_ORDER_CW, + sttep->Base.VertexOrder == GL_CW); + ureg_property(ureg, TGSI_PROPERTY_TES_POINT_MODE, sttep->Base.PointMode); + + st_translate_program_common(st, &sttep->Base.Base, sttep->glsl_to_tgsi, + ureg, TGSI_PROCESSOR_TESS_EVAL, &state); + + tepv = CALLOC_STRUCT(st_tep_variant); + if (!tepv) { + ureg_free_tokens(state.tokens); + return NULL; + } + + /* fill in new variant */ + tepv->driver_shader = pipe->create_tes_state(pipe, &state); + tepv->key = *key; + + ureg_free_tokens(state.tokens); + return tepv; +} + + +/** + * Get/create tessellation evaluation program variant. + */ +struct st_tep_variant * +st_get_tep_variant(struct st_context *st, + struct st_tesseval_program *sttep, + const struct st_tep_variant_key *key) +{ + struct st_tep_variant *tepv; + + /* Search for existing variant */ + for (tepv = sttep->variants; tepv; tepv = tepv->next) { + if (memcmp(&tepv->key, key, sizeof(*key)) == 0) { + break; + } + } + + if (!tepv) { + /* create new */ + tepv = st_translate_tesseval_program(st, sttep, key); + if (tepv) { + /* insert into list */ + tepv->next = sttep->variants; + sttep->variants = tepv; + } + } + + return tepv; +} + + +/** * Vert/Geom/Frag programs have per-context variants. Free all the * variants attached to the given program which match the given context. 
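st_translate_tesseval_program above turns the GL-level tessellation layout (primitive mode, with GL_ISOLINES emitted as GL_LINES, spacing, winding and point mode) into TGSI properties. A trivial sketch of the spacing mapping, using local stand-ins for the GL and PIPE tokens:

   /* Illustrative only: the enum values stand in for the GL/PIPE tokens. */
   enum gl_spacing   { SPACING_EQUAL, SPACING_FRACT_EVEN, SPACING_FRACT_ODD };
   enum pipe_spacing { PIPE_EQUAL, PIPE_FRACT_EVEN, PIPE_FRACT_ODD };

   enum pipe_spacing
   map_spacing(enum gl_spacing s)
   {
      switch (s) {
      case SPACING_EQUAL:      return PIPE_EQUAL;
      case SPACING_FRACT_EVEN: return PIPE_FRACT_EVEN;
      case SPACING_FRACT_ODD:  return PIPE_FRACT_ODD;
      }
      return PIPE_EQUAL;   /* unreachable for valid input */
   }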
*/ @@ -1240,6 +1519,48 @@ destroy_program_variants(struct st_context *st, struct gl_program *program) } } break; + case GL_TESS_CONTROL_PROGRAM_NV: + { + struct st_tessctrl_program *sttcp = + (struct st_tessctrl_program *) program; + struct st_tcp_variant *tcpv, **prevPtr = &sttcp->variants; + + for (tcpv = sttcp->variants; tcpv; ) { + struct st_tcp_variant *next = tcpv->next; + if (tcpv->key.st == st) { + /* unlink from list */ + *prevPtr = next; + /* destroy this variant */ + delete_tcp_variant(st, tcpv); + } + else { + prevPtr = &tcpv->next; + } + tcpv = next; + } + } + break; + case GL_TESS_EVALUATION_PROGRAM_NV: + { + struct st_tesseval_program *sttep = + (struct st_tesseval_program *) program; + struct st_tep_variant *tepv, **prevPtr = &sttep->variants; + + for (tepv = sttep->variants; tepv; ) { + struct st_tep_variant *next = tepv->next; + if (tepv->key.st == st) { + /* unlink from list */ + *prevPtr = next; + /* destroy this variant */ + delete_tep_variant(st, tepv); + } + else { + prevPtr = &tepv->next; + } + tepv = next; + } + } + break; default: _mesa_problem(NULL, "Unexpected program target 0x%x in " "destroy_program_variants_cb()", program->Target); @@ -1276,6 +1597,8 @@ destroy_shader_program_variants_cb(GLuint key, void *data, void *userData) case GL_VERTEX_SHADER: case GL_FRAGMENT_SHADER: case GL_GEOMETRY_SHADER: + case GL_TESS_CONTROL_SHADER: + case GL_TESS_EVALUATION_SHADER: { destroy_program_variants(st, shader->Program); } diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h index bb77eb6ed65..7013993fe38 100644 --- a/src/mesa/state_tracker/st_program.h +++ b/src/mesa/state_tracker/st_program.h @@ -188,7 +188,7 @@ struct st_gp_variant_key */ struct st_gp_variant { - /* Parameters which generated this translated version of a vertex */ + /* Parameters which generated this variant. */ struct st_gp_variant_key key; void *driver_shader; @@ -210,6 +210,76 @@ struct st_geometry_program +/** Tessellation control program variant key */ +struct st_tcp_variant_key +{ + struct st_context *st; /**< variants are per-context */ + /* no other fields yet */ +}; + + +/** + * Tessellation control program variant. + */ +struct st_tcp_variant +{ + /* Parameters which generated this variant. */ + struct st_tcp_variant_key key; + + void *driver_shader; + + struct st_tcp_variant *next; +}; + + +/** + * Derived from Mesa gl_tess_ctrl_program: + */ +struct st_tessctrl_program +{ + struct gl_tess_ctrl_program Base; /**< The Mesa tess ctrl program */ + struct glsl_to_tgsi_visitor* glsl_to_tgsi; + + struct st_tcp_variant *variants; +}; + + + +/** Tessellation evaluation program variant key */ +struct st_tep_variant_key +{ + struct st_context *st; /**< variants are per-context */ + /* no other fields yet */ +}; + + +/** + * Tessellation evaluation program variant. + */ +struct st_tep_variant +{ + /* Parameters which generated this variant. 
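destroy_program_variants above walks each variant list with a pointer to the previous "next" field, so variants belonging to the context being destroyed can be spliced out in place while the rest of the list is preserved. A generic sketch of that unlink-and-destroy loop, with placeholder types:

   /* Illustrative only: node and owner are placeholders. */
   #include <stdlib.h>

   struct node { const void *owner; struct node *next; };

   void
   destroy_for_owner(struct node **list, const void *owner)
   {
      struct node **prev_ptr = list;
      struct node *n = *list;

      while (n) {
         struct node *next = n->next;
         if (n->owner == owner) {
            *prev_ptr = next;     /* unlink from the list */
            free(n);              /* destroy this variant */
         } else {
            prev_ptr = &n->next;
         }
         n = next;
      }
   }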
*/ + struct st_tep_variant_key key; + + void *driver_shader; + + struct st_tep_variant *next; +}; + + +/** + * Derived from Mesa gl_tess_eval_program: + */ +struct st_tesseval_program +{ + struct gl_tess_eval_program Base; /**< The Mesa tess eval program */ + struct glsl_to_tgsi_visitor* glsl_to_tgsi; + + struct st_tep_variant *variants; +}; + + + static inline struct st_fragment_program * st_fragment_program( struct gl_fragment_program *fp ) { @@ -229,6 +299,18 @@ st_geometry_program( struct gl_geometry_program *gp ) return (struct st_geometry_program *)gp; } +static inline struct st_tessctrl_program * +st_tessctrl_program( struct gl_tess_ctrl_program *tcp ) +{ + return (struct st_tessctrl_program *)tcp; +} + +static inline struct st_tesseval_program * +st_tesseval_program( struct gl_tess_eval_program *tep ) +{ + return (struct st_tesseval_program *)tep; +} + static inline void st_reference_vertprog(struct st_context *st, struct st_vertex_program **ptr, @@ -259,6 +341,26 @@ st_reference_fragprog(struct st_context *st, (struct gl_program *) prog); } +static inline void +st_reference_tesscprog(struct st_context *st, + struct st_tessctrl_program **ptr, + struct st_tessctrl_program *prog) +{ + _mesa_reference_program(st->ctx, + (struct gl_program **) ptr, + (struct gl_program *) prog); +} + +static inline void +st_reference_tesseprog(struct st_context *st, + struct st_tesseval_program **ptr, + struct st_tesseval_program *prog) +{ + _mesa_reference_program(st->ctx, + (struct gl_program **) ptr, + (struct gl_program *) prog); +} + /** * This defines mapping from Mesa VARYING_SLOTs to TGSI GENERIC slots. */ @@ -302,6 +404,16 @@ st_get_gp_variant(struct st_context *st, struct st_geometry_program *stgp, const struct st_gp_variant_key *key); +extern struct st_tcp_variant * +st_get_tcp_variant(struct st_context *st, + struct st_tessctrl_program *stgp, + const struct st_tcp_variant_key *key); + +extern struct st_tep_variant * +st_get_tep_variant(struct st_context *st, + struct st_tesseval_program *stgp, + const struct st_tep_variant_key *key); + extern void st_prepare_vertex_program(struct gl_context *ctx, @@ -325,6 +437,14 @@ st_release_gp_variants(struct st_context *st, struct st_geometry_program *stgp); extern void +st_release_tcp_variants(struct st_context *st, + struct st_tessctrl_program *stgp); + +extern void +st_release_tep_variants(struct st_context *st, + struct st_tesseval_program *stgp); + +extern void st_destroy_program_variants(struct st_context *st); diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c index 6beb21e3389..52b094330b9 100644 --- a/src/mesa/state_tracker/st_texture.c +++ b/src/mesa/state_tracker/st_texture.c @@ -462,6 +462,11 @@ st_texture_get_sampler_view(struct st_context *st, return free; } + +/** + * For the given texture object, release any sampler views which belong + * to the calling context. + */ void st_texture_release_sampler_view(struct st_context *st, struct st_texture_object *stObj) @@ -478,6 +483,11 @@ st_texture_release_sampler_view(struct st_context *st, } } + +/** + * Release all sampler views attached to the given texture object, regardless + * of the context. 
+ */ void st_texture_release_all_sampler_views(struct st_context *st, struct st_texture_object *stObj) diff --git a/src/mesa/swrast/s_aaline.c b/src/mesa/swrast/s_aaline.c index f3258e813a6..de5b42b9f6b 100644 --- a/src/mesa/swrast/s_aaline.c +++ b/src/mesa/swrast/s_aaline.c @@ -116,11 +116,11 @@ compute_plane(GLfloat x0, GLfloat y0, GLfloat x1, GLfloat y1, const GLfloat b = pz * py; const GLfloat c = px * px + py * py; const GLfloat d = -(a * x0 + b * y0 + c * z0); - if (a == 0.0 && b == 0.0 && c == 0.0 && d == 0.0) { - plane[0] = 0.0; - plane[1] = 0.0; - plane[2] = 1.0; - plane[3] = 0.0; + if (a == 0.0F && b == 0.0F && c == 0.0F && d == 0.0F) { + plane[0] = 0.0F; + plane[1] = 0.0F; + plane[2] = 1.0F; + plane[3] = 0.0F; } else { plane[0] = a; @@ -135,9 +135,9 @@ compute_plane(GLfloat x0, GLfloat y0, GLfloat x1, GLfloat y1, static inline void constant_plane(GLfloat value, GLfloat plane[4]) { - plane[0] = 0.0; - plane[1] = 0.0; - plane[2] = -1.0; + plane[0] = 0.0F; + plane[1] = 0.0F; + plane[2] = -1.0F; plane[3] = value; } @@ -160,8 +160,8 @@ static inline GLfloat solve_plane_recip(GLfloat x, GLfloat y, const GLfloat plane[4]) { const GLfloat denom = plane[3] + plane[0] * x + plane[1] * y; - if (denom == 0.0) - return 0.0; + if (denom == 0.0F) + return 0.0F; else return -plane[2] / denom; } @@ -374,7 +374,7 @@ segment(struct gl_context *ctx, if (x0 < x1) { xLeft = x0 - line->halfWidth; xRight = x1 + line->halfWidth; - if (line->dy >= 0.0) { + if (line->dy >= 0.0F) { yBot = y0 - 3.0F * line->halfWidth; yTop = y0 + line->halfWidth; } @@ -386,7 +386,7 @@ segment(struct gl_context *ctx, else { xLeft = x1 - line->halfWidth; xRight = x0 + line->halfWidth; - if (line->dy <= 0.0) { + if (line->dy <= 0.0F) { yBot = y1 - 3.0F * line->halfWidth; yTop = y1 + line->halfWidth; } @@ -420,7 +420,7 @@ segment(struct gl_context *ctx, if (y0 < y1) { yBot = y0 - line->halfWidth; yTop = y1 + line->halfWidth; - if (line->dx >= 0.0) { + if (line->dx >= 0.0F) { xLeft = x0 - 3.0F * line->halfWidth; xRight = x0 + line->halfWidth; } @@ -432,7 +432,7 @@ segment(struct gl_context *ctx, else { yBot = y1 - line->halfWidth; yTop = y0 + line->halfWidth; - if (line->dx <= 0.0) { + if (line->dx <= 0.0F) { xLeft = x1 - 3.0F * line->halfWidth; xRight = x1 + line->halfWidth; } diff --git a/src/mesa/swrast/s_aalinetemp.h b/src/mesa/swrast/s_aalinetemp.h index f1d078fd89b..bebb131a5d1 100644 --- a/src/mesa/swrast/s_aalinetemp.h +++ b/src/mesa/swrast/s_aalinetemp.h @@ -44,7 +44,7 @@ NAME(plot)(struct gl_context *ctx, struct LineInfo *line, int ix, int iy) (void) swrast; - if (coverage == 0.0) + if (coverage == 0.0F) return; line->span.end++; @@ -123,7 +123,7 @@ NAME(line)(struct gl_context *ctx, const SWvertex *v0, const SWvertex *v1) ctx->Const.MinLineWidthAA, ctx->Const.MaxLineWidthAA); - if (line.len == 0.0 || IS_INF_OR_NAN(line.len)) + if (line.len == 0.0F || IS_INF_OR_NAN(line.len)) return; INIT_SPAN(line.span, GL_LINE); diff --git a/src/mesa/swrast/s_atifragshader.c b/src/mesa/swrast/s_atifragshader.c index 9e029db25ce..2974deed41b 100644 --- a/src/mesa/swrast/s_atifragshader.c +++ b/src/mesa/swrast/s_atifragshader.c @@ -436,13 +436,13 @@ execute_shader(struct gl_context *ctx, const struct ati_fragment_shader *shader, for (i = 0; i < 3; i++) { dst[optype][i] = (src[optype][2][i] > - 0.5) ? src[optype][0][i] : src[optype][1][i]; + 0.5F) ? src[optype][0][i] : src[optype][1][i]; } } else { dst[optype][3] = (src[optype][2][3] > - 0.5) ? src[optype][0][3] : src[optype][1][3]; + 0.5F) ? 
src[optype][0][3] : src[optype][1][3]; } break; diff --git a/src/mesa/swrast/s_copypix.c b/src/mesa/swrast/s_copypix.c index 68c83e44e12..0dbccc0f61d 100644 --- a/src/mesa/swrast/s_copypix.c +++ b/src/mesa/swrast/s_copypix.c @@ -27,6 +27,7 @@ #include "main/context.h" #include "main/condrender.h" #include "main/macros.h" +#include "main/blit.h" #include "main/pixeltransfer.h" #include "main/imports.h" @@ -51,20 +52,9 @@ regions_overlap(GLint srcx, GLint srcy, GLint width, GLint height, GLfloat zoomX, GLfloat zoomY) { - if (zoomX == 1.0 && zoomY == 1.0) { - /* no zoom */ - if (srcx >= dstx + width || (srcx + width <= dstx)) { - return GL_FALSE; - } - else if (srcy < dsty) { /* this is OK */ - return GL_FALSE; - } - else if (srcy > dsty + height) { - return GL_FALSE; - } - else { - return GL_TRUE; - } + if (zoomX == 1.0F && zoomY == 1.0F) { + return _mesa_regions_overlap(srcx, srcy, srcx + width, srcy + height, + dstx, dsty, dstx + width, dsty + height); } else { /* add one pixel of slop when zooming, just to be safe */ @@ -211,8 +201,8 @@ scale_and_bias_z(struct gl_context *ctx, GLuint width, GLuint i; if (depthMax <= 0xffffff && - ctx->Pixel.DepthScale == 1.0 && - ctx->Pixel.DepthBias == 0.0) { + ctx->Pixel.DepthScale == 1.0F && + ctx->Pixel.DepthBias == 0.0F) { /* no scale or bias and no clamping and no worry of overflow */ const GLfloat depthMaxF = ctx->DrawBuffer->_DepthMaxF; for (i = 0; i < width; i++) { diff --git a/src/mesa/swrast/s_depth.c b/src/mesa/swrast/s_depth.c index 134f897c039..ffadc05a732 100644 --- a/src/mesa/swrast/s_depth.c +++ b/src/mesa/swrast/s_depth.c @@ -419,8 +419,8 @@ _swrast_depth_bounds_test( struct gl_context *ctx, SWspan *span ) struct gl_framebuffer *fb = ctx->DrawBuffer; struct gl_renderbuffer *rb = fb->Attachment[BUFFER_DEPTH].Renderbuffer; GLubyte *zStart; - GLuint zMin = (GLuint) (ctx->Depth.BoundsMin * fb->_DepthMaxF + 0.5F); - GLuint zMax = (GLuint) (ctx->Depth.BoundsMax * fb->_DepthMaxF + 0.5F); + GLuint zMin = (GLuint)((double)ctx->Depth.BoundsMin * 0xffffffff); + GLuint zMax = (GLuint)((double)ctx->Depth.BoundsMax * 0xffffffff); GLubyte *mask = span->array->mask; const GLuint count = span->end; GLuint i; @@ -444,6 +444,16 @@ _swrast_depth_bounds_test( struct gl_context *ctx, SWspan *span ) zBufferVals = (const GLuint *) zStart; } else { + /* Round the bounds to the precision of the zbuffer. 
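The glCopyPixels path above replaces its hand-rolled overlap check with the shared _mesa_regions_overlap helper when no zoom is applied. A sketch of a plain axis-aligned overlap predicate of that general kind (the exact helper semantics are not reproduced here):

   /* Illustrative only: a generic rectangle-overlap test. */
   #include <stdbool.h>

   bool
   regions_overlap(int ax0, int ay0, int ax1, int ay1,
                   int bx0, int by0, int bx1, int by1)
   {
      return ax0 < bx1 && bx0 < ax1 &&   /* overlap in x */
             ay0 < by1 && by0 < ay1;     /* overlap in y */
   }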
*/ + if (rb->Format == MESA_FORMAT_Z_UNORM16) { + zMin = (zMin & 0xffff0000) | (zMin >> 16); + zMax = (zMax & 0xffff0000) | (zMax >> 16); + } else { + /* 24 bits */ + zMin = (zMin & 0xffffff00) | (zMin >> 24); + zMax = (zMax & 0xffffff00) | (zMax >> 24); + } + /* unpack Z values into a temporary array */ if (span->arrayMask & SPAN_XY) { get_z32_values(ctx, rb, count, span->array->x, span->array->y, diff --git a/src/mesa/swrast/s_drawpix.c b/src/mesa/swrast/s_drawpix.c index fb677ee1b16..dc6827ede9f 100644 --- a/src/mesa/swrast/s_drawpix.c +++ b/src/mesa/swrast/s_drawpix.c @@ -264,7 +264,7 @@ draw_stencil_pixels( struct gl_context *ctx, GLint x, GLint y, const struct gl_pixelstore_attrib *unpack, const GLvoid *pixels ) { - const GLboolean zoom = ctx->Pixel.ZoomX != 1.0 || ctx->Pixel.ZoomY != 1.0; + const GLboolean zoom = ctx->Pixel.ZoomX != 1.0F || ctx->Pixel.ZoomY != 1.0F; const GLenum destType = GL_UNSIGNED_BYTE; GLint row; GLubyte *values; @@ -309,8 +309,8 @@ draw_depth_pixels( struct gl_context *ctx, GLint x, GLint y, const GLvoid *pixels ) { const GLboolean scaleOrBias - = ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0; - const GLboolean zoom = ctx->Pixel.ZoomX != 1.0 || ctx->Pixel.ZoomY != 1.0; + = ctx->Pixel.DepthScale != 1.0f || ctx->Pixel.DepthBias != 0.0f; + const GLboolean zoom = ctx->Pixel.ZoomX != 1.0f || ctx->Pixel.ZoomY != 1.0f; SWspan span; INIT_SPAN(span, GL_BITMAP); @@ -415,7 +415,7 @@ draw_rgba_pixels( struct gl_context *ctx, GLint x, GLint y, const GLvoid *pixels ) { const GLint imgX = x, imgY = y; - const GLboolean zoom = ctx->Pixel.ZoomX!=1.0 || ctx->Pixel.ZoomY!=1.0; + const GLboolean zoom = ctx->Pixel.ZoomX != 1.0F || ctx->Pixel.ZoomY != 1.0F; GLbitfield transferOps = ctx->_ImageTransferState; SWspan span; @@ -601,10 +601,10 @@ draw_depth_stencil_pixels(struct gl_context *ctx, GLint x, GLint y, { const GLint imgX = x, imgY = y; const GLboolean scaleOrBias - = ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0; + = ctx->Pixel.DepthScale != 1.0F || ctx->Pixel.DepthBias != 0.0F; const GLuint stencilMask = ctx->Stencil.WriteMask[0]; const GLenum stencilType = GL_UNSIGNED_BYTE; - const GLboolean zoom = ctx->Pixel.ZoomX != 1.0 || ctx->Pixel.ZoomY != 1.0; + const GLboolean zoom = ctx->Pixel.ZoomX != 1.0F || ctx->Pixel.ZoomY != 1.0F; struct gl_renderbuffer *depthRb, *stencilRb; struct gl_pixelstore_attrib clippedUnpack = *unpack; diff --git a/src/mesa/swrast/s_fragprog.c b/src/mesa/swrast/s_fragprog.c index 175915a5a0b..4fbf66b9db7 100644 --- a/src/mesa/swrast/s_fragprog.c +++ b/src/mesa/swrast/s_fragprog.c @@ -243,9 +243,9 @@ run_program(struct gl_context *ctx, SWspan *span, GLuint start, GLuint end) /* Store result depth/z */ if (outputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { const GLfloat depth = machine->Outputs[FRAG_RESULT_DEPTH][2]; - if (depth <= 0.0) + if (depth <= 0.0F) span->array->z[i] = 0; - else if (depth >= 1.0) + else if (depth >= 1.0F) span->array->z[i] = ctx->DrawBuffer->_DepthMax; else span->array->z[i] = diff --git a/src/mesa/swrast/s_lines.c b/src/mesa/swrast/s_lines.c index 58bd2fc720a..ab8da7db289 100644 --- a/src/mesa/swrast/s_lines.c +++ b/src/mesa/swrast/s_lines.c @@ -241,7 +241,7 @@ _swrast_choose_line( struct gl_context *ctx ) USE(general_line); } else if (ctx->Depth.Test - || ctx->Line.Width != 1.0 + || ctx->Line.Width != 1.0F || ctx->Line.StippleFlag) { /* no texture, but Z, fog, width>1, stipple, etc. 
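The depth-bounds test above now scales BoundsMin/BoundsMax to 32-bit fixed point and, for 16- and 24-bit depth buffers, rounds them to the buffer's precision by folding the high bits back into the low bits. A small sketch of that bit-replication trick:

   /* Illustrative only. */
   #include <stdint.h>
   #include <stdio.h>

   static uint32_t expand16(uint16_t z16)
   {
      return ((uint32_t)z16 << 16) | z16;        /* 0xffff -> 0xffffffff */
   }

   static uint32_t round_to_16(uint32_t z32)
   {
      return (z32 & 0xffff0000u) | (z32 >> 16);  /* same idea as the hunk */
   }

   int main(void)
   {
      /* Both print 12341234: the bound snaps to a representable depth. */
      printf("%08x %08x\n", expand16(0x1234), round_to_16(0x12345678u));
      return 0;
   }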
*/ #if CHAN_BITS == 32 @@ -252,7 +252,7 @@ _swrast_choose_line( struct gl_context *ctx ) } else { assert(!ctx->Depth.Test); - assert(ctx->Line.Width == 1.0); + assert(ctx->Line.Width == 1.0F); /* simple lines */ USE(simple_no_z_rgba_line); } diff --git a/src/mesa/swrast/s_points.c b/src/mesa/swrast/s_points.c index 2212c95fa9a..d9aae73302c 100644 --- a/src/mesa/swrast/s_points.c +++ b/src/mesa/swrast/s_points.c @@ -208,9 +208,9 @@ sprite_point(struct gl_context *ctx, const SWvertex *vert) else { /* even size */ /* 0.501 factor allows conformance to pass */ - xmin = (GLint) (x + 0.501) - iRadius; + xmin = (GLint) (x + 0.501F) - iRadius; xmax = xmin + iSize - 1; - ymin = (GLint) (y + 0.501) - iRadius; + ymin = (GLint) (y + 0.501F) - iRadius; ymax = ymin + iSize - 1; } @@ -423,9 +423,9 @@ large_point(struct gl_context *ctx, const SWvertex *vert) else { /* even size */ /* 0.501 factor allows conformance to pass */ - xmin = (GLint) (x + 0.501) - iRadius; + xmin = (GLint) (x + 0.501F) - iRadius; xmax = xmin + iSize - 1; - ymin = (GLint) (y + 0.501) - iRadius; + ymin = (GLint) (y + 0.501F) - iRadius; ymax = ymin + iSize - 1; } @@ -552,7 +552,7 @@ _swrast_choose_point(struct gl_context *ctx) else if (ctx->Point.SmoothFlag) { swrast->Point = smooth_point; } - else if (size > 1.0 || + else if (size > 1.0F || ctx->Point._Attenuated || ctx->VertexProgram.PointSizeEnabled) { swrast->Point = large_point; diff --git a/src/mesa/swrast/s_span.c b/src/mesa/swrast/s_span.c index 3db10e163d7..cd939ba9510 100644 --- a/src/mesa/swrast/s_span.c +++ b/src/mesa/swrast/s_span.c @@ -506,7 +506,7 @@ interpolate_texcoords(struct gl_context *ctx, SWspan *span) /* LOD is calculated directly in the ansiotropic filter, we can * skip the normal lambda function as the result is ignored. */ - if (samp->MaxAnisotropy > 1.0 && + if (samp->MaxAnisotropy > 1.0F && samp->MinFilter == GL_LINEAR_MIPMAP_LINEAR) { needLambda = GL_FALSE; } @@ -886,16 +886,16 @@ apply_aa_coverage(SWspan *span) GLubyte (*rgba)[4] = span->array->rgba8; for (i = 0; i < span->end; i++) { const GLfloat a = rgba[i][ACOMP] * coverage[i]; - rgba[i][ACOMP] = (GLubyte) CLAMP(a, 0.0, 255.0); - assert(coverage[i] >= 0.0); - assert(coverage[i] <= 1.0); + rgba[i][ACOMP] = (GLubyte) CLAMP(a, 0.0F, 255.0F); + assert(coverage[i] >= 0.0F); + assert(coverage[i] <= 1.0F); } } else if (span->array->ChanType == GL_UNSIGNED_SHORT) { GLushort (*rgba)[4] = span->array->rgba16; for (i = 0; i < span->end; i++) { const GLfloat a = rgba[i][ACOMP] * coverage[i]; - rgba[i][ACOMP] = (GLushort) CLAMP(a, 0.0, 65535.0); + rgba[i][ACOMP] = (GLushort) CLAMP(a, 0.0F, 65535.0F); } } else { diff --git a/src/mesa/swrast/s_texcombine.c b/src/mesa/swrast/s_texcombine.c index 453bd36367b..da4a013634c 100644 --- a/src/mesa/swrast/s_texcombine.c +++ b/src/mesa/swrast/s_texcombine.c @@ -670,8 +670,8 @@ _swrast_texture_span( struct gl_context *ctx, SWspan *span ) } } - if (samp->MinLod != -1000.0 || - samp->MaxLod != 1000.0) { + if (samp->MinLod != -1000.0F || + samp->MaxLod != 1000.0F) { /* apply LOD clamping to lambda */ const GLfloat min = samp->MinLod; const GLfloat max = samp->MaxLod; @@ -682,7 +682,7 @@ _swrast_texture_span( struct gl_context *ctx, SWspan *span ) } } } - else if (samp->MaxAnisotropy > 1.0 && + else if (samp->MaxAnisotropy > 1.0F && samp->MinFilter == GL_LINEAR_MIPMAP_LINEAR) { /* sample_lambda_2d_aniso is beeing used as texture_sample_func, * it requires the current SWspan *span as an additional parameter. 
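Most of the swrast hunks above only add the F suffix to floating-point literals: comparing a GLfloat against an unsuffixed 0.0 or 1.0 promotes the float operand to double, which the suffix avoids (it also keeps float-to-double conversion warnings quiet on compilers that flag them). A one-line illustration:

   /* Illustrative only. */
   #include <stdio.h>

   int main(void)
   {
      float width = 1.0f;

      if (width != 1.0F)           /* float vs. float: no promotion */
         puts("wide line path");
      else
         puts("simple line path");
      return 0;
   }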
diff --git a/src/mesa/swrast/s_texfilter.c b/src/mesa/swrast/s_texfilter.c index abc1727cf29..314170fc751 100644 --- a/src/mesa/swrast/s_texfilter.c +++ b/src/mesa/swrast/s_texfilter.c @@ -1902,7 +1902,7 @@ sample_lambda_2d_aniso(struct gl_context *ctx, const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[u]; const GLboolean adjustLOD = (texUnit->LodBias + samp->LodBias != 0.0F) - || (samp->MinLod != -1000.0 || samp->MaxLod != 1000.0); + || (samp->MinLod != -1000.0F || samp->MaxLod != 1000.0F); GLuint i; @@ -1973,8 +1973,8 @@ sample_lambda_2d_aniso(struct gl_context *ctx, ctx->Const.MaxTextureLodBias); lod += bias; - if (samp->MinLod != -1000.0 || - samp->MaxLod != 1000.0) { + if (samp->MinLod != -1000.0F || + samp->MaxLod != 1000.0F) { /* apply LOD clamping to lambda */ lod = CLAMP(lod, samp->MinLod, samp->MaxLod); } @@ -3713,7 +3713,7 @@ _swrast_choose_texture_sample_func( struct gl_context *ctx, const struct gl_sampler_object *sampler) { if (!t || !_mesa_is_texture_complete(t, sampler)) { - return &null_sample_func; + return null_sample_func; } else { const GLboolean needLambda = @@ -3722,32 +3722,32 @@ _swrast_choose_texture_sample_func( struct gl_context *ctx, switch (t->Target) { case GL_TEXTURE_1D: if (is_depth_texture(t)) { - return &sample_depth_texture; + return sample_depth_texture; } else if (needLambda) { - return &sample_lambda_1d; + return sample_lambda_1d; } else if (sampler->MinFilter == GL_LINEAR) { - return &sample_linear_1d; + return sample_linear_1d; } else { assert(sampler->MinFilter == GL_NEAREST); - return &sample_nearest_1d; + return sample_nearest_1d; } case GL_TEXTURE_2D: if (is_depth_texture(t)) { - return &sample_depth_texture; + return sample_depth_texture; } else if (needLambda) { /* Anisotropic filtering extension. 
Activated only if mipmaps are used */ - if (sampler->MaxAnisotropy > 1.0 && + if (sampler->MaxAnisotropy > 1.0F && sampler->MinFilter == GL_LINEAR_MIPMAP_LINEAR) { - return &sample_lambda_2d_aniso; + return sample_lambda_2d_aniso; } - return &sample_lambda_2d; + return sample_lambda_2d; } else if (sampler->MinFilter == GL_LINEAR) { - return &sample_linear_2d; + return sample_linear_2d; } else { /* check for a few optimized cases */ @@ -3772,72 +3772,72 @@ _swrast_choose_texture_sample_func( struct gl_context *ctx, } case GL_TEXTURE_3D: if (needLambda) { - return &sample_lambda_3d; + return sample_lambda_3d; } else if (sampler->MinFilter == GL_LINEAR) { - return &sample_linear_3d; + return sample_linear_3d; } else { assert(sampler->MinFilter == GL_NEAREST); - return &sample_nearest_3d; + return sample_nearest_3d; } case GL_TEXTURE_CUBE_MAP: if (needLambda) { - return &sample_lambda_cube; + return sample_lambda_cube; } else if (sampler->MinFilter == GL_LINEAR) { - return &sample_linear_cube; + return sample_linear_cube; } else { assert(sampler->MinFilter == GL_NEAREST); - return &sample_nearest_cube; + return sample_nearest_cube; } case GL_TEXTURE_RECTANGLE_NV: if (is_depth_texture(t)) { - return &sample_depth_texture; + return sample_depth_texture; } else if (needLambda) { - return &sample_lambda_rect; + return sample_lambda_rect; } else if (sampler->MinFilter == GL_LINEAR) { - return &sample_linear_rect; + return sample_linear_rect; } else { assert(sampler->MinFilter == GL_NEAREST); - return &sample_nearest_rect; + return sample_nearest_rect; } case GL_TEXTURE_1D_ARRAY_EXT: if (is_depth_texture(t)) { - return &sample_depth_texture; + return sample_depth_texture; } else if (needLambda) { - return &sample_lambda_1d_array; + return sample_lambda_1d_array; } else if (sampler->MinFilter == GL_LINEAR) { - return &sample_linear_1d_array; + return sample_linear_1d_array; } else { assert(sampler->MinFilter == GL_NEAREST); - return &sample_nearest_1d_array; + return sample_nearest_1d_array; } case GL_TEXTURE_2D_ARRAY_EXT: if (is_depth_texture(t)) { - return &sample_depth_texture; + return sample_depth_texture; } else if (needLambda) { - return &sample_lambda_2d_array; + return sample_lambda_2d_array; } else if (sampler->MinFilter == GL_LINEAR) { - return &sample_linear_2d_array; + return sample_linear_2d_array; } else { assert(sampler->MinFilter == GL_NEAREST); - return &sample_nearest_2d_array; + return sample_nearest_2d_array; } default: _mesa_problem(ctx, "invalid target in _swrast_choose_texture_sample_func"); - return &null_sample_func; + return null_sample_func; } } } diff --git a/src/mesa/swrast/s_tritemp.h b/src/mesa/swrast/s_tritemp.h index fddbbfd99d6..1d71839713c 100644 --- a/src/mesa/swrast/s_tritemp.h +++ b/src/mesa/swrast/s_tritemp.h @@ -242,7 +242,7 @@ static void NAME(struct gl_context *ctx, const SWvertex *v0, if (IS_INF_OR_NAN(area) || area == 0.0F) return; - if (area * bf * swrast->_BackfaceCullSign < 0.0) + if (area * bf * swrast->_BackfaceCullSign < 0.0F) return; oneOverArea = 1.0F / area; diff --git a/src/mesa/swrast/s_zoom.c b/src/mesa/swrast/s_zoom.c index 9879e2a5f10..34b8eb19657 100644 --- a/src/mesa/swrast/s_zoom.c +++ b/src/mesa/swrast/s_zoom.c @@ -114,7 +114,7 @@ unzoom_x(GLfloat zoomX, GLint imageX, GLint zx) (zx - imageX) / zoomX = x - imageX; */ GLint x; - if (zoomX < 0.0) + if (zoomX < 0.0F) zx++; x = imageX + (GLint) ((zx - imageX) / zoomX); return x; diff --git a/src/mesa/swrast_setup/ss_tritmp.h b/src/mesa/swrast_setup/ss_tritmp.h index c38c76a4adb..adb77bd3247 
100644 --- a/src/mesa/swrast_setup/ss_tritmp.h +++ b/src/mesa/swrast_setup/ss_tritmp.h @@ -58,7 +58,7 @@ static void TAG(triangle)(struct gl_context *ctx, GLuint e0, GLuint e1, GLuint e if (IND & (SS_TWOSIDE_BIT | SS_UNFILLED_BIT)) { - facing = (cc < 0.0) ^ ctx->Polygon._FrontBit; + facing = (cc < 0.0F) ^ ctx->Polygon._FrontBit; if (IND & SS_UNFILLED_BIT) mode = facing ? ctx->Polygon.BackMode : ctx->Polygon.FrontMode; @@ -138,7 +138,7 @@ static void TAG(triangle)(struct gl_context *ctx, GLuint e0, GLuint e1, GLuint e * so no MRD value is used here. */ offset = ctx->Polygon.OffsetUnits; - if (cc * cc > 1e-16) { + if (cc * cc > 1e-16F) { const GLfloat ez = z[0] - z[2]; const GLfloat fz = z[1] - z[2]; const GLfloat oneOverArea = 1.0F / cc; diff --git a/src/mesa/tnl/t_context.c b/src/mesa/tnl/t_context.c index bc77ba8bf95..b5c0b3e1f5b 100644 --- a/src/mesa/tnl/t_context.c +++ b/src/mesa/tnl/t_context.c @@ -190,7 +190,7 @@ _tnl_InvalidateState( struct gl_context *ctx, GLuint new_state ) } if (new_state & (_NEW_VIEWPORT | _NEW_BUFFERS)) { - double scale[3], translate[3]; + float scale[3], translate[3]; _mesa_get_viewport_xform(ctx, 0, scale, translate); _math_matrix_viewport(&tnl->_WindowMap, scale, translate, ctx->DrawBuffer->_DepthMaxF); diff --git a/src/mesa/tnl/t_draw.c b/src/mesa/tnl/t_draw.c index 6adf1dce676..c130ab3f93d 100644 --- a/src/mesa/tnl/t_draw.c +++ b/src/mesa/tnl/t_draw.c @@ -257,7 +257,7 @@ static GLboolean *_tnl_import_edgeflag( struct gl_context *ctx, GLuint i; for (i = 0; i < count; i++) { - *bptr++ = ((GLfloat *)ptr)[0] == 1.0; + *bptr++ = ((GLfloat *)ptr)[0] == 1.0F; ptr += stride; } @@ -425,6 +425,7 @@ void _tnl_draw_prims(struct gl_context *ctx, GLuint min_index, GLuint max_index, struct gl_transform_feedback_object *tfb_vertcount, + unsigned stream, struct gl_buffer_object *indirect) { TNLcontext *tnl = TNL_CONTEXT(ctx); @@ -451,7 +452,7 @@ void _tnl_draw_prims(struct gl_context *ctx, printf("%s %d..%d\n", __func__, min_index, max_index); for (i = 0; i < nr_prims; i++) printf("prim %d: %s start %d count %d\n", i, - _mesa_lookup_enum_by_nr(prim[i].mode), + _mesa_enum_to_string(prim[i].mode), prim[i].start, prim[i].count); } diff --git a/src/mesa/tnl/t_rasterpos.c b/src/mesa/tnl/t_rasterpos.c index d4b45bac9ac..4bd9ac8539e 100644 --- a/src/mesa/tnl/t_rasterpos.c +++ b/src/mesa/tnl/t_rasterpos.c @@ -148,7 +148,7 @@ shade_rastpos(struct gl_context *ctx, SUB_3V(VP, light->_Position, vertex); /* d = length(VP) */ d = (GLfloat) LEN_3FV( VP ); - if (d > 1.0e-6) { + if (d > 1.0e-6F) { /* normalize VP */ GLfloat invd = 1.0F / d; SELF_SCALE_SCALAR_3V(VP, invd); @@ -172,7 +172,7 @@ shade_rastpos(struct gl_context *ctx, } } - if (attenuation < 1e-3) + if (attenuation < 1e-3F) continue; n_dot_VP = DOT3( normal, VP ); @@ -219,7 +219,7 @@ shade_rastpos(struct gl_context *ctx, shine = ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_SHININESS][0]; spec_coef = powf(n_dot_h, shine); - if (spec_coef > 1.0e-10) { + if (spec_coef > 1.0e-10F) { if (ctx->Light.Model.ColorControl==GL_SEPARATE_SPECULAR_COLOR) { ACC_SCALE_SCALAR_3V( specularContrib, spec_coef, light->_MatSpecular[0]); @@ -378,7 +378,7 @@ _tnl_RasterPos(struct gl_context *ctx, const GLfloat vObj[4]) GLfloat eye[4], clip[4], ndc[3], d; GLfloat *norm, eyenorm[3]; GLfloat *objnorm = ctx->Current.Attrib[VERT_ATTRIB_NORMAL]; - double scale[3], translate[3]; + float scale[3], translate[3]; /* apply modelview matrix: eye = MV * obj */ TRANSFORM_POINT( eye, ctx->ModelviewMatrixStack.Top->m, vObj ); diff --git a/src/mesa/tnl/t_vb_fog.c 
b/src/mesa/tnl/t_vb_fog.c index 1ca72f866b7..5489ed6857f 100644 --- a/src/mesa/tnl/t_vb_fog.c +++ b/src/mesa/tnl/t_vb_fog.c @@ -45,8 +45,8 @@ struct fog_stage_data { #define FOG_STAGE_DATA(stage) ((struct fog_stage_data *)stage->privatePtr) #define FOG_EXP_TABLE_SIZE 256 -#define FOG_MAX (10.0) -#define EXP_FOG_MAX .0006595 +#define FOG_MAX (10.0F) +#define EXP_FOG_MAX .0006595F #define FOG_INCR (FOG_MAX/FOG_EXP_TABLE_SIZE) static GLfloat exp_table[FOG_EXP_TABLE_SIZE]; static GLfloat inited = 0; @@ -54,7 +54,7 @@ static GLfloat inited = 0; #if 1 #define NEG_EXP( result, narg ) \ do { \ - GLfloat f = (GLfloat) (narg * (1.0/FOG_INCR)); \ + GLfloat f = (GLfloat) (narg * (1.0F / FOG_INCR)); \ GLint k = (GLint) f; \ if (k > FOG_EXP_TABLE_SIZE-2) \ result = (GLfloat) EXP_FOG_MAX; \ diff --git a/src/mesa/tnl/t_vb_light.c b/src/mesa/tnl/t_vb_light.c index dbd57fa6bfe..029265a4f83 100644 --- a/src/mesa/tnl/t_vb_light.c +++ b/src/mesa/tnl/t_vb_light.c @@ -137,23 +137,23 @@ validate_shine_table( struct gl_context *ctx, GLuint side, GLfloat shininess ) break; m = s->tab; - m[0] = 0.0; - if (shininess == 0.0) { + m[0] = 0.0F; + if (shininess == 0.0F) { for (j = 1 ; j <= SHINE_TABLE_SIZE ; j++) - m[j] = 1.0; + m[j] = 1.0F; } else { for (j = 1 ; j < SHINE_TABLE_SIZE ; j++) { - GLdouble t, x = j / (GLfloat) (SHINE_TABLE_SIZE - 1); - if (x < 0.005) /* underflow check */ - x = 0.005; - t = pow(x, shininess); - if (t > 1e-20) - m[j] = (GLfloat) t; + GLfloat t, x = j / (GLfloat) (SHINE_TABLE_SIZE - 1); + if (x < 0.005F) /* underflow check */ + x = 0.005F; + t = powf(x, shininess); + if (t > 1e-20F) + m[j] = t; else - m[j] = 0.0; + m[j] = 0.0F; } - m[SHINE_TABLE_SIZE] = 1.0; + m[SHINE_TABLE_SIZE] = 1.0F; } s->shininess = shininess; diff --git a/src/mesa/tnl/t_vb_lighttmp.h b/src/mesa/tnl/t_vb_lighttmp.h index f8786accbbb..3aebcd4b799 100644 --- a/src/mesa/tnl/t_vb_lighttmp.h +++ b/src/mesa/tnl/t_vb_lighttmp.h @@ -112,7 +112,7 @@ static void TAG(light_rgba_spec)( struct gl_context *ctx, GLint side; GLfloat contrib[3]; GLfloat attenuation; - GLfloat VP[3]; /* unit vector from vertex to light */ + GLfloat VP[3]; /* unit vector from vertex to light */ GLfloat n_dot_VP; /* n dot VP */ GLfloat *h; @@ -129,7 +129,7 @@ static void TAG(light_rgba_spec)( struct gl_context *ctx, d = (GLfloat) LEN_3FV( VP ); - if (d > 1e-6) { + if (d > 1e-6F) { GLfloat invd = 1.0F / d; SELF_SCALE_SCALAR_3V(VP, invd); } @@ -152,7 +152,7 @@ static void TAG(light_rgba_spec)( struct gl_context *ctx, } } - if (attenuation < 1e-3) + if (attenuation < 1e-3F) continue; /* this light makes no contribution */ /* Compute dot product or normal and vector from V to light pos */ @@ -204,7 +204,7 @@ static void TAG(light_rgba_spec)( struct gl_context *ctx, if (n_dot_h > 0.0F) { GLfloat spec_coef = lookup_shininess(ctx, side, n_dot_h); - if (spec_coef > 1.0e-10) { + if (spec_coef > 1.0e-10F) { spec_coef *= attenuation; ACC_SCALE_SCALAR_3V( spec[side], spec_coef, light->_MatSpecular[side]); @@ -283,12 +283,11 @@ static void TAG(light_rgba)( struct gl_context *ctx, /* Add contribution from each enabled light source */ foreach (light, &ctx->Light.EnabledList) { - GLfloat n_dot_h; GLfloat correction; GLint side; GLfloat contrib[3]; - GLfloat attenuation = 1.0; + GLfloat attenuation; GLfloat VP[3]; /* unit vector from vertex to light */ GLfloat n_dot_VP; /* n dot VP */ GLfloat *h; @@ -302,12 +301,11 @@ static void TAG(light_rgba)( struct gl_context *ctx, else { GLfloat d; /* distance from vertex to light */ - SUB_3V(VP, light->_Position, vertex); d = 
(GLfloat) LEN_3FV( VP ); - if ( d > 1e-6) { + if (d > 1e-6F) { GLfloat invd = 1.0F / d; SELF_SCALE_SCALAR_3V(VP, invd); } @@ -330,7 +328,7 @@ static void TAG(light_rgba)( struct gl_context *ctx, } } - if (attenuation < 1e-3) + if (attenuation < 1e-3F) continue; /* this light makes no contribution */ /* Compute dot product or normal and vector from V to light pos */ diff --git a/src/mesa/tnl/t_vb_normals.c b/src/mesa/tnl/t_vb_normals.c index 9aee1a2fb0b..6fc89c23b33 100644 --- a/src/mesa/tnl/t_vb_normals.c +++ b/src/mesa/tnl/t_vb_normals.c @@ -114,7 +114,7 @@ validate_normal_stage(struct gl_context *ctx, struct tnl_pipeline_stage *stage) store->NormalTransform = _mesa_normal_tab[transform | NORM_NORMALIZE]; } else if (ctx->Transform.RescaleNormals && - ctx->_ModelViewInvScale != 1.0) { + ctx->_ModelViewInvScale != 1.0F) { store->NormalTransform = _mesa_normal_tab[transform | NORM_RESCALE]; } else { @@ -131,7 +131,7 @@ validate_normal_stage(struct gl_context *ctx, struct tnl_pipeline_stage *stage) store->NormalTransform = _mesa_normal_tab[NORM_NORMALIZE]; } else if (!ctx->Transform.RescaleNormals && - ctx->_ModelViewInvScale != 1.0) { + ctx->_ModelViewInvScale != 1.0F) { store->NormalTransform = _mesa_normal_tab[NORM_RESCALE]; } else { diff --git a/src/mesa/tnl/t_vb_render.c b/src/mesa/tnl/t_vb_render.c index 4960ac0969e..03e8fcfa196 100644 --- a/src/mesa/tnl/t_vb_render.c +++ b/src/mesa/tnl/t_vb_render.c @@ -315,7 +315,7 @@ static GLboolean run_render( struct gl_context *ctx, if (MESA_VERBOSE & VERBOSE_PRIMS) _mesa_debug(NULL, "MESA prim %s %d..%d\n", - _mesa_lookup_enum_by_nr(prim & PRIM_MODE_MASK), + _mesa_enum_to_string(prim & PRIM_MODE_MASK), start, start+length); if (length) diff --git a/src/mesa/tnl/t_vertex_generic.c b/src/mesa/tnl/t_vertex_generic.c index 2a25a96928f..6c40c868363 100644 --- a/src/mesa/tnl/t_vertex_generic.c +++ b/src/mesa/tnl/t_vertex_generic.c @@ -1026,7 +1026,7 @@ void _tnl_generic_interp( struct gl_context *ctx, if (tnl->NeedNdcCoords) { const GLfloat *dstclip = VB->ClipPtr->data[edst]; - if (dstclip[3] != 0.0) { + if (dstclip[3] != 0.0f) { const GLfloat w = 1.0f / dstclip[3]; GLfloat pos[4]; diff --git a/src/mesa/tnl/t_vertex_sse.c b/src/mesa/tnl/t_vertex_sse.c index 30dc1a72080..14e7812ec78 100644 --- a/src/mesa/tnl/t_vertex_sse.c +++ b/src/mesa/tnl/t_vertex_sse.c @@ -592,7 +592,7 @@ static GLboolean build_vertex_emit( struct x86_program *p ) break; case GL_UNSIGNED_SHORT: default: - printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE)); + printf("unknown CHAN_TYPE %s\n", _mesa_enum_to_string(CHAN_TYPE)); return GL_FALSE; } break; diff --git a/src/mesa/tnl/tnl.h b/src/mesa/tnl/tnl.h index 8c59ff9e58f..5a9938e7afb 100644 --- a/src/mesa/tnl/tnl.h +++ b/src/mesa/tnl/tnl.h @@ -76,7 +76,7 @@ struct _mesa_prim; struct _mesa_index_buffer; void -_tnl_draw_prims( struct gl_context *ctx, +_tnl_draw_prims(struct gl_context *ctx, const struct _mesa_prim *prim, GLuint nr_prims, const struct _mesa_index_buffer *ib, @@ -84,6 +84,7 @@ _tnl_draw_prims( struct gl_context *ctx, GLuint min_index, GLuint max_index, struct gl_transform_feedback_object *tfb_vertcount, + unsigned stream, struct gl_buffer_object *indirect ); extern void diff --git a/src/mesa/tnl_dd/t_dd_dmatmp.h b/src/mesa/tnl_dd/t_dd_dmatmp.h index 667e2a6e5d5..7be39541e43 100644 --- a/src/mesa/tnl_dd/t_dd_dmatmp.h +++ b/src/mesa/tnl_dd/t_dd_dmatmp.h @@ -1256,7 +1256,7 @@ static GLboolean TAG(validate_render)( struct gl_context *ctx, } if (!ok) { -/* fprintf(stderr, "not ok %s\n", 
_mesa_lookup_enum_by_nr(prim & PRIM_MODE_MASK)); */ +/* fprintf(stderr, "not ok %s\n", _mesa_enum_to_string(prim & PRIM_MODE_MASK)); */ return GL_FALSE; } } diff --git a/src/mesa/tnl_dd/t_dd_unfilled.h b/src/mesa/tnl_dd/t_dd_unfilled.h index 82190c08916..ee15e773c88 100644 --- a/src/mesa/tnl_dd/t_dd_unfilled.h +++ b/src/mesa/tnl_dd/t_dd_unfilled.h @@ -60,7 +60,7 @@ static void TAG(unfilled_tri)( struct gl_context *ctx, } /* fprintf(stderr, "%s %s %d %d %d\n", __func__, */ -/* _mesa_lookup_enum_by_nr( mode ), */ +/* _mesa_enum_to_string( mode ), */ /* ef[e0], ef[e1], ef[e2]); */ if (mode == GL_POINT) { diff --git a/src/mesa/vbo/vbo.h b/src/mesa/vbo/vbo.h index 54dee6c464f..2aaff5df019 100644 --- a/src/mesa/vbo/vbo.h +++ b/src/mesa/vbo/vbo.h @@ -97,7 +97,8 @@ typedef void (*vbo_draw_func)( struct gl_context *ctx, GLuint min_index, GLuint max_index, struct gl_transform_feedback_object *tfb_vertcount, - struct gl_buffer_object *indirect ); + unsigned stream, + struct gl_buffer_object *indirect); diff --git a/src/mesa/vbo/vbo_context.c b/src/mesa/vbo/vbo_context.c index fd1ffe2f76d..e3eb286e482 100644 --- a/src/mesa/vbo/vbo_context.c +++ b/src/mesa/vbo/vbo_context.c @@ -37,9 +37,9 @@ static GLuint check_size( const GLfloat *attr ) { - if (attr[3] != 1.0) return 4; - if (attr[2] != 0.0) return 3; - if (attr[1] != 0.0) return 2; + if (attr[3] != 1.0F) return 4; + if (attr[2] != 0.0F) return 3; + if (attr[1] != 0.0F) return 2; return 1; } diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c index 72b8206ec23..34d2c1d3d6b 100644 --- a/src/mesa/vbo/vbo_exec_array.c +++ b/src/mesa/vbo/vbo_exec_array.c @@ -255,7 +255,7 @@ check_array_data(struct gl_context *ctx, struct gl_client_array *array, GLint k; for (k = 0; k < array->Size; k++) { if (IS_INF_OR_NAN(f[k]) || - f[k] >= 1.0e20 || f[k] <= -1.0e10) { + f[k] >= 1.0e20F || f[k] <= -1.0e10F) { printf("Bad array data:\n"); printf(" Element[%u].%u = %f\n", j, k, f[k]); printf(" Array %u at %p\n", attrib, (void* ) array); @@ -263,7 +263,7 @@ check_array_data(struct gl_context *ctx, struct gl_client_array *array, array->Type, array->Size, array->Stride); printf(" Address/offset %p in Buffer Object %u\n", array->Ptr, array->BufferObj->Name); - f[k] = 1.0; /* XXX replace the bad value! */ + f[k] = 1.0F; /* XXX replace the bad value! 
*/ } /*assert(!IS_INF_OR_NAN(f[k]));*/ } @@ -633,7 +633,7 @@ vbo_draw_arrays(struct gl_context *ctx, GLenum mode, GLint start, /* draw one or two prims */ check_buffers_are_unmapped(exec->array.inputs); vbo->draw_prims(ctx, prim, primCount, NULL, - GL_TRUE, start, start + count - 1, NULL, NULL); + GL_TRUE, start, start + count - 1, NULL, 0, NULL); } } else { @@ -644,7 +644,7 @@ vbo_draw_arrays(struct gl_context *ctx, GLenum mode, GLint start, check_buffers_are_unmapped(exec->array.inputs); vbo->draw_prims(ctx, prim, 1, NULL, GL_TRUE, start, start + count - 1, - NULL, NULL); + NULL, 0, NULL); } if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH) { @@ -786,7 +786,7 @@ vbo_exec_DrawArrays(GLenum mode, GLint start, GLsizei count) if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, "glDrawArrays(%s, %d, %d)\n", - _mesa_lookup_enum_by_nr(mode), start, count); + _mesa_enum_to_string(mode), start, count); if (!_mesa_validate_DrawArrays(ctx, mode, count)) return; @@ -813,7 +813,7 @@ vbo_exec_DrawArraysInstanced(GLenum mode, GLint start, GLsizei count, if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, "glDrawArraysInstanced(%s, %d, %d, %d)\n", - _mesa_lookup_enum_by_nr(mode), start, count, numInstances); + _mesa_enum_to_string(mode), start, count, numInstances); if (!_mesa_validate_DrawArraysInstanced(ctx, mode, start, count, numInstances)) return; @@ -839,7 +839,7 @@ vbo_exec_DrawArraysInstancedBaseInstance(GLenum mode, GLint first, GLsizei count if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, "glDrawArraysInstancedBaseInstance(%s, %d, %d, %d, %d)\n", - _mesa_lookup_enum_by_nr(mode), first, count, + _mesa_enum_to_string(mode), first, count, numInstances, baseInstance); if (!_mesa_validate_DrawArraysInstanced(ctx, mode, first, count, @@ -990,7 +990,7 @@ vbo_validated_drawrangeelements(struct gl_context *ctx, GLenum mode, check_buffers_are_unmapped(exec->array.inputs); vbo->draw_prims(ctx, prim, 1, &ib, - index_bounds_valid, start, end, NULL, NULL); + index_bounds_valid, start, end, NULL, 0, NULL); if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH) { _mesa_flush(ctx); @@ -1021,8 +1021,8 @@ vbo_exec_DrawRangeElementsBaseVertex(GLenum mode, if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, "glDrawRangeElementsBaseVertex(%s, %u, %u, %d, %s, %p, %d)\n", - _mesa_lookup_enum_by_nr(mode), start, end, count, - _mesa_lookup_enum_by_nr(type), indices, basevertex); + _mesa_enum_to_string(mode), start, end, count, + _mesa_enum_to_string(type), indices, basevertex); if (!_mesa_validate_DrawRangeElements(ctx, mode, start, end, count, type, indices)) @@ -1099,8 +1099,8 @@ vbo_exec_DrawRangeElements(GLenum mode, GLuint start, GLuint end, GET_CURRENT_CONTEXT(ctx); _mesa_debug(ctx, "glDrawRangeElements(%s, %u, %u, %d, %s, %p)\n", - _mesa_lookup_enum_by_nr(mode), start, end, count, - _mesa_lookup_enum_by_nr(type), indices); + _mesa_enum_to_string(mode), start, end, count, + _mesa_enum_to_string(type), indices); } vbo_exec_DrawRangeElementsBaseVertex(mode, start, end, count, type, @@ -1119,8 +1119,8 @@ vbo_exec_DrawElements(GLenum mode, GLsizei count, GLenum type, if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, "glDrawElements(%s, %u, %s, %p)\n", - _mesa_lookup_enum_by_nr(mode), count, - _mesa_lookup_enum_by_nr(type), indices); + _mesa_enum_to_string(mode), count, + _mesa_enum_to_string(type), indices); if (!_mesa_validate_DrawElements(ctx, mode, count, type, indices)) return; @@ -1141,8 +1141,8 @@ vbo_exec_DrawElementsBaseVertex(GLenum mode, GLsizei count, GLenum type, if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, 
"glDrawElementsBaseVertex(%s, %d, %s, %p, %d)\n", - _mesa_lookup_enum_by_nr(mode), count, - _mesa_lookup_enum_by_nr(type), indices, basevertex); + _mesa_enum_to_string(mode), count, + _mesa_enum_to_string(type), indices, basevertex); if (!_mesa_validate_DrawElements(ctx, mode, count, type, indices)) return; @@ -1163,8 +1163,8 @@ vbo_exec_DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type, if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, "glDrawElementsInstanced(%s, %d, %s, %p, %d)\n", - _mesa_lookup_enum_by_nr(mode), count, - _mesa_lookup_enum_by_nr(type), indices, numInstances); + _mesa_enum_to_string(mode), count, + _mesa_enum_to_string(type), indices, numInstances); if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices, numInstances)) @@ -1187,8 +1187,8 @@ vbo_exec_DrawElementsInstancedBaseVertex(GLenum mode, GLsizei count, GLenum type if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, "glDrawElementsInstancedBaseVertex(%s, %d, %s, %p, %d; %d)\n", - _mesa_lookup_enum_by_nr(mode), count, - _mesa_lookup_enum_by_nr(type), indices, + _mesa_enum_to_string(mode), count, + _mesa_enum_to_string(type), indices, numInstances, basevertex); if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices, @@ -1212,8 +1212,8 @@ vbo_exec_DrawElementsInstancedBaseInstance(GLenum mode, GLsizei count, GLenum ty if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, "glDrawElementsInstancedBaseInstance(%s, %d, %s, %p, %d, %d)\n", - _mesa_lookup_enum_by_nr(mode), count, - _mesa_lookup_enum_by_nr(type), indices, + _mesa_enum_to_string(mode), count, + _mesa_enum_to_string(type), indices, numInstances, baseInstance); if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices, @@ -1238,8 +1238,8 @@ vbo_exec_DrawElementsInstancedBaseVertexBaseInstance(GLenum mode, GLsizei count, if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, "glDrawElementsInstancedBaseVertexBaseInstance(%s, %d, %s, %p, %d, %d, %d)\n", - _mesa_lookup_enum_by_nr(mode), count, - _mesa_lookup_enum_by_nr(type), indices, + _mesa_enum_to_string(mode), count, + _mesa_enum_to_string(type), indices, numInstances, basevertex, baseInstance); if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices, @@ -1350,7 +1350,7 @@ vbo_validated_multidrawelements(struct gl_context *ctx, GLenum mode, check_buffers_are_unmapped(exec->array.inputs); vbo->draw_prims(ctx, prim, primcount, &ib, - false, ~0, ~0, NULL, NULL); + false, ~0, ~0, NULL, 0, NULL); } else { /* render one prim at a time */ for (i = 0; i < primcount; i++) { @@ -1379,7 +1379,7 @@ vbo_validated_multidrawelements(struct gl_context *ctx, GLenum mode, check_buffers_are_unmapped(exec->array.inputs); vbo->draw_prims(ctx, prim, 1, &ib, - false, ~0, ~0, NULL, NULL); + false, ~0, ~0, NULL, 0, NULL); } } @@ -1464,7 +1464,7 @@ vbo_draw_transform_feedback(struct gl_context *ctx, GLenum mode, check_buffers_are_unmapped(exec->array.inputs); vbo->draw_prims(ctx, prim, 1, NULL, - GL_TRUE, 0, 0, obj, NULL); + GL_TRUE, 0, 0, obj, stream, NULL); if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH) { _mesa_flush(ctx); @@ -1488,7 +1488,7 @@ vbo_exec_DrawTransformFeedback(GLenum mode, GLuint name) if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, "glDrawTransformFeedback(%s, %d)\n", - _mesa_lookup_enum_by_nr(mode), name); + _mesa_enum_to_string(mode), name); vbo_draw_transform_feedback(ctx, mode, obj, 0, 1); } @@ -1502,7 +1502,7 @@ vbo_exec_DrawTransformFeedbackStream(GLenum mode, GLuint name, GLuint stream) if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, 
"glDrawTransformFeedbackStream(%s, %u, %u)\n", - _mesa_lookup_enum_by_nr(mode), name, stream); + _mesa_enum_to_string(mode), name, stream); vbo_draw_transform_feedback(ctx, mode, obj, stream, 1); } @@ -1517,7 +1517,7 @@ vbo_exec_DrawTransformFeedbackInstanced(GLenum mode, GLuint name, if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, "glDrawTransformFeedbackInstanced(%s, %d)\n", - _mesa_lookup_enum_by_nr(mode), name); + _mesa_enum_to_string(mode), name); vbo_draw_transform_feedback(ctx, mode, obj, 0, primcount); } @@ -1533,7 +1533,7 @@ vbo_exec_DrawTransformFeedbackStreamInstanced(GLenum mode, GLuint name, if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, "glDrawTransformFeedbackStreamInstanced" "(%s, %u, %u, %i)\n", - _mesa_lookup_enum_by_nr(mode), name, stream, primcount); + _mesa_enum_to_string(mode), name, stream, primcount); vbo_draw_transform_feedback(ctx, mode, obj, stream, primcount); } @@ -1563,7 +1563,7 @@ vbo_validated_drawarraysindirect(struct gl_context *ctx, check_buffers_are_unmapped(exec->array.inputs); vbo->draw_prims(ctx, prim, 1, NULL, GL_TRUE, 0, ~0, - NULL, + NULL, 0, ctx->DrawIndirectBuffer); if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH) @@ -1603,7 +1603,7 @@ vbo_validated_multidrawarraysindirect(struct gl_context *ctx, check_buffers_are_unmapped(exec->array.inputs); vbo->draw_prims(ctx, prim, primcount, NULL, GL_TRUE, 0, ~0, - NULL, + NULL, 0, ctx->DrawIndirectBuffer); free(prim); @@ -1640,7 +1640,7 @@ vbo_validated_drawelementsindirect(struct gl_context *ctx, check_buffers_are_unmapped(exec->array.inputs); vbo->draw_prims(ctx, prim, 1, &ib, GL_TRUE, 0, ~0, - NULL, + NULL, 0, ctx->DrawIndirectBuffer); if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH) @@ -1689,7 +1689,7 @@ vbo_validated_multidrawelementsindirect(struct gl_context *ctx, check_buffers_are_unmapped(exec->array.inputs); vbo->draw_prims(ctx, prim, primcount, &ib, GL_TRUE, 0, ~0, - NULL, + NULL, 0, ctx->DrawIndirectBuffer); free(prim); @@ -1709,7 +1709,7 @@ vbo_exec_DrawArraysIndirect(GLenum mode, const GLvoid *indirect) if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, "glDrawArraysIndirect(%s, %p)\n", - _mesa_lookup_enum_by_nr(mode), indirect); + _mesa_enum_to_string(mode), indirect); if (!_mesa_validate_DrawArraysIndirect(ctx, mode, indirect)) return; @@ -1725,8 +1725,8 @@ vbo_exec_DrawElementsIndirect(GLenum mode, GLenum type, if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, "glDrawElementsIndirect(%s, %s, %p)\n", - _mesa_lookup_enum_by_nr(mode), - _mesa_lookup_enum_by_nr(type), indirect); + _mesa_enum_to_string(mode), + _mesa_enum_to_string(type), indirect); if (!_mesa_validate_DrawElementsIndirect(ctx, mode, type, indirect)) return; @@ -1743,7 +1743,7 @@ vbo_exec_MultiDrawArraysIndirect(GLenum mode, if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, "glMultiDrawArraysIndirect(%s, %p, %i, %i)\n", - _mesa_lookup_enum_by_nr(mode), indirect, primcount, stride); + _mesa_enum_to_string(mode), indirect, primcount, stride); /* If <stride> is zero, the array elements are treated as tightly packed. */ if (stride == 0) @@ -1768,8 +1768,8 @@ vbo_exec_MultiDrawElementsIndirect(GLenum mode, GLenum type, if (MESA_VERBOSE & VERBOSE_DRAW) _mesa_debug(ctx, "glMultiDrawElementsIndirect(%s, %s, %p, %i, %i)\n", - _mesa_lookup_enum_by_nr(mode), - _mesa_lookup_enum_by_nr(type), indirect, primcount, stride); + _mesa_enum_to_string(mode), + _mesa_enum_to_string(type), indirect, primcount, stride); /* If <stride> is zero, the array elements are treated as tightly packed. 
*/ if (stride == 0) diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c index 37b53a8309d..2bfb0c32b73 100644 --- a/src/mesa/vbo/vbo_exec_draw.c +++ b/src/mesa/vbo/vbo_exec_draw.c @@ -412,7 +412,7 @@ vbo_exec_vtx_flush(struct vbo_exec_context *exec, GLboolean keepUnmapped) GL_TRUE, 0, exec->vtx.vert_count - 1, - NULL, NULL); + NULL, 0, NULL); /* If using a real VBO, get new storage -- unless asked not to. */ diff --git a/src/mesa/vbo/vbo_primitive_restart.c b/src/mesa/vbo/vbo_primitive_restart.c index dafc4fd2a9a..0662c5cd4ef 100644 --- a/src/mesa/vbo/vbo_primitive_restart.c +++ b/src/mesa/vbo/vbo_primitive_restart.c @@ -251,11 +251,11 @@ vbo_sw_primitive_restart(struct gl_context *ctx, (temp_prim.count == sub_prim->count)) { draw_prims_func(ctx, &temp_prim, 1, ib, GL_TRUE, sub_prim->min_index, sub_prim->max_index, - NULL, NULL); + NULL, 0, NULL); } else { draw_prims_func(ctx, &temp_prim, 1, ib, GL_FALSE, -1, -1, - NULL, NULL); + NULL, 0, NULL); } } if (sub_end_index >= end_index) { diff --git a/src/mesa/vbo/vbo_rebase.c b/src/mesa/vbo/vbo_rebase.c index c3c4b64e65c..24c04ca7e6a 100644 --- a/src/mesa/vbo/vbo_rebase.c +++ b/src/mesa/vbo/vbo_rebase.c @@ -258,7 +258,7 @@ void vbo_rebase_prims( struct gl_context *ctx, GL_TRUE, 0, max_index - min_index, - NULL, NULL ); + NULL, 0, NULL ); ctx->Array._DrawArrays = saved_arrays; ctx->NewDriverState |= ctx->DriverFlags.NewArray; diff --git a/src/mesa/vbo/vbo_save_draw.c b/src/mesa/vbo/vbo_save_draw.c index de744e0c763..b1fd6892026 100644 --- a/src/mesa/vbo/vbo_save_draw.c +++ b/src/mesa/vbo/vbo_save_draw.c @@ -314,7 +314,7 @@ vbo_save_playback_vertex_list(struct gl_context *ctx, void *data) GL_TRUE, 0, /* Node is a VBO, so this is ok */ node->count - 1, - NULL, NULL); + NULL, 0, NULL); } } diff --git a/src/mesa/vbo/vbo_split_copy.c b/src/mesa/vbo/vbo_split_copy.c index 7b1e20b18d2..cb27ef961ab 100644 --- a/src/mesa/vbo/vbo_split_copy.c +++ b/src/mesa/vbo/vbo_split_copy.c @@ -203,7 +203,7 @@ flush( struct copy_context *copy ) GL_TRUE, 0, copy->dstbuf_nr - 1, - NULL, NULL ); + NULL, 0, NULL ); ctx->Array._DrawArrays = saved_arrays; ctx->NewDriverState |= ctx->DriverFlags.NewArray; diff --git a/src/mesa/vbo/vbo_split_inplace.c b/src/mesa/vbo/vbo_split_inplace.c index 5887b74d829..cff4bcd30ff 100644 --- a/src/mesa/vbo/vbo_split_inplace.c +++ b/src/mesa/vbo/vbo_split_inplace.c @@ -94,7 +94,7 @@ static void flush_vertex( struct split_context *split ) !split->ib, split->min_index, split->max_index, - NULL, NULL); + NULL, 0, NULL); ctx->Array._DrawArrays = saved_arrays; ctx->NewDriverState |= ctx->DriverFlags.NewArray; |
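The other recurring change in the tnl/vbo hunks is the extra stream argument threaded through vbo_draw_func and _tnl_draw_prims: ordinary draw paths pass 0, while the DrawTransformFeedbackStream paths forward the stream they were given. A rough sketch of the updated callback shape and its two kinds of call sites, using simplified stand-in types rather than Mesa's real headers:

/* Simplified stand-ins for the real Mesa types, for illustration only. */
struct gl_context;
struct _mesa_prim;
struct _mesa_index_buffer;
struct gl_transform_feedback_object;
struct gl_buffer_object;

typedef void (*vbo_draw_func)(struct gl_context *ctx,
                              const struct _mesa_prim *prims,
                              unsigned nr_prims,
                              const struct _mesa_index_buffer *ib,
                              unsigned index_bounds_valid,
                              unsigned min_index,
                              unsigned max_index,
                              struct gl_transform_feedback_object *tfb_vertcount,
                              unsigned stream,   /* newly added parameter */
                              struct gl_buffer_object *indirect);

/* A regular array draw passes stream 0, mirroring the call sites above. */
static void draw_regular(vbo_draw_func draw, struct gl_context *ctx,
                         const struct _mesa_prim *prims, unsigned nr_prims,
                         unsigned start, unsigned count)
{
   draw(ctx, prims, nr_prims, NULL, 1, start, start + count - 1,
        NULL, 0, NULL);
}

/* A DrawTransformFeedbackStream-style path forwards the stream along with
 * the transform feedback object that supplies the vertex count. */
static void draw_xfb_stream(vbo_draw_func draw, struct gl_context *ctx,
                            const struct _mesa_prim *prims,
                            struct gl_transform_feedback_object *obj,
                            unsigned stream)
{
   draw(ctx, prims, 1, NULL, 1, 0, 0, obj, stream, NULL);
}

Since the software tnl path ignores the stream, _tnl_draw_prims only grows the parameter to keep the signature compatible with vbo_draw_func; the many NULL, 0, NULL updates in vbo_exec_array.c, vbo_rebase.c, vbo_split_*.c and friends are purely mechanical.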