summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorJason Ekstrand <[email protected]>2015-11-23 14:03:47 -0800
committerJason Ekstrand <[email protected]>2015-11-23 14:03:47 -0800
commit179fc4aae8f782453f0488e8dd508f9a01117376 (patch)
tree5f0cc77b30d86b581fb968a71ba83c5e4c2546d7 /src
parente14b2c76b40398a61f45f5d058079641661a66cb (diff)
parentd9b8fde963a53d4e06570d8bece97f806714507a (diff)
Merge remote-tracking branch 'mesa-public/master' into vulkan
This pulls in nir cloning and some much-needed upstream refactors.
Diffstat (limited to 'src')
-rw-r--r--src/egl/Makefile.am15
-rw-r--r--src/egl/drivers/dri2/egl_dri2.c118
-rw-r--r--src/egl/drivers/dri2/egl_dri2.h19
-rw-r--r--src/egl/drivers/dri2/platform_android.c1
-rw-r--r--src/egl/drivers/dri2/platform_drm.c1
-rw-r--r--src/egl/drivers/dri2/platform_wayland.c2
-rw-r--r--src/egl/drivers/dri2/platform_x11.c125
-rw-r--r--src/egl/drivers/dri2/platform_x11_dri3.c547
-rw-r--r--src/egl/drivers/dri2/platform_x11_dri3.h41
-rwxr-xr-xsrc/egl/egl-symbols-check55
-rw-r--r--src/gallium/Android.mk1
-rw-r--r--src/gallium/Automake.inc6
-rw-r--r--src/gallium/Makefile.am6
-rw-r--r--src/gallium/SConscript1
-rw-r--r--src/gallium/auxiliary/Makefile.am14
-rw-r--r--src/gallium/auxiliary/Makefile.sources2
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_misc.cpp9
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c7
-rw-r--r--src/gallium/auxiliary/hud/hud_context.c59
-rw-r--r--src/gallium/auxiliary/hud/hud_driver_query.c271
-rw-r--r--src/gallium/auxiliary/hud/hud_private.h13
-rw-r--r--src/gallium/auxiliary/nir/tgsi_to_nir.c29
-rw-r--r--src/gallium/auxiliary/os/os_process.c47
-rw-r--r--src/gallium/auxiliary/pipe-loader/Android.mk49
-rw-r--r--src/gallium/auxiliary/pipe-loader/Makefile.am32
-rw-r--r--src/gallium/auxiliary/pipe-loader/SConscript33
-rw-r--r--src/gallium/auxiliary/pipe-loader/pipe_loader.c12
-rw-r--r--src/gallium/auxiliary/pipe-loader/pipe_loader.h22
-rw-r--r--src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c179
-rw-r--r--src/gallium/auxiliary/pipe-loader/pipe_loader_priv.h3
-rw-r--r--src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c223
-rw-r--r--src/gallium/auxiliary/target-helpers/drm_helper.h275
-rw-r--r--src/gallium/auxiliary/target-helpers/drm_helper_public.h37
-rw-r--r--src/gallium/auxiliary/target-helpers/inline_drm_helper.h531
-rw-r--r--src/gallium/auxiliary/target-helpers/inline_sw_helper.h65
-rw-r--r--src/gallium/auxiliary/util/u_dl.c2
-rw-r--r--src/gallium/auxiliary/util/u_format.csv30
-rw-r--r--src/gallium/auxiliary/util/u_format.h65
-rw-r--r--src/gallium/auxiliary/util/u_format_fake.c37
-rw-r--r--src/gallium/auxiliary/util/u_format_fake.h66
-rw-r--r--src/gallium/auxiliary/util/u_format_pack.py2
-rwxr-xr-xsrc/gallium/auxiliary/util/u_format_table.py10
-rw-r--r--src/gallium/auxiliary/vl/vl_winsys.h37
-rw-r--r--src/gallium/auxiliary/vl/vl_winsys_dri.c108
-rw-r--r--src/gallium/auxiliary/vl/vl_winsys_drm.c42
-rw-r--r--src/gallium/drivers/freedreno/a2xx/a2xx.xml.h4
-rw-r--r--src/gallium/drivers/freedreno/a3xx/a3xx.xml.h56
-rw-r--r--src/gallium/drivers/freedreno/a3xx/fd3_draw.c2
-rw-r--r--src/gallium/drivers/freedreno/a3xx/fd3_emit.c60
-rw-r--r--src/gallium/drivers/freedreno/a3xx/fd3_format.c24
-rw-r--r--src/gallium/drivers/freedreno/a3xx/fd3_format.h1
-rw-r--r--src/gallium/drivers/freedreno/a3xx/fd3_texture.c23
-rw-r--r--src/gallium/drivers/freedreno/a4xx/a4xx.xml.h80
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_blend.c27
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_blend.h7
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_draw.c11
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_draw.h7
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_emit.c132
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_format.c147
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_gmem.c3
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_program.c7
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c12
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h1
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_screen.c2
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_texture.c56
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_texture.h3
-rw-r--r--src/gallium/drivers/freedreno/adreno_common.xml.h23
-rw-r--r--src/gallium/drivers/freedreno/adreno_pm4.xml.h4
-rw-r--r--src/gallium/drivers/freedreno/freedreno_context.h4
-rw-r--r--src/gallium/drivers/freedreno/freedreno_draw.c8
-rw-r--r--src/gallium/drivers/freedreno/freedreno_query.c11
-rw-r--r--src/gallium/drivers/freedreno/freedreno_resource.c221
-rw-r--r--src/gallium/drivers/freedreno/freedreno_resource.h3
-rw-r--r--src/gallium/drivers/freedreno/freedreno_screen.c36
-rw-r--r--src/gallium/drivers/freedreno/freedreno_texture.c34
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c56
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_shader.h6
-rw-r--r--src/gallium/drivers/nouveau/Makefile.sources6
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp3
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp46
-rw-r--r--src/gallium/drivers/nouveau/nouveau_buffer.c8
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_compute.c320
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_compute.xml.h444
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_context.c45
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_context.h24
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_program.c27
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_program.h9
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_push.c42
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_query.c77
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_query.h6
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_query_hw.c47
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_query_hw.h16
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c207
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.h34
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c417
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.h45
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_screen.c65
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_screen.h19
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_state.c99
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_state_validate.c3
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_surface.c18
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_vbo.c15
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_winsys.h1
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_context.c17
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_query.c6
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c3
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_surface.c12
-rw-r--r--src/gallium/drivers/radeon/Makefile.sources2
-rw-r--r--src/gallium/drivers/radeon/r600_pipe_common.c59
-rw-r--r--src/gallium/drivers/radeon/r600_pipe_common.h18
-rw-r--r--src/gallium/drivers/radeon/r600_query.c1017
-rw-r--r--src/gallium/drivers/radeon/r600_query.h136
-rw-r--r--src/gallium/drivers/radeon/radeon_vce.c24
-rw-r--r--src/gallium/drivers/radeon/radeon_vce.h3
-rw-r--r--src/gallium/drivers/radeon/radeon_vce_52.c242
-rw-r--r--src/gallium/drivers/radeonsi/si_state.c28
-rw-r--r--src/gallium/drivers/softpipe/Automake.inc5
-rw-r--r--src/gallium/drivers/svga/svga_context.h15
-rw-r--r--src/gallium/drivers/svga/svga_format.c148
-rw-r--r--src/gallium/drivers/svga/svga_format.h4
-rw-r--r--src/gallium/drivers/svga/svga_pipe_query.c9
-rw-r--r--src/gallium/drivers/svga/svga_resource_buffer.c5
-rw-r--r--src/gallium/drivers/svga/svga_resource_buffer_upload.c4
-rw-r--r--src/gallium/drivers/svga/svga_resource_texture.c6
-rw-r--r--src/gallium/drivers/svga/svga_screen.c2
-rw-r--r--src/gallium/drivers/svga/svga_state_sampler.c3
-rw-r--r--src/gallium/drivers/trace/tr_screen.c3
-rw-r--r--src/gallium/drivers/vc4/Automake.inc4
-rw-r--r--src/gallium/drivers/vc4/Makefile.am1
-rw-r--r--src/gallium/drivers/vc4/vc4_nir_lower_blend.c4
-rw-r--r--src/gallium/drivers/vc4/vc4_nir_lower_io.c7
-rw-r--r--src/gallium/drivers/vc4/vc4_opt_algebraic.c2
-rw-r--r--src/gallium/drivers/vc4/vc4_program.c6
-rw-r--r--src/gallium/drivers/vc4/vc4_qir.c8
-rw-r--r--src/gallium/drivers/vc4/vc4_qir.h8
-rw-r--r--src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c56
-rw-r--r--src/gallium/drivers/vc4/vc4_qpu_emit.c4
-rw-r--r--src/gallium/drivers/vc4/vc4_reorder_uniforms.c26
-rw-r--r--src/gallium/include/pipe/p_context.h19
-rw-r--r--src/gallium/include/pipe/p_defines.h36
-rw-r--r--src/gallium/include/pipe/p_format.h30
-rw-r--r--src/gallium/include/state_tracker/drm_driver.h6
-rw-r--r--src/gallium/include/state_tracker/sw_driver.h21
-rw-r--r--src/gallium/state_trackers/clover/Makefile.am2
-rw-r--r--src/gallium/state_trackers/clover/core/device.cpp2
-rw-r--r--src/gallium/state_trackers/dri/Android.mk3
-rw-r--r--src/gallium/state_trackers/dri/Makefile.am7
-rw-r--r--src/gallium/state_trackers/dri/SConscript4
-rw-r--r--src/gallium/state_trackers/dri/dri2.c42
-rw-r--r--src/gallium/state_trackers/dri/dri_screen.c7
-rw-r--r--src/gallium/state_trackers/dri/drisw.c12
-rw-r--r--src/gallium/state_trackers/omx/entrypoint.c37
-rw-r--r--src/gallium/state_trackers/va/context.c23
-rw-r--r--src/gallium/state_trackers/va/picture.c2
-rw-r--r--src/gallium/state_trackers/va/surface.c13
-rw-r--r--src/gallium/state_trackers/vdpau/device.c6
-rw-r--r--src/gallium/state_trackers/vdpau/presentation.c18
-rw-r--r--src/gallium/state_trackers/xa/Makefile.am9
-rw-r--r--src/gallium/state_trackers/xa/xa_tracker.c18
-rw-r--r--src/gallium/state_trackers/xvmc/context.c12
-rw-r--r--src/gallium/state_trackers/xvmc/surface.c13
-rw-r--r--src/gallium/targets/d3dadapter9/Makefile.am24
-rw-r--r--src/gallium/targets/d3dadapter9/drm.c83
-rw-r--r--src/gallium/targets/dri/Android.mk3
-rw-r--r--src/gallium/targets/dri/Makefile.am11
-rw-r--r--src/gallium/targets/dri/SConscript2
-rw-r--r--src/gallium/targets/dri/target.c165
-rw-r--r--src/gallium/targets/omx/Makefile.am10
-rw-r--r--src/gallium/targets/omx/target.c2
-rw-r--r--src/gallium/targets/opencl/Makefile.am3
-rw-r--r--src/gallium/targets/pipe-loader/Makefile.am5
-rw-r--r--src/gallium/targets/pipe-loader/pipe.sym2
-rw-r--r--src/gallium/targets/pipe-loader/pipe_swrast.c34
-rw-r--r--src/gallium/targets/va/Makefile.am10
-rw-r--r--src/gallium/targets/va/target.c2
-rw-r--r--src/gallium/targets/vdpau/Makefile.am10
-rw-r--r--src/gallium/targets/vdpau/target.c2
-rw-r--r--src/gallium/targets/xa/Makefile.am10
-rw-r--r--src/gallium/targets/xa/target.c2
-rw-r--r--src/gallium/targets/xvmc/Makefile.am10
-rw-r--r--src/gallium/targets/xvmc/target.c2
-rw-r--r--src/gallium/tests/trivial/Makefile.am9
-rw-r--r--src/gallium/tests/trivial/compute.c2
-rw-r--r--src/gallium/tests/trivial/quad-tex.c2
-rw-r--r--src/gallium/tests/trivial/tri.c2
-rw-r--r--src/glsl/Android.gen.mk3
-rw-r--r--src/glsl/Makefile.sources3
-rw-r--r--src/glsl/ast.h57
-rw-r--r--src/glsl/ast_to_hir.cpp1055
-rw-r--r--src/glsl/ast_type.cpp153
-rw-r--r--src/glsl/builtin_functions.cpp59
-rw-r--r--src/glsl/builtin_variables.cpp70
-rw-r--r--src/glsl/glcpp/glcpp-parse.y5
-rw-r--r--src/glsl/glsl_parser.yy117
-rw-r--r--src/glsl/glsl_parser_extras.cpp54
-rw-r--r--src/glsl/glsl_parser_extras.h7
-rw-r--r--src/glsl/ir.cpp11
-rw-r--r--src/glsl/ir.h4
-rw-r--r--src/glsl/ir_clone.cpp2
-rw-r--r--src/glsl/ir_equals.cpp10
-rw-r--r--src/glsl/ir_hv_accept.cpp1
-rw-r--r--src/glsl/ir_print_visitor.cpp10
-rw-r--r--src/glsl/ir_rvalue_visitor.cpp1
-rw-r--r--src/glsl/link_varyings.cpp46
-rw-r--r--src/glsl/linker.cpp33
-rw-r--r--src/glsl/nir/glsl_to_nir.cpp41
-rw-r--r--src/glsl/nir/glsl_types.cpp1
-rw-r--r--src/glsl/nir/glsl_types.h5
-rw-r--r--src/glsl/nir/nir.c79
-rw-r--r--src/glsl/nir/nir.h79
-rw-r--r--src/glsl/nir/nir_builder.h23
-rw-r--r--src/glsl/nir/nir_clone.c674
-rw-r--r--src/glsl/nir/nir_constant_expressions.py2
-rw-r--r--src/glsl/nir/nir_intrinsics.h1
-rw-r--r--src/glsl/nir/nir_lower_clip.c2
-rw-r--r--src/glsl/nir/nir_lower_idiv.c8
-rw-r--r--src/glsl/nir/nir_lower_io.c2
-rw-r--r--src/glsl/nir/nir_lower_tex.c87
-rw-r--r--src/glsl/nir/nir_lower_two_sided_color.c2
-rw-r--r--src/glsl/nir/nir_metadata.c36
-rw-r--r--src/glsl/nir/nir_opcodes.py82
-rw-r--r--src/glsl/nir/nir_opt_copy_propagate.c7
-rw-r--r--src/glsl/nir/nir_print.c14
-rw-r--r--src/glsl/nir/nir_search.c4
-rw-r--r--src/glsl/nir/nir_validate.c18
-rw-r--r--src/glsl/nir/spirv_to_nir.c2
-rw-r--r--src/glsl/opt_tree_grafting.cpp1
-rw-r--r--src/glx/Makefile.am2
-rw-r--r--src/glx/dri3_glx.c1407
-rw-r--r--src/glx/dri3_priv.h94
-rw-r--r--src/loader/Makefile.am17
-rw-r--r--src/loader/loader_dri3_helper.c1396
-rw-r--r--src/loader/loader_dri3_helper.h241
-rw-r--r--src/mapi/glapi/gen/EXT_gpu_shader4.xml3
-rw-r--r--src/mapi/glapi/gen/es_EXT.xml26
-rw-r--r--src/mesa/drivers/common/meta_generate_mipmap.c13
-rw-r--r--src/mesa/drivers/dri/common/xmlconfig.c3
-rw-r--r--src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp2
-rw-r--r--src/mesa/drivers/dri/i965/brw_clip_util.c5
-rw-r--r--src/mesa/drivers/dri/i965/brw_compiler.h32
-rw-r--r--src/mesa/drivers/dri/i965/brw_context.c135
-rw-r--r--src/mesa/drivers/dri/i965/brw_context.h4
-rw-r--r--src/mesa/drivers/dri/i965/brw_defines.h16
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs.cpp178
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs.h15
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_builder.h4
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp3
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp2
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp56
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_cse.cpp3
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_generator.cpp77
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_nir.cpp343
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp49
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_visitor.cpp278
-rw-r--r--src/mesa/drivers/dri/i965/brw_gs.c7
-rw-r--r--src/mesa/drivers/dri/i965/brw_inst.h4
-rw-r--r--src/mesa/drivers/dri/i965/brw_ir_fs.h5
-rw-r--r--src/mesa/drivers/dri/i965/brw_ir_vec4.h5
-rw-r--r--src/mesa/drivers/dri/i965/brw_link.cpp11
-rw-r--r--src/mesa/drivers/dri/i965/brw_meta_fast_clear.c181
-rw-r--r--src/mesa/drivers/dri/i965/brw_nir.c330
-rw-r--r--src/mesa/drivers/dri/i965/brw_nir.h24
-rw-r--r--src/mesa/drivers/dri/i965/brw_program.c15
-rw-r--r--src/mesa/drivers/dri/i965/brw_reg.h53
-rw-r--r--src/mesa/drivers/dri/i965/brw_shader.cpp42
-rw-r--r--src/mesa/drivers/dri/i965/brw_shader.h2
-rw-r--r--src/mesa/drivers/dri/i965/brw_surface_formats.c524
-rw-r--r--src/mesa/drivers/dri/i965/brw_surface_formats.h1
-rw-r--r--src/mesa/drivers/dri/i965/brw_vec4.cpp85
-rw-r--r--src/mesa/drivers/dri/i965/brw_vec4.h5
-rw-r--r--src/mesa/drivers/dri/i965/brw_vec4_builder.h2
-rw-r--r--src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp51
-rw-r--r--src/mesa/drivers/dri/i965/brw_vec4_nir.cpp127
-rw-r--r--src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp8
-rw-r--r--src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp191
-rw-r--r--src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp20
-rw-r--r--src/mesa/drivers/dri/i965/brw_vs.c7
-rw-r--r--src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp78
-rw-r--r--src/mesa/drivers/dri/i965/gen6_queryobj.c16
-rw-r--r--src/mesa/drivers/dri/i965/gen7_urb.c2
-rw-r--r--src/mesa/drivers/dri/i965/gen8_surface_state.c16
-rw-r--r--src/mesa/drivers/dri/i965/intel_debug.c8
-rw-r--r--src/mesa/drivers/dri/i965/intel_debug.h2
-rw-r--r--src/mesa/drivers/dri/i965/intel_extensions.c1
-rw-r--r--src/mesa/drivers/dri/i965/intel_mipmap_tree.c32
-rw-r--r--src/mesa/drivers/dri/i965/intel_mipmap_tree.h13
-rw-r--r--src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp30
-rw-r--r--src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp38
-rw-r--r--src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp2
-rw-r--r--src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp4
-rw-r--r--src/mesa/drivers/dri/i965/test_vf_float_conversions.cpp26
-rw-r--r--src/mesa/drivers/dri/r200/r200_tex.h4
-rw-r--r--src/mesa/drivers/dri/radeon/radeon_tex.h2
-rw-r--r--src/mesa/main/blend.c10
-rw-r--r--src/mesa/main/buffers.c2
-rw-r--r--src/mesa/main/enable.c10
-rw-r--r--src/mesa/main/errors.c39
-rw-r--r--src/mesa/main/extensions.c71
-rw-r--r--src/mesa/main/extensions_table.h178
-rw-r--r--src/mesa/main/fog.c2
-rw-r--r--src/mesa/main/get_hash_params.py4
-rw-r--r--src/mesa/main/getstring.c17
-rw-r--r--src/mesa/main/mtypes.h2
-rw-r--r--src/mesa/main/objectlabel.c46
-rw-r--r--src/mesa/main/points.c2
-rw-r--r--src/mesa/main/shader_query.cpp23
-rw-r--r--src/mesa/main/tests/Makefile.am1
-rw-r--r--src/mesa/main/tests/dispatch_sanity.cpp8
-rw-r--r--src/mesa/main/tests/mesa_extensions.cpp51
-rw-r--r--src/mesa/main/texenv.c2
-rw-r--r--src/mesa/main/teximage.c43
-rw-r--r--src/mesa/program/ir_to_mesa.cpp2
-rw-r--r--src/mesa/state_tracker/st_cb_perfmon.c258
-rw-r--r--src/mesa/state_tracker/st_cb_perfmon.h32
-rw-r--r--src/mesa/state_tracker/st_context.h3
-rw-r--r--src/mesa/state_tracker/st_extensions.c32
-rw-r--r--src/mesa/state_tracker/st_format.c231
-rw-r--r--src/mesa/state_tracker/st_glsl_to_tgsi.cpp2
-rw-r--r--src/vulkan/anv_pipeline.c17
319 files changed, 13179 insertions, 6167 deletions
diff --git a/src/egl/Makefile.am b/src/egl/Makefile.am
index 5c2ba301ffb..6953d44e607 100644
--- a/src/egl/Makefile.am
+++ b/src/egl/Makefile.am
@@ -47,12 +47,21 @@ libEGL_la_LDFLAGS = \
$(LD_NO_UNDEFINED)
dri2_backend_FILES =
+dri3_backend_FILES =
if HAVE_EGL_PLATFORM_X11
AM_CFLAGS += -DHAVE_X11_PLATFORM
AM_CFLAGS += $(XCB_DRI2_CFLAGS)
libEGL_la_LIBADD += $(XCB_DRI2_LIBS)
dri2_backend_FILES += drivers/dri2/platform_x11.c
+
+if HAVE_DRI3
+dri3_backend_FILES += \
+ drivers/dri2/platform_x11_dri3.c \
+ drivers/dri2/platform_x11_dri3.h
+
+libEGL_la_LIBADD += $(top_builddir)/src/loader/libloader_dri3_helper.la
+endif
endif
if HAVE_EGL_PLATFORM_WAYLAND
@@ -88,7 +97,8 @@ AM_CFLAGS += \
libEGL_la_SOURCES += \
$(dri2_backend_core_FILES) \
- $(dri2_backend_FILES)
+ $(dri2_backend_FILES) \
+ $(dri3_backend_FILES)
libEGL_la_LIBADD += $(top_builddir)/src/loader/libloader.la
libEGL_la_LIBADD += $(DLOPEN_LIBS) $(LIBDRM_LIBS)
@@ -111,7 +121,10 @@ egl_HEADERS = \
$(top_srcdir)/include/EGL/eglmesaext.h \
$(top_srcdir)/include/EGL/eglplatform.h
+TESTS = egl-symbols-check
+
EXTRA_DIST = \
+ egl-symbols-check \
SConscript \
drivers/haiku \
docs \
diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index 4cc5f231333..d34b16119e2 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -352,6 +352,12 @@ struct dri2_extension_match {
int offset;
};
+static struct dri2_extension_match dri3_driver_extensions[] = {
+ { __DRI_CORE, 1, offsetof(struct dri2_egl_display, core) },
+ { __DRI_IMAGE_DRIVER, 1, offsetof(struct dri2_egl_display, image_driver) },
+ { NULL, 0, 0 }
+};
+
static struct dri2_extension_match dri2_driver_extensions[] = {
{ __DRI_CORE, 1, offsetof(struct dri2_egl_display, core) },
{ __DRI_DRI2, 2, offsetof(struct dri2_egl_display, dri2) },
@@ -385,13 +391,13 @@ dri2_bind_extensions(struct dri2_egl_display *dri2_dpy,
void *field;
for (i = 0; extensions[i]; i++) {
- _eglLog(_EGL_DEBUG, "DRI2: found extension `%s'", extensions[i]->name);
+ _eglLog(_EGL_DEBUG, "found extension `%s'", extensions[i]->name);
for (j = 0; matches[j].name; j++) {
if (strcmp(extensions[i]->name, matches[j].name) == 0 &&
extensions[i]->version >= matches[j].version) {
field = ((char *) dri2_dpy + matches[j].offset);
*(const __DRIextension **) field = extensions[i];
- _eglLog(_EGL_INFO, "DRI2: found extension %s version %d",
+ _eglLog(_EGL_INFO, "found extension %s version %d",
extensions[i]->name, extensions[i]->version);
}
}
@@ -400,7 +406,7 @@ dri2_bind_extensions(struct dri2_egl_display *dri2_dpy,
for (j = 0; matches[j].name; j++) {
field = ((char *) dri2_dpy + matches[j].offset);
if (*(const __DRIextension **) field == NULL) {
- _eglLog(_EGL_WARNING, "DRI2: did not find extension %s version %d",
+ _eglLog(_EGL_WARNING, "did not find extension %s version %d",
matches[j].name, matches[j].version);
ret = EGL_FALSE;
}
@@ -494,6 +500,25 @@ dri2_open_driver(_EGLDisplay *disp)
}
EGLBoolean
+dri2_load_driver_dri3(_EGLDisplay *disp)
+{
+ struct dri2_egl_display *dri2_dpy = disp->DriverData;
+ const __DRIextension **extensions;
+
+ extensions = dri2_open_driver(disp);
+ if (!extensions)
+ return EGL_FALSE;
+
+ if (!dri2_bind_extensions(dri2_dpy, dri3_driver_extensions, extensions)) {
+ dlclose(dri2_dpy->driver);
+ return EGL_FALSE;
+ }
+ dri2_dpy->driver_extensions = extensions;
+
+ return EGL_TRUE;
+}
+
+EGLBoolean
dri2_load_driver(_EGLDisplay *disp)
{
struct dri2_egl_display *dri2_dpy = disp->DriverData;
@@ -550,7 +575,9 @@ dri2_setup_screen(_EGLDisplay *disp)
struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
unsigned int api_mask;
- if (dri2_dpy->dri2) {
+ if (dri2_dpy->image_driver) {
+ api_mask = dri2_dpy->image_driver->getAPIMask(dri2_dpy->dri_screen);
+ } else if (dri2_dpy->dri2) {
api_mask = dri2_dpy->dri2->getAPIMask(dri2_dpy->dri_screen);
} else {
assert(dri2_dpy->swrast);
@@ -570,7 +597,7 @@ dri2_setup_screen(_EGLDisplay *disp)
if (api_mask & (1 << __DRI_API_GLES3))
disp->ClientAPIs |= EGL_OPENGL_ES3_BIT_KHR;
- assert(dri2_dpy->dri2 || dri2_dpy->swrast);
+ assert(dri2_dpy->image_driver || dri2_dpy->dri2 || dri2_dpy->swrast);
disp->Extensions.KHR_surfaceless_context = EGL_TRUE;
disp->Extensions.MESA_configless_context = EGL_TRUE;
@@ -578,7 +605,8 @@ dri2_setup_screen(_EGLDisplay *disp)
__DRI2_RENDERER_HAS_FRAMEBUFFER_SRGB))
disp->Extensions.KHR_gl_colorspace = EGL_TRUE;
- if ((dri2_dpy->dri2 && dri2_dpy->dri2->base.version >= 3) ||
+ if (dri2_dpy->image_driver ||
+ (dri2_dpy->dri2 && dri2_dpy->dri2->base.version >= 3) ||
(dri2_dpy->swrast && dri2_dpy->swrast->base.version >= 3)) {
disp->Extensions.KHR_create_context = EGL_TRUE;
@@ -641,7 +669,14 @@ dri2_create_screen(_EGLDisplay *disp)
dri2_dpy = disp->DriverData;
- if (dri2_dpy->dri2) {
+ if (dri2_dpy->image_driver) {
+ dri2_dpy->dri_screen =
+ dri2_dpy->image_driver->createNewScreen2(0, dri2_dpy->fd,
+ dri2_dpy->extensions,
+ dri2_dpy->driver_extensions,
+ &dri2_dpy->driver_configs,
+ disp);
+ } else if (dri2_dpy->dri2) {
if (dri2_dpy->dri2->base.version >= 4) {
dri2_dpy->dri_screen =
dri2_dpy->dri2->createNewScreen2(0, dri2_dpy->fd,
@@ -677,7 +712,7 @@ dri2_create_screen(_EGLDisplay *disp)
extensions = dri2_dpy->core->getExtensions(dri2_dpy->dri_screen);
- if (dri2_dpy->dri2) {
+ if (dri2_dpy->image_driver || dri2_dpy->dri2) {
if (!dri2_bind_extensions(dri2_dpy, dri2_core_extensions, extensions))
goto cleanup_dri_screen;
} else {
@@ -1024,7 +1059,26 @@ dri2_create_context(_EGLDriver *drv, _EGLDisplay *disp, _EGLConfig *conf,
else
dri_config = NULL;
- if (dri2_dpy->dri2) {
+ if (dri2_dpy->image_driver) {
+ unsigned error;
+ unsigned num_attribs = 8;
+ uint32_t ctx_attribs[8];
+
+ if (!dri2_fill_context_attribs(dri2_ctx, dri2_dpy, ctx_attribs,
+ &num_attribs))
+ goto cleanup;
+
+ dri2_ctx->dri_context =
+ dri2_dpy->image_driver->createContextAttribs(dri2_dpy->dri_screen,
+ api,
+ dri_config,
+ shared,
+ num_attribs / 2,
+ ctx_attribs,
+ & error,
+ dri2_ctx);
+ dri2_create_context_attribs_error(error);
+ } else if (dri2_dpy->dri2) {
if (dri2_dpy->dri2->base.version >= 3) {
unsigned error;
unsigned num_attribs = 8;
@@ -1119,11 +1173,10 @@ dri2_make_current(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *dsurf,
{
struct dri2_egl_driver *dri2_drv = dri2_egl_driver(drv);
struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
- struct dri2_egl_surface *dri2_dsurf = dri2_egl_surface(dsurf);
- struct dri2_egl_surface *dri2_rsurf = dri2_egl_surface(rsurf);
struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
_EGLContext *old_ctx;
_EGLSurface *old_dsurf, *old_rsurf;
+ _EGLSurface *tmp_dsurf, *tmp_rsurf;
__DRIdrawable *ddraw, *rdraw;
__DRIcontext *cctx;
@@ -1135,8 +1188,8 @@ dri2_make_current(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *dsurf,
if (old_ctx && dri2_drv->glFlush)
dri2_drv->glFlush();
- ddraw = (dri2_dsurf) ? dri2_dsurf->dri_drawable : NULL;
- rdraw = (dri2_rsurf) ? dri2_rsurf->dri_drawable : NULL;
+ ddraw = (dsurf) ? dri2_dpy->vtbl->get_dri_drawable(dsurf) : NULL;
+ rdraw = (rsurf) ? dri2_dpy->vtbl->get_dri_drawable(rsurf) : NULL;
cctx = (dri2_ctx) ? dri2_ctx->dri_context : NULL;
if (old_ctx) {
@@ -1156,10 +1209,10 @@ dri2_make_current(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *dsurf,
return EGL_TRUE;
} else {
/* undo the previous _eglBindContext */
- _eglBindContext(old_ctx, old_dsurf, old_rsurf, &ctx, &dsurf, &rsurf);
+ _eglBindContext(old_ctx, old_dsurf, old_rsurf, &ctx, &tmp_dsurf, &tmp_rsurf);
assert(&dri2_ctx->base == ctx &&
- &dri2_dsurf->base == dsurf &&
- &dri2_rsurf->base == rsurf);
+ tmp_dsurf == dsurf &&
+ tmp_rsurf == rsurf);
_eglPutSurface(dsurf);
_eglPutSurface(rsurf);
@@ -1173,6 +1226,14 @@ dri2_make_current(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *dsurf,
}
}
+__DRIdrawable *
+dri2_surface_get_dri_drawable(_EGLSurface *surf)
+{
+ struct dri2_egl_surface *dri2_surf = dri2_egl_surface(surf);
+
+ return dri2_surf->dri_drawable;
+}
+
/*
* Called from eglGetProcAddress() via drv->API.GetProcAddress().
*/
@@ -1235,7 +1296,7 @@ void
dri2_flush_drawable_for_swapbuffers(_EGLDisplay *disp, _EGLSurface *draw)
{
struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
- struct dri2_egl_surface *dri2_surf = dri2_egl_surface(draw);
+ __DRIdrawable *dri_drawable = dri2_dpy->vtbl->get_dri_drawable(draw);
if (dri2_dpy->flush) {
if (dri2_dpy->flush->base.version >= 4) {
@@ -1253,12 +1314,12 @@ dri2_flush_drawable_for_swapbuffers(_EGLDisplay *disp, _EGLSurface *draw)
* after calling eglSwapBuffers."
*/
dri2_dpy->flush->flush_with_flags(dri2_ctx->dri_context,
- dri2_surf->dri_drawable,
+ dri_drawable,
__DRI2_FLUSH_DRAWABLE |
__DRI2_FLUSH_INVALIDATE_ANCILLARY,
__DRI2_THROTTLE_SWAPBUFFER);
} else {
- dri2_dpy->flush->flush(dri2_surf->dri_drawable);
+ dri2_dpy->flush->flush(dri_drawable);
}
}
}
@@ -1315,7 +1376,8 @@ static EGLBoolean
dri2_wait_client(_EGLDriver *drv, _EGLDisplay *disp, _EGLContext *ctx)
{
struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
- struct dri2_egl_surface *dri2_surf = dri2_egl_surface(ctx->DrawSurface);
+ _EGLSurface *surf = ctx->DrawSurface;
+ __DRIdrawable *dri_drawable = dri2_dpy->vtbl->get_dri_drawable(surf);
(void) drv;
@@ -1323,7 +1385,7 @@ dri2_wait_client(_EGLDriver *drv, _EGLDisplay *disp, _EGLContext *ctx)
* we need to copy fake to real here.*/
if (dri2_dpy->flush != NULL)
- dri2_dpy->flush->flush(dri2_surf->dri_drawable);
+ dri2_dpy->flush->flush(dri_drawable);
return EGL_TRUE;
}
@@ -1346,10 +1408,10 @@ dri2_bind_tex_image(_EGLDriver *drv,
_EGLDisplay *disp, _EGLSurface *surf, EGLint buffer)
{
struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
- struct dri2_egl_surface *dri2_surf = dri2_egl_surface(surf);
struct dri2_egl_context *dri2_ctx;
_EGLContext *ctx;
GLint format, target;
+ __DRIdrawable *dri_drawable = dri2_dpy->vtbl->get_dri_drawable(surf);
ctx = _eglGetCurrentContext();
dri2_ctx = dri2_egl_context(ctx);
@@ -1357,7 +1419,7 @@ dri2_bind_tex_image(_EGLDriver *drv,
if (!_eglBindTexImage(drv, disp, surf, buffer))
return EGL_FALSE;
- switch (dri2_surf->base.TextureFormat) {
+ switch (surf->TextureFormat) {
case EGL_TEXTURE_RGB:
format = __DRI_TEXTURE_FORMAT_RGB;
break;
@@ -1369,7 +1431,7 @@ dri2_bind_tex_image(_EGLDriver *drv,
format = __DRI_TEXTURE_FORMAT_RGBA;
}
- switch (dri2_surf->base.TextureTarget) {
+ switch (surf->TextureTarget) {
case EGL_TEXTURE_2D:
target = GL_TEXTURE_2D;
break;
@@ -1380,7 +1442,7 @@ dri2_bind_tex_image(_EGLDriver *drv,
(*dri2_dpy->tex_buffer->setTexBuffer2)(dri2_ctx->dri_context,
target, format,
- dri2_surf->dri_drawable);
+ dri_drawable);
return EGL_TRUE;
}
@@ -1390,10 +1452,10 @@ dri2_release_tex_image(_EGLDriver *drv,
_EGLDisplay *disp, _EGLSurface *surf, EGLint buffer)
{
struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
- struct dri2_egl_surface *dri2_surf = dri2_egl_surface(surf);
struct dri2_egl_context *dri2_ctx;
_EGLContext *ctx;
GLint target;
+ __DRIdrawable *dri_drawable = dri2_dpy->vtbl->get_dri_drawable(surf);
ctx = _eglGetCurrentContext();
dri2_ctx = dri2_egl_context(ctx);
@@ -1401,7 +1463,7 @@ dri2_release_tex_image(_EGLDriver *drv,
if (!_eglReleaseTexImage(drv, disp, surf, buffer))
return EGL_FALSE;
- switch (dri2_surf->base.TextureTarget) {
+ switch (surf->TextureTarget) {
case EGL_TEXTURE_2D:
target = GL_TEXTURE_2D;
break;
@@ -1413,7 +1475,7 @@ dri2_release_tex_image(_EGLDriver *drv,
dri2_dpy->tex_buffer->releaseTexBuffer != NULL) {
(*dri2_dpy->tex_buffer->releaseTexBuffer)(dri2_ctx->dri_context,
target,
- dri2_surf->dri_drawable);
+ dri_drawable);
}
return EGL_TRUE;
diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h
index 0e837b3eb8b..52ad92b182d 100644
--- a/src/egl/drivers/dri2/egl_dri2.h
+++ b/src/egl/drivers/dri2/egl_dri2.h
@@ -35,6 +35,10 @@
#include <xcb/dri2.h>
#include <xcb/xfixes.h>
#include <X11/Xlib-xcb.h>
+
+#ifdef HAVE_DRI3
+#include "loader_dri3_helper.h"
+#endif
#endif
#ifdef HAVE_WAYLAND_PLATFORM
@@ -145,6 +149,8 @@ struct dri2_egl_display_vtbl {
EGLBoolean (*get_sync_values)(_EGLDisplay *display, _EGLSurface *surface,
EGLuint64KHR *ust, EGLuint64KHR *msc,
EGLuint64KHR *sbc);
+
+ __DRIdrawable *(*get_dri_drawable)(_EGLSurface *surf);
};
struct dri2_egl_display
@@ -158,6 +164,7 @@ struct dri2_egl_display
const __DRIconfig **driver_configs;
void *driver;
const __DRIcoreExtension *core;
+ const __DRIimageDriverExtension *image_driver;
const __DRIdri2Extension *dri2;
const __DRIswrastExtension *swrast;
const __DRI2flushExtension *flush;
@@ -190,6 +197,9 @@ struct dri2_egl_display
#ifdef HAVE_X11_PLATFORM
xcb_connection_t *conn;
int screen;
+#ifdef HAVE_DRI3
+ struct loader_dri3_extensions loader_dri3_ext;
+#endif
#endif
#ifdef HAVE_WAYLAND_PLATFORM
@@ -203,8 +213,9 @@ struct dri2_egl_display
int formats;
uint32_t capabilities;
int is_render_node;
- int is_different_gpu;
#endif
+
+ int is_different_gpu;
};
struct dri2_egl_context
@@ -325,8 +336,14 @@ EGLBoolean
dri2_load_driver_swrast(_EGLDisplay *disp);
EGLBoolean
+dri2_load_driver_dri3(_EGLDisplay *disp);
+
+EGLBoolean
dri2_create_screen(_EGLDisplay *disp);
+__DRIdrawable *
+dri2_surface_get_dri_drawable(_EGLSurface *surf);
+
__DRIimage *
dri2_lookup_egl_image(__DRIscreen *screen, void *image, void *data);
diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c
index 4abe82f63a0..8f3abcb9867 100644
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -650,6 +650,7 @@ static struct dri2_egl_display_vtbl droid_display_vtbl = {
.query_buffer_age = dri2_fallback_query_buffer_age,
.create_wayland_buffer_from_image = dri2_fallback_create_wayland_buffer_from_image,
.get_sync_values = dri2_fallback_get_sync_values,
+ .get_dri_drawable = dri2_surface_get_dri_drawable,
};
EGLBoolean
diff --git a/src/egl/drivers/dri2/platform_drm.c b/src/egl/drivers/dri2/platform_drm.c
index 815d2674cb2..3f4f7e78190 100644
--- a/src/egl/drivers/dri2/platform_drm.c
+++ b/src/egl/drivers/dri2/platform_drm.c
@@ -594,6 +594,7 @@ static struct dri2_egl_display_vtbl dri2_drm_display_vtbl = {
.query_buffer_age = dri2_drm_query_buffer_age,
.create_wayland_buffer_from_image = dri2_fallback_create_wayland_buffer_from_image,
.get_sync_values = dri2_fallback_get_sync_values,
+ .get_dri_drawable = dri2_surface_get_dri_drawable,
};
EGLBoolean
diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c
index a635c758da1..c2438f7509b 100644
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -1025,6 +1025,7 @@ static struct dri2_egl_display_vtbl dri2_wl_display_vtbl = {
.query_buffer_age = dri2_wl_query_buffer_age,
.create_wayland_buffer_from_image = dri2_wl_create_wayland_buffer_from_image,
.get_sync_values = dri2_fallback_get_sync_values,
+ .get_dri_drawable = dri2_surface_get_dri_drawable,
};
static EGLBoolean
@@ -1752,6 +1753,7 @@ static struct dri2_egl_display_vtbl dri2_wl_swrast_display_vtbl = {
.query_buffer_age = dri2_fallback_query_buffer_age,
.create_wayland_buffer_from_image = dri2_fallback_create_wayland_buffer_from_image,
.get_sync_values = dri2_fallback_get_sync_values,
+ .get_dri_drawable = dri2_surface_get_dri_drawable,
};
static EGLBoolean
diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index 88a06a8c6a8..08cbf2d8393 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -45,6 +45,10 @@
#include "egl_dri2_fallbacks.h"
#include "loader.h"
+#ifdef HAVE_DRI3
+#include "platform_x11_dri3.h"
+#endif
+
static EGLBoolean
dri2_x11_swap_interval(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf,
EGLint interval);
@@ -703,7 +707,7 @@ dri2_x11_local_authenticate(_EGLDisplay *disp)
static EGLBoolean
dri2_x11_add_configs_for_visuals(struct dri2_egl_display *dri2_dpy,
- _EGLDisplay *disp)
+ _EGLDisplay *disp, bool supports_preserved)
{
xcb_screen_iterator_t s;
xcb_depth_iterator_t d;
@@ -724,8 +728,10 @@ dri2_x11_add_configs_for_visuals(struct dri2_egl_display *dri2_dpy,
surface_type =
EGL_WINDOW_BIT |
EGL_PIXMAP_BIT |
- EGL_PBUFFER_BIT |
- EGL_SWAP_BEHAVIOR_PRESERVED_BIT;
+ EGL_PBUFFER_BIT;
+
+ if (supports_preserved)
+ surface_type |= EGL_SWAP_BEHAVIOR_PRESERVED_BIT;
while (d.rem > 0) {
EGLBoolean class_added[6] = { 0, };
@@ -1112,6 +1118,7 @@ static struct dri2_egl_display_vtbl dri2_x11_swrast_display_vtbl = {
.query_buffer_age = dri2_fallback_query_buffer_age,
.create_wayland_buffer_from_image = dri2_fallback_create_wayland_buffer_from_image,
.get_sync_values = dri2_fallback_get_sync_values,
+ .get_dri_drawable = dri2_surface_get_dri_drawable,
};
static struct dri2_egl_display_vtbl dri2_x11_display_vtbl = {
@@ -1130,6 +1137,7 @@ static struct dri2_egl_display_vtbl dri2_x11_display_vtbl = {
.query_buffer_age = dri2_fallback_query_buffer_age,
.create_wayland_buffer_from_image = dri2_fallback_create_wayland_buffer_from_image,
.get_sync_values = dri2_x11_get_sync_values,
+ .get_dri_drawable = dri2_surface_get_dri_drawable,
};
static EGLBoolean
@@ -1179,7 +1187,7 @@ dri2_initialize_x11_swrast(_EGLDriver *drv, _EGLDisplay *disp)
if (!dri2_create_screen(disp))
goto cleanup_driver;
- if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp))
+ if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp, true))
goto cleanup_configs;
/* Fill vtbl last to prevent accidentally calling virtual function during
@@ -1250,6 +1258,100 @@ dri2_x11_setup_swap_interval(struct dri2_egl_display *dri2_dpy)
}
}
+#ifdef HAVE_DRI3
+static EGLBoolean
+dri2_initialize_x11_dri3(_EGLDriver *drv, _EGLDisplay *disp)
+{
+ struct dri2_egl_display *dri2_dpy;
+
+ dri2_dpy = calloc(1, sizeof *dri2_dpy);
+ if (!dri2_dpy)
+ return _eglError(EGL_BAD_ALLOC, "eglInitialize");
+
+ disp->DriverData = (void *) dri2_dpy;
+ if (disp->PlatformDisplay == NULL) {
+ dri2_dpy->conn = xcb_connect(0, &dri2_dpy->screen);
+ dri2_dpy->own_device = true;
+ } else {
+ Display *dpy = disp->PlatformDisplay;
+
+ dri2_dpy->conn = XGetXCBConnection(dpy);
+ dri2_dpy->screen = DefaultScreen(dpy);
+ }
+
+ if (xcb_connection_has_error(dri2_dpy->conn)) {
+ _eglLog(_EGL_WARNING, "DRI3: xcb_connect failed");
+ goto cleanup_dpy;
+ }
+
+ if (dri2_dpy->conn) {
+ if (!dri3_x11_connect(dri2_dpy))
+ goto cleanup_conn;
+ }
+
+ if (!dri2_load_driver_dri3(disp))
+ goto cleanup_conn;
+
+ dri2_dpy->extensions[0] = &dri3_image_loader_extension.base;
+ dri2_dpy->extensions[1] = &use_invalidate.base;
+ dri2_dpy->extensions[2] = &image_lookup_extension.base;
+ dri2_dpy->extensions[3] = NULL;
+
+ dri2_dpy->swap_available = true;
+ dri2_dpy->invalidate_available = true;
+
+ if (!dri2_create_screen(disp))
+ goto cleanup_fd;
+
+ dri2_x11_setup_swap_interval(dri2_dpy);
+
+ if (!dri2_dpy->is_different_gpu)
+ disp->Extensions.KHR_image_pixmap = EGL_TRUE;
+ disp->Extensions.NOK_texture_from_pixmap = EGL_TRUE;
+ disp->Extensions.CHROMIUM_sync_control = EGL_TRUE;
+ disp->Extensions.EXT_buffer_age = EGL_TRUE;
+
+#ifdef HAVE_WAYLAND_PLATFORM
+ disp->Extensions.WL_bind_wayland_display = EGL_TRUE;
+#endif
+
+ if (dri2_dpy->conn) {
+ if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp, false))
+ goto cleanup_configs;
+ }
+
+ dri2_dpy->loader_dri3_ext.core = dri2_dpy->core;
+ dri2_dpy->loader_dri3_ext.image_driver = dri2_dpy->image_driver;
+ dri2_dpy->loader_dri3_ext.flush = dri2_dpy->flush;
+ dri2_dpy->loader_dri3_ext.tex_buffer = dri2_dpy->tex_buffer;
+ dri2_dpy->loader_dri3_ext.image = dri2_dpy->image;
+ dri2_dpy->loader_dri3_ext.config = dri2_dpy->config;
+
+ /* Fill vtbl last to prevent accidentally calling virtual function during
+ * initialization.
+ */
+ dri2_dpy->vtbl = &dri3_x11_display_vtbl;
+
+ _eglLog(_EGL_INFO, "Using DRI3");
+
+ return EGL_TRUE;
+
+ cleanup_configs:
+ _eglCleanupDisplay(disp);
+ dri2_dpy->core->destroyScreen(dri2_dpy->dri_screen);
+ dlclose(dri2_dpy->driver);
+ cleanup_fd:
+ close(dri2_dpy->fd);
+ cleanup_conn:
+ if (disp->PlatformDisplay == NULL)
+ xcb_disconnect(dri2_dpy->conn);
+ cleanup_dpy:
+ free(dri2_dpy);
+
+ return EGL_FALSE;
+}
+#endif
+
static EGLBoolean
dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
{
@@ -1321,7 +1423,7 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
disp->Extensions.WL_bind_wayland_display = EGL_TRUE;
#endif
- if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp))
+ if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp, true))
goto cleanup_configs;
/* Fill vtbl last to prevent accidentally calling virtual function during
@@ -1329,6 +1431,8 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
*/
dri2_dpy->vtbl = &dri2_x11_display_vtbl;
+ _eglLog(_EGL_INFO, "Using DRI2");
+
return EGL_TRUE;
cleanup_configs:
@@ -1355,9 +1459,16 @@ dri2_initialize_x11(_EGLDriver *drv, _EGLDisplay *disp)
int x11_dri2_accel = (getenv("LIBGL_ALWAYS_SOFTWARE") == NULL);
if (x11_dri2_accel) {
- if (!dri2_initialize_x11_dri2(drv, disp)) {
- initialized = dri2_initialize_x11_swrast(drv, disp);
+#ifdef HAVE_DRI3
+ if (getenv("LIBGL_DRI3_DISABLE") != NULL ||
+ !dri2_initialize_x11_dri3(drv, disp)) {
+#endif
+ if (!dri2_initialize_x11_dri2(drv, disp)) {
+ initialized = dri2_initialize_x11_swrast(drv, disp);
+ }
+#ifdef HAVE_DRI3
}
+#endif
} else {
initialized = dri2_initialize_x11_swrast(drv, disp);
}
diff --git a/src/egl/drivers/dri2/platform_x11_dri3.c b/src/egl/drivers/dri2/platform_x11_dri3.c
new file mode 100644
index 00000000000..8e4a131b11a
--- /dev/null
+++ b/src/egl/drivers/dri2/platform_x11_dri3.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright © 2015 Boyan Ding
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. The copyright holders make no representations
+ * about the suitability of this software for any purpose. It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <xcb/xcb.h>
+#include <xcb/dri3.h>
+#include <xcb/present.h>
+
+#include <xf86drm.h>
+
+#include "egl_dri2.h"
+#include "egl_dri2_fallbacks.h"
+#include "platform_x11_dri3.h"
+
+#include "loader.h"
+#include "loader_dri3_helper.h"
+
+static struct dri3_egl_surface *
+loader_drawable_to_egl_surface(struct loader_dri3_drawable *draw) {
+ size_t offset = offsetof(struct dri3_egl_surface, loader_drawable);
+ return (struct dri3_egl_surface *)(((void*) draw) - offset);
+}
+
+static int
+egl_dri3_get_swap_interval(struct loader_dri3_drawable *draw)
+{
+ struct dri3_egl_surface *dri3_surf = loader_drawable_to_egl_surface(draw);
+
+ return dri3_surf->base.SwapInterval;
+}
+
+static int
+egl_dri3_clamp_swap_interval(struct loader_dri3_drawable *draw, int interval)
+{
+ struct dri3_egl_surface *dri3_surf = loader_drawable_to_egl_surface(draw);
+
+ if (interval > dri3_surf->base.Config->MaxSwapInterval)
+ interval = dri3_surf->base.Config->MaxSwapInterval;
+ else if (interval < dri3_surf->base.Config->MinSwapInterval)
+ interval = dri3_surf->base.Config->MinSwapInterval;
+
+ return interval;
+}
+
+static void
+egl_dri3_set_swap_interval(struct loader_dri3_drawable *draw, int interval)
+{
+ struct dri3_egl_surface *dri3_surf = loader_drawable_to_egl_surface(draw);
+
+ dri3_surf->base.SwapInterval = interval;
+}
+
+static void
+egl_dri3_set_drawable_size(struct loader_dri3_drawable *draw,
+ int width, int height)
+{
+ struct dri3_egl_surface *dri3_surf = loader_drawable_to_egl_surface(draw);
+
+ dri3_surf->base.Width = width;
+ dri3_surf->base.Height = height;
+}
+
+static bool
+egl_dri3_in_current_context(struct loader_dri3_drawable *draw)
+{
+ struct dri3_egl_surface *dri3_surf = loader_drawable_to_egl_surface(draw);
+ _EGLContext *ctx = _eglGetCurrentContext();
+
+ return ctx->Resource.Display == dri3_surf->base.Resource.Display;
+}
+
+static __DRIcontext *
+egl_dri3_get_dri_context(struct loader_dri3_drawable *draw)
+{
+ _EGLContext *ctx = _eglGetCurrentContext();
+ struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+
+ return dri2_ctx->dri_context;
+}
+
+static void
+egl_dri3_flush_drawable(struct loader_dri3_drawable *draw, unsigned flags)
+{
+ struct dri3_egl_surface *dri3_surf = loader_drawable_to_egl_surface(draw);
+ _EGLDisplay *disp = dri3_surf->base.Resource.Display;
+
+ dri2_flush_drawable_for_swapbuffers(disp, &dri3_surf->base);
+}
+
+static struct loader_dri3_vtable egl_dri3_vtable = {
+ .get_swap_interval = egl_dri3_get_swap_interval,
+ .clamp_swap_interval = egl_dri3_clamp_swap_interval,
+ .set_swap_interval = egl_dri3_set_swap_interval,
+ .set_drawable_size = egl_dri3_set_drawable_size,
+ .in_current_context = egl_dri3_in_current_context,
+ .get_dri_context = egl_dri3_get_dri_context,
+ .flush_drawable = egl_dri3_flush_drawable,
+ .show_fps = NULL,
+};
+
+static EGLBoolean
+dri3_destroy_surface(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf)
+{
+ struct dri3_egl_surface *dri3_surf = dri3_egl_surface(surf);
+
+ (void) drv;
+
+ if (!_eglPutSurface(surf))
+ return EGL_TRUE;
+
+ loader_dri3_drawable_fini(&dri3_surf->loader_drawable);
+
+ free(surf);
+
+ return EGL_TRUE;
+}
+
+static EGLBoolean
+dri3_set_swap_interval(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf,
+ EGLint interval)
+{
+ struct dri3_egl_surface *dri3_surf = dri3_egl_surface(surf);
+
+ loader_dri3_set_swap_interval(&dri3_surf->loader_drawable, interval);
+
+ return EGL_TRUE;
+}
+
+static xcb_screen_t *
+get_xcb_screen(xcb_screen_iterator_t iter, int screen)
+{
+ for (; iter.rem; --screen, xcb_screen_next(&iter))
+ if (screen == 0)
+ return iter.data;
+
+ return NULL;
+}
+
+static _EGLSurface *
+dri3_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
+ _EGLConfig *conf, void *native_surface,
+ const EGLint *attrib_list)
+{
+ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+ struct dri2_egl_config *dri2_conf = dri2_egl_config(conf);
+ struct dri3_egl_surface *dri3_surf;
+ const __DRIconfig *dri_config;
+ xcb_drawable_t drawable;
+ xcb_screen_iterator_t s;
+ xcb_screen_t *screen;
+
+ STATIC_ASSERT(sizeof(uintptr_t) == sizeof(native_surface));
+ drawable = (uintptr_t) native_surface;
+
+ (void) drv;
+
+ dri3_surf = calloc(1, sizeof *dri3_surf);
+ if (!dri3_surf) {
+ _eglError(EGL_BAD_ALLOC, "dri3_create_surface");
+ return NULL;
+ }
+
+ if (!_eglInitSurface(&dri3_surf->base, disp, type, conf, attrib_list))
+ goto cleanup_surf;
+
+ if (type == EGL_PBUFFER_BIT) {
+ s = xcb_setup_roots_iterator(xcb_get_setup(dri2_dpy->conn));
+ screen = get_xcb_screen(s, dri2_dpy->screen);
+ if (!screen) {
+ _eglError(EGL_BAD_NATIVE_WINDOW, "dri3_create_surface");
+ goto cleanup_surf;
+ }
+
+ drawable = xcb_generate_id(dri2_dpy->conn);
+ xcb_create_pixmap(dri2_dpy->conn, conf->BufferSize,
+ drawable, screen->root,
+ dri3_surf->base.Width, dri3_surf->base.Height);
+ }
+
+ dri_config = dri2_get_dri_config(dri2_conf, type,
+ dri3_surf->base.GLColorspace);
+
+ if (loader_dri3_drawable_init(dri2_dpy->conn, drawable,
+ dri2_dpy->dri_screen,
+ dri2_dpy->is_different_gpu, dri_config,
+ &dri2_dpy->loader_dri3_ext,
+ &egl_dri3_vtable,
+ &dri3_surf->loader_drawable)) {
+ _eglError(EGL_BAD_ALLOC, "dri3_surface_create");
+ goto cleanup_pixmap;
+ }
+
+ return &dri3_surf->base;
+
+ cleanup_pixmap:
+ if (type == EGL_PBUFFER_BIT)
+ xcb_free_pixmap(dri2_dpy->conn, drawable);
+ cleanup_surf:
+ free(dri3_surf);
+
+ return NULL;
+}
+
+/**
+ * Called via eglCreateWindowSurface(), drv->API.CreateWindowSurface().
+ */
+static _EGLSurface *
+dri3_create_window_surface(_EGLDriver *drv, _EGLDisplay *disp,
+ _EGLConfig *conf, void *native_window,
+ const EGLint *attrib_list)
+{
+ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+ _EGLSurface *surf;
+
+ surf = dri3_create_surface(drv, disp, EGL_WINDOW_BIT, conf,
+ native_window, attrib_list);
+ if (surf != NULL)
+ dri3_set_swap_interval(drv, disp, surf, dri2_dpy->default_swap_interval);
+
+ return surf;
+}
+
+static _EGLSurface *
+dri3_create_pixmap_surface(_EGLDriver *drv, _EGLDisplay *disp,
+ _EGLConfig *conf, void *native_pixmap,
+ const EGLint *attrib_list)
+{
+ return dri3_create_surface(drv, disp, EGL_PIXMAP_BIT, conf,
+ native_pixmap, attrib_list);
+}
+
+static _EGLSurface *
+dri3_create_pbuffer_surface(_EGLDriver *drv, _EGLDisplay *disp,
+ _EGLConfig *conf, const EGLint *attrib_list)
+{
+ return dri3_create_surface(drv, disp, EGL_PBUFFER_BIT, conf,
+ XCB_WINDOW_NONE, attrib_list);
+}
+
+static EGLBoolean
+dri3_get_sync_values(_EGLDisplay *display, _EGLSurface *surface,
+ EGLuint64KHR *ust, EGLuint64KHR *msc,
+ EGLuint64KHR *sbc)
+{
+ struct dri3_egl_surface *dri3_surf = dri3_egl_surface(surface);
+
+ return loader_dri3_wait_for_msc(&dri3_surf->loader_drawable, 0, 0, 0,
+ (int64_t *) ust, (int64_t *) msc,
+ (int64_t *) sbc) ? EGL_TRUE : EGL_FALSE;
+}
+
+static _EGLImage *
+dri3_create_image_khr_pixmap(_EGLDisplay *disp, _EGLContext *ctx,
+ EGLClientBuffer buffer, const EGLint *attr_list)
+{
+ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+ struct dri2_egl_image *dri2_img;
+ xcb_drawable_t drawable;
+ xcb_dri3_buffer_from_pixmap_cookie_t bp_cookie;
+ xcb_dri3_buffer_from_pixmap_reply_t *bp_reply;
+ unsigned int format;
+
+ drawable = (xcb_drawable_t) (uintptr_t) buffer;
+ bp_cookie = xcb_dri3_buffer_from_pixmap(dri2_dpy->conn, drawable);
+ bp_reply = xcb_dri3_buffer_from_pixmap_reply(dri2_dpy->conn,
+ bp_cookie, NULL);
+ if (!bp_reply) {
+ _eglError(EGL_BAD_ALLOC, "xcb_dri3_buffer_from_pixmap");
+ return NULL;
+ }
+
+ switch (bp_reply->depth) {
+ case 16:
+ format = __DRI_IMAGE_FORMAT_RGB565;
+ break;
+ case 24:
+ format = __DRI_IMAGE_FORMAT_XRGB8888;
+ break;
+ case 32:
+ format = __DRI_IMAGE_FORMAT_ARGB8888;
+ break;
+ default:
+ _eglError(EGL_BAD_PARAMETER,
+ "dri3_create_image_khr: unsupported pixmap depth");
+ free(bp_reply);
+ return EGL_NO_IMAGE_KHR;
+ }
+
+ dri2_img = malloc(sizeof *dri2_img);
+ if (!dri2_img) {
+ _eglError(EGL_BAD_ALLOC, "dri3_create_image_khr");
+ return EGL_NO_IMAGE_KHR;
+ }
+
+ if (!_eglInitImage(&dri2_img->base, disp)) {
+ free(dri2_img);
+ return EGL_NO_IMAGE_KHR;
+ }
+
+ dri2_img->dri_image = loader_dri3_create_image(dri2_dpy->conn,
+ bp_reply,
+ format,
+ dri2_dpy->dri_screen,
+ dri2_dpy->image,
+ dri2_img);
+
+ free(bp_reply);
+
+ return &dri2_img->base;
+}
+
+static _EGLImage *
+dri3_create_image_khr(_EGLDriver *drv, _EGLDisplay *disp,
+ _EGLContext *ctx, EGLenum target,
+ EGLClientBuffer buffer, const EGLint *attr_list)
+{
+ (void) drv;
+
+ switch (target) {
+ case EGL_NATIVE_PIXMAP_KHR:
+ return dri3_create_image_khr_pixmap(disp, ctx, buffer, attr_list);
+ default:
+ return dri2_create_image_khr(drv, disp, ctx, target, buffer, attr_list);
+ }
+}
+
+/**
+ * Called by the driver when it needs to update the real front buffer with the
+ * contents of its fake front buffer.
+ */
+static void
+dri3_flush_front_buffer(__DRIdrawable *driDrawable, void *loaderPrivate)
+{
+ /* There does not seem to be any kind of consensus on whether we should
+ * support front-buffer rendering or not:
+ * http://lists.freedesktop.org/archives/mesa-dev/2013-June/040129.html
+ */
+ _eglLog(_EGL_WARNING, "FIXME: egl/x11 doesn't support front buffer rendering.");
+ (void) driDrawable;
+ (void) loaderPrivate;
+}
+
+const __DRIimageLoaderExtension dri3_image_loader_extension = {
+ .base = { __DRI_IMAGE_LOADER, 1 },
+
+ .getBuffers = loader_dri3_get_buffers,
+ .flushFrontBuffer = dri3_flush_front_buffer,
+};
+
+static EGLBoolean
+dri3_swap_buffers(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *draw)
+{
+ struct dri3_egl_surface *dri3_surf = dri3_egl_surface(draw);
+
+ /* No-op for a pixmap or pbuffer surface */
+ if (draw->Type == EGL_PIXMAP_BIT || draw->Type == EGL_PBUFFER_BIT)
+ return 0;
+
+ return loader_dri3_swap_buffers_msc(&dri3_surf->loader_drawable,
+ 0, 0, 0, 0,
+ draw->SwapBehavior == EGL_BUFFER_PRESERVED) != -1;
+}
+
+static EGLBoolean
+dri3_copy_buffers(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf,
+ void *native_pixmap_target)
+{
+ struct dri3_egl_surface *dri3_surf = dri3_egl_surface(surf);
+ xcb_pixmap_t target;
+
+ STATIC_ASSERT(sizeof(uintptr_t) == sizeof(native_pixmap_target));
+ target = (uintptr_t) native_pixmap_target;
+
+ loader_dri3_copy_drawable(&dri3_surf->loader_drawable, target,
+ dri3_surf->loader_drawable.drawable);
+
+ return EGL_TRUE;
+}
+
+static int
+dri3_query_buffer_age(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surf)
+{
+ struct dri3_egl_surface *dri3_surf = dri3_egl_surface(surf);
+
+ return loader_dri3_query_buffer_age(&dri3_surf->loader_drawable);
+}
+
+static __DRIdrawable *
+dri3_get_dri_drawable(_EGLSurface *surf)
+{
+ struct dri3_egl_surface *dri3_surf = dri3_egl_surface(surf);
+
+ return dri3_surf->loader_drawable.dri_drawable;
+}
+
+struct dri2_egl_display_vtbl dri3_x11_display_vtbl = {
+ .authenticate = NULL,
+ .create_window_surface = dri3_create_window_surface,
+ .create_pixmap_surface = dri3_create_pixmap_surface,
+ .create_pbuffer_surface = dri3_create_pbuffer_surface,
+ .destroy_surface = dri3_destroy_surface,
+ .create_image = dri3_create_image_khr,
+ .swap_interval = dri3_set_swap_interval,
+ .swap_buffers = dri3_swap_buffers,
+ .swap_buffers_with_damage = dri2_fallback_swap_buffers_with_damage,
+ .swap_buffers_region = dri2_fallback_swap_buffers_region,
+ .post_sub_buffer = dri2_fallback_post_sub_buffer,
+ .copy_buffers = dri3_copy_buffers,
+ .query_buffer_age = dri3_query_buffer_age,
+ .create_wayland_buffer_from_image = dri2_fallback_create_wayland_buffer_from_image,
+ .get_sync_values = dri3_get_sync_values,
+ .get_dri_drawable = dri3_get_dri_drawable,
+};
+
+static char *
+dri3_get_device_name(int fd)
+{
+ char *ret = NULL;
+
+ ret = drmGetRenderDeviceNameFromFd(fd);
+ if (ret)
+ return ret;
+
+ /* For dri3, render node support is required for WL_bind_wayland_display.
+ * In order not to regress on older systems without kernel or libdrm
+ * support, fall back to dri2. User can override it with environment
+ * variable if they don't need to use that extension.
+ */
+ if (getenv("EGL_FORCE_DRI3") == NULL) {
+ _eglLog(_EGL_WARNING, "Render node support not available, falling back to dri2");
+ _eglLog(_EGL_WARNING, "If you want to force dri3, set EGL_FORCE_DRI3 environment variable");
+ } else
+ ret = loader_get_device_name_for_fd(fd);
+
+ return ret;
+}
+
+EGLBoolean
+dri3_x11_connect(struct dri2_egl_display *dri2_dpy)
+{
+ xcb_dri3_query_version_reply_t *dri3_query;
+ xcb_dri3_query_version_cookie_t dri3_query_cookie;
+ xcb_present_query_version_reply_t *present_query;
+ xcb_present_query_version_cookie_t present_query_cookie;
+ xcb_generic_error_t *error;
+ xcb_screen_iterator_t s;
+ xcb_screen_t *screen;
+ const xcb_query_extension_reply_t *extension;
+
+ xcb_prefetch_extension_data (dri2_dpy->conn, &xcb_dri3_id);
+ xcb_prefetch_extension_data (dri2_dpy->conn, &xcb_present_id);
+
+ extension = xcb_get_extension_data(dri2_dpy->conn, &xcb_dri3_id);
+ if (!(extension && extension->present))
+ return EGL_FALSE;
+
+ extension = xcb_get_extension_data(dri2_dpy->conn, &xcb_present_id);
+ if (!(extension && extension->present))
+ return EGL_FALSE;
+
+ dri3_query_cookie = xcb_dri3_query_version(dri2_dpy->conn,
+ XCB_DRI3_MAJOR_VERSION,
+ XCB_DRI3_MINOR_VERSION);
+
+ present_query_cookie = xcb_present_query_version(dri2_dpy->conn,
+ XCB_PRESENT_MAJOR_VERSION,
+ XCB_PRESENT_MINOR_VERSION);
+
+ dri3_query =
+ xcb_dri3_query_version_reply(dri2_dpy->conn, dri3_query_cookie, &error);
+ if (dri3_query == NULL || error != NULL) {
+ _eglLog(_EGL_WARNING, "DRI3: failed to query the version");
+ free(dri3_query);
+ free(error);
+ return EGL_FALSE;
+ }
+ free(dri3_query);
+
+ present_query =
+ xcb_present_query_version_reply(dri2_dpy->conn,
+ present_query_cookie, &error);
+ if (present_query == NULL || error != NULL) {
+ _eglLog(_EGL_WARNING, "DRI3: failed to query Present version");
+ free(present_query);
+ free(error);
+ return EGL_FALSE;
+ }
+ free(present_query);
+
+ s = xcb_setup_roots_iterator(xcb_get_setup(dri2_dpy->conn));
+ screen = get_xcb_screen(s, dri2_dpy->screen);
+ if (!screen) {
+ _eglError(EGL_BAD_NATIVE_WINDOW, "dri3_x11_connect");
+ return EGL_FALSE;
+ }
+
+ dri2_dpy->fd = loader_dri3_open(dri2_dpy->conn, screen->root, 0);
+ if (dri2_dpy->fd < 0) {
+ int conn_error = xcb_connection_has_error(dri2_dpy->conn);
+ _eglLog(_EGL_WARNING, "DRI3: Screen seems not DRI3 capable");
+
+ if (conn_error)
+ _eglLog(_EGL_WARNING, "DRI3: Failed to initialize");
+
+ return EGL_FALSE;
+ }
+
+ dri2_dpy->fd = loader_get_user_preferred_fd(dri2_dpy->fd, &dri2_dpy->is_different_gpu);
+
+ dri2_dpy->driver_name = loader_get_driver_for_fd(dri2_dpy->fd, 0);
+ if (!dri2_dpy->driver_name) {
+ _eglLog(_EGL_WARNING, "DRI3: No driver found");
+ close(dri2_dpy->fd);
+ return EGL_FALSE;
+ }
+
+ dri2_dpy->device_name = dri3_get_device_name(dri2_dpy->fd);
+ if (!dri2_dpy->device_name) {
+ close(dri2_dpy->fd);
+ return EGL_FALSE;
+ }
+
+ return EGL_TRUE;
+}
diff --git a/src/egl/drivers/dri2/platform_x11_dri3.h b/src/egl/drivers/dri2/platform_x11_dri3.h
new file mode 100644
index 00000000000..13d85724288
--- /dev/null
+++ b/src/egl/drivers/dri2/platform_x11_dri3.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2015 Boyan Ding
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. The copyright holders make no representations
+ * about the suitability of this software for any purpose. It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#ifndef EGL_X11_DRI3_INCLUDED
+#define EGL_X11_DRI3_INCLUDED
+
+#include "egl_dri2.h"
+
+_EGL_DRIVER_TYPECAST(dri3_egl_surface, _EGLSurface, obj)
+
+struct dri3_egl_surface {
+ _EGLSurface base;
+ struct loader_dri3_drawable loader_drawable;
+};
+
+extern const __DRIimageLoaderExtension dri3_image_loader_extension;
+extern struct dri2_egl_display_vtbl dri3_x11_display_vtbl;
+
+EGLBoolean
+dri3_x11_connect(struct dri2_egl_display *dri2_dpy);
+
+#endif
diff --git a/src/egl/egl-symbols-check b/src/egl/egl-symbols-check
new file mode 100755
index 00000000000..5d46fed57c9
--- /dev/null
+++ b/src/egl/egl-symbols-check
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+FUNCS=$(nm -D --defined-only ${1-.libs/libEGL.so} | grep -o "T .*" | cut -c 3- | while read func; do
+( grep -q "^$func$" || echo $func ) <<EOF
+eglBindAPI
+eglBindTexImage
+eglChooseConfig
+eglClientWaitSync
+eglCopyBuffers
+eglCreateContext
+eglCreateImage
+eglCreatePbufferFromClientBuffer
+eglCreatePbufferSurface
+eglCreatePixmapSurface
+eglCreatePlatformPixmapSurface
+eglCreatePlatformWindowSurface
+eglCreateSync
+eglCreateWindowSurface
+eglDestroyContext
+eglDestroyImage
+eglDestroySurface
+eglDestroySync
+eglGetConfigAttrib
+eglGetConfigs
+eglGetCurrentContext
+eglGetCurrentDisplay
+eglGetCurrentSurface
+eglGetDisplay
+eglGetError
+eglGetPlatformDisplay
+eglGetProcAddress
+eglGetSyncAttrib
+eglInitialize
+eglMakeCurrent
+eglQueryAPI
+eglQueryContext
+eglQueryString
+eglQuerySurface
+eglReleaseTexImage
+eglReleaseThread
+eglSurfaceAttrib
+eglSwapBuffers
+eglSwapInterval
+eglTerminate
+eglWaitClient
+eglWaitGL
+eglWaitNative
+eglWaitSync
+_fini
+_init
+EOF
+done)
+
+test ! -n "$FUNCS" || echo $FUNCS
+test ! -n "$FUNCS"
diff --git a/src/gallium/Android.mk b/src/gallium/Android.mk
index 39e064e9538..b406d4a5480 100644
--- a/src/gallium/Android.mk
+++ b/src/gallium/Android.mk
@@ -27,6 +27,7 @@ GALLIUM_TOP := $(call my-dir)
GALLIUM_COMMON_MK := $(GALLIUM_TOP)/Android.common.mk
SUBDIRS := auxiliary
+SUBDIRS += auxiliary/pipe-loader
#
# Gallium drivers and their respective winsys
diff --git a/src/gallium/Automake.inc b/src/gallium/Automake.inc
index ee07ab6c8f9..6fe2e22fecf 100644
--- a/src/gallium/Automake.inc
+++ b/src/gallium/Automake.inc
@@ -67,3 +67,9 @@ if HAVE_DRISW
GALLIUM_PIPE_LOADER_WINSYS_LIBS += \
$(top_builddir)/src/gallium/winsys/sw/dri/libswdri.la
endif
+
+if HAVE_DRISW_KMS
+GALLIUM_PIPE_LOADER_WINSYS_LIBS += \
+ $(top_builddir)/src/gallium/winsys/sw/kms-dri/libswkmsdri.la \
+ $(LIBDRM_LIBS)
+endif
diff --git a/src/gallium/Makefile.am b/src/gallium/Makefile.am
index 611d55fafe2..e42a8f17703 100644
--- a/src/gallium/Makefile.am
+++ b/src/gallium/Makefile.am
@@ -5,6 +5,7 @@ SUBDIRS =
##
SUBDIRS += auxiliary
+SUBDIRS += auxiliary/pipe-loader
##
## Gallium pipe drivers and their respective winsys'
@@ -98,7 +99,7 @@ if HAVE_DRISW
SUBDIRS += winsys/sw/dri
endif
-if HAVE_DRI2
+if HAVE_DRISW_KMS
SUBDIRS += winsys/sw/kms-dri
endif
@@ -120,7 +121,8 @@ EXTRA_DIST = \
## Gallium state trackers and their users (targets)
##
-if HAVE_LOADER_GALLIUM
+## XXX: Rename the conditional once we have a config switch for static/dynamic pipe-drivers
+if HAVE_CLOVER
SUBDIRS += targets/pipe-loader
endif
diff --git a/src/gallium/SConscript b/src/gallium/SConscript
index fa5fa6e8734..0c3a3742c16 100644
--- a/src/gallium/SConscript
+++ b/src/gallium/SConscript
@@ -5,6 +5,7 @@ Import('env')
#
SConscript('auxiliary/SConscript')
+SConscript('auxiliary/pipe-loader/SConscript')
#
# Drivers
diff --git a/src/gallium/auxiliary/Makefile.am b/src/gallium/auxiliary/Makefile.am
index a728162bd9d..ee296ceda33 100644
--- a/src/gallium/auxiliary/Makefile.am
+++ b/src/gallium/auxiliary/Makefile.am
@@ -1,7 +1,3 @@
-if HAVE_LOADER_GALLIUM
-SUBDIRS := pipe-loader
-endif
-
include Makefile.sources
include $(top_srcdir)/src/gallium/Automake.inc
@@ -66,15 +62,7 @@ COMMON_VL_CFLAGS = \
$(AM_CFLAGS) \
$(VL_CFLAGS) \
$(DRI2PROTO_CFLAGS) \
- $(LIBDRM_CFLAGS) \
- $(GALLIUM_PIPE_LOADER_DEFINES) \
- -DPIPE_SEARCH_DIR=\"$(libdir)/gallium-pipe\"
-
-if HAVE_GALLIUM_STATIC_TARGETS
-COMMON_VL_CFLAGS += \
- -DGALLIUM_STATIC_TARGETS=1
-
-endif # HAVE_GALLIUM_STATIC_TARGETS
+ $(LIBDRM_CFLAGS)
noinst_LTLIBRARIES += libgalliumvl.la
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 82ef5ecfce4..61601920a94 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -219,8 +219,6 @@ C_SOURCES := \
util/u_format.h \
util/u_format_etc.c \
util/u_format_etc.h \
- util/u_format_fake.c \
- util/u_format_fake.h \
util/u_format_latc.c \
util/u_format_latc.h \
util/u_format_other.c \
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index 7bda1184ee9..3ee708f4fad 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -536,6 +536,15 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
#if defined(PIPE_ARCH_PPC)
MAttrs.push_back(util_cpu_caps.has_altivec ? "+altivec" : "-altivec");
+#if HAVE_LLVM >= 0x0304
+ /*
+ * Make sure VSX instructions are disabled
+ * See LLVM bug https://llvm.org/bugs/show_bug.cgi?id=25503#c7
+ */
+ if (util_cpu_caps.has_altivec) {
+ MAttrs.push_back("-vsx");
+ }
+#endif
#endif
builder.setMAttrs(MAttrs);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 7d2cd9a9e73..28c7a86316e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -2608,7 +2608,12 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
params.type = bld->bld_base.base.type;
params.sample_key = sample_key;
params.texture_index = unit;
- params.sampler_index = unit;
+ /*
+ * sampler not actually used, set to 0 so it won't exceed PIPE_MAX_SAMPLERS
+ * and trigger some assertions with d3d10 where the sampler view number
+ * can exceed this.
+ */
+ params.sampler_index = 0;
params.context_ptr = bld->context_ptr;
params.thread_data_ptr = bld->thread_data_ptr;
params.coords = coords;
diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index ffe30b8fa79..efceb85e38d 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -33,6 +33,7 @@
* Set GALLIUM_HUD=help for more info.
*/
+#include <signal.h>
#include <stdio.h>
#include "hud/hud_context.h"
@@ -51,12 +52,15 @@
#include "tgsi/tgsi_text.h"
#include "tgsi/tgsi_dump.h"
+/* Control the visibility of all HUD contexts */
+static boolean huds_visible = TRUE;
struct hud_context {
struct pipe_context *pipe;
struct cso_context *cso;
struct u_upload_mgr *uploader;
+ struct hud_batch_query_context *batch_query;
struct list_head pane_list;
/* states */
@@ -95,6 +99,13 @@ struct hud_context {
} text, bg, whitelines;
};
+#ifdef PIPE_OS_UNIX
+static void
+signal_visible_handler(int sig, siginfo_t *siginfo, void *context)
+{
+ huds_visible = !huds_visible;
+}
+#endif
static void
hud_draw_colored_prims(struct hud_context *hud, unsigned prim,
@@ -441,6 +452,9 @@ hud_draw(struct hud_context *hud, struct pipe_resource *tex)
struct hud_pane *pane;
struct hud_graph *gr;
+ if (!huds_visible)
+ return;
+
hud->fb_width = tex->width0;
hud->fb_height = tex->height0;
hud->constants.two_div_fb_width = 2.0f / hud->fb_width;
@@ -510,6 +524,8 @@ hud_draw(struct hud_context *hud, struct pipe_resource *tex)
hud_alloc_vertices(hud, &hud->text, 4 * 512, 4 * sizeof(float));
/* prepare all graphs */
+ hud_batch_query_update(hud->batch_query);
+
LIST_FOR_EACH_ENTRY(pane, &hud->pane_list, head) {
LIST_FOR_EACH_ENTRY(gr, &pane->graph_list, head) {
gr->query_new_value(gr);
@@ -903,17 +919,21 @@ hud_parse_env_var(struct hud_context *hud, const char *env)
}
else if (strcmp(name, "samples-passed") == 0 &&
has_occlusion_query(hud->pipe->screen)) {
- hud_pipe_query_install(pane, hud->pipe, "samples-passed",
+ hud_pipe_query_install(&hud->batch_query, pane, hud->pipe,
+ "samples-passed",
PIPE_QUERY_OCCLUSION_COUNTER, 0, 0,
PIPE_DRIVER_QUERY_TYPE_UINT64,
- PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE);
+ PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE,
+ 0);
}
else if (strcmp(name, "primitives-generated") == 0 &&
has_streamout(hud->pipe->screen)) {
- hud_pipe_query_install(pane, hud->pipe, "primitives-generated",
+ hud_pipe_query_install(&hud->batch_query, pane, hud->pipe,
+ "primitives-generated",
PIPE_QUERY_PRIMITIVES_GENERATED, 0, 0,
PIPE_DRIVER_QUERY_TYPE_UINT64,
- PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE);
+ PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE,
+ 0);
}
else {
boolean processed = FALSE;
@@ -938,17 +958,19 @@ hud_parse_env_var(struct hud_context *hud, const char *env)
if (strcmp(name, pipeline_statistics_names[i]) == 0)
break;
if (i < Elements(pipeline_statistics_names)) {
- hud_pipe_query_install(pane, hud->pipe, name,
+ hud_pipe_query_install(&hud->batch_query, pane, hud->pipe, name,
PIPE_QUERY_PIPELINE_STATISTICS, i,
0, PIPE_DRIVER_QUERY_TYPE_UINT64,
- PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE);
+ PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE,
+ 0);
processed = TRUE;
}
}
/* driver queries */
if (!processed) {
- if (!hud_driver_query_install(pane, hud->pipe, name)){
+ if (!hud_driver_query_install(&hud->batch_query, pane, hud->pipe,
+ name)) {
fprintf(stderr, "gallium_hud: unknown driver query '%s'\n", name);
}
}
@@ -1125,6 +1147,12 @@ hud_create(struct pipe_context *pipe, struct cso_context *cso)
struct pipe_sampler_view view_templ;
unsigned i;
const char *env = debug_get_option("GALLIUM_HUD", NULL);
+ unsigned signo = debug_get_num_option("GALLIUM_HUD_TOGGLE_SIGNAL", 0);
+#ifdef PIPE_OS_UNIX
+ static boolean sig_handled = FALSE;
+ struct sigaction action = {};
+#endif
+ huds_visible = debug_get_bool_option("GALLIUM_HUD_VISIBLE", TRUE);
if (!env || !*env)
return NULL;
@@ -1267,6 +1295,22 @@ hud_create(struct pipe_context *pipe, struct cso_context *cso)
LIST_INITHEAD(&hud->pane_list);
+ /* setup sig handler once for all hud contexts */
+#ifdef PIPE_OS_UNIX
+ if (!sig_handled && signo != 0) {
+ action.sa_sigaction = &signal_visible_handler;
+ action.sa_flags = SA_SIGINFO;
+
+ if (signo >= NSIG)
+ fprintf(stderr, "gallium_hud: invalid signal %u\n", signo);
+ else if (sigaction(signo, &action, NULL) < 0)
+ fprintf(stderr, "gallium_hud: unable to set handler for signal %u\n", signo);
+ fflush(stderr);
+
+ sig_handled = TRUE;
+ }
+#endif
+
hud_parse_env_var(hud, env);
return hud;
}
@@ -1287,6 +1331,7 @@ hud_destroy(struct hud_context *hud)
FREE(pane);
}
+ hud_batch_query_cleanup(&hud->batch_query);
pipe->delete_fs_state(pipe, hud->fs_color);
pipe->delete_fs_state(pipe, hud->fs_text);
pipe->delete_vs_state(pipe, hud->vs);
diff --git a/src/gallium/auxiliary/hud/hud_driver_query.c b/src/gallium/auxiliary/hud/hud_driver_query.c
index f14305ea835..d7b1f11ed56 100644
--- a/src/gallium/auxiliary/hud/hud_driver_query.c
+++ b/src/gallium/auxiliary/hud/hud_driver_query.c
@@ -34,13 +34,164 @@
#include "hud/hud_private.h"
#include "pipe/p_screen.h"
#include "os/os_time.h"
+#include "util/u_math.h"
#include "util/u_memory.h"
#include <stdio.h>
+// Must be a power of two
#define NUM_QUERIES 8
+struct hud_batch_query_context {
+ struct pipe_context *pipe;
+ unsigned num_query_types;
+ unsigned allocated_query_types;
+ unsigned *query_types;
+
+ boolean failed;
+ struct pipe_query *query[NUM_QUERIES];
+ union pipe_query_result *result[NUM_QUERIES];
+ unsigned head, pending, results;
+};
+
+void
+hud_batch_query_update(struct hud_batch_query_context *bq)
+{
+ struct pipe_context *pipe;
+
+ if (!bq || bq->failed)
+ return;
+
+ pipe = bq->pipe;
+
+ if (bq->query[bq->head])
+ pipe->end_query(pipe, bq->query[bq->head]);
+
+ bq->results = 0;
+
+ while (bq->pending) {
+ unsigned idx = (bq->head - bq->pending + 1) % NUM_QUERIES;
+ struct pipe_query *query = bq->query[idx];
+
+ if (!bq->result[idx])
+ bq->result[idx] = MALLOC(sizeof(bq->result[idx]->batch[0]) *
+ bq->num_query_types);
+ if (!bq->result[idx]) {
+ fprintf(stderr, "gallium_hud: out of memory.\n");
+ bq->failed = TRUE;
+ return;
+ }
+
+ if (!pipe->get_query_result(pipe, query, FALSE, bq->result[idx]))
+ break;
+
+ ++bq->results;
+ --bq->pending;
+ }
+
+ bq->head = (bq->head + 1) % NUM_QUERIES;
+
+ if (bq->pending == NUM_QUERIES) {
+ fprintf(stderr,
+ "gallium_hud: all queries busy after %i frames, dropping data.\n",
+ NUM_QUERIES);
+
+ assert(bq->query[bq->head]);
+
+ pipe->destroy_query(bq->pipe, bq->query[bq->head]);
+ bq->query[bq->head] = NULL;
+ }
+
+ ++bq->pending;
+
+ if (!bq->query[bq->head]) {
+ bq->query[bq->head] = pipe->create_batch_query(pipe,
+ bq->num_query_types,
+ bq->query_types);
+
+ if (!bq->query[bq->head]) {
+ fprintf(stderr,
+ "gallium_hud: create_batch_query failed. You may have "
+ "selected too many or incompatible queries.\n");
+ bq->failed = TRUE;
+ return;
+ }
+ }
+
+ if (!pipe->begin_query(pipe, bq->query[bq->head])) {
+ fprintf(stderr,
+ "gallium_hud: could not begin batch query. You may have "
+ "selected too many or incompatible queries.\n");
+ bq->failed = TRUE;
+ }
+}
+
+static boolean
+batch_query_add(struct hud_batch_query_context **pbq,
+ struct pipe_context *pipe, unsigned query_type,
+ unsigned *result_index)
+{
+ struct hud_batch_query_context *bq = *pbq;
+ unsigned i;
+
+ if (!bq) {
+ bq = CALLOC_STRUCT(hud_batch_query_context);
+ if (!bq)
+ return false;
+ bq->pipe = pipe;
+ *pbq = bq;
+ }
+
+ for (i = 0; i < bq->num_query_types; ++i) {
+ if (bq->query_types[i] == query_type) {
+ *result_index = i;
+ return true;
+ }
+ }
+
+ if (bq->num_query_types == bq->allocated_query_types) {
+ unsigned new_alloc = MAX2(16, bq->allocated_query_types * 2);
+ unsigned *new_query_types
+ = REALLOC(bq->query_types,
+ bq->allocated_query_types * sizeof(unsigned),
+ new_alloc * sizeof(unsigned));
+ if (!new_query_types)
+ return false;
+ bq->query_types = new_query_types;
+ bq->allocated_query_types = new_alloc;
+ }
+
+ bq->query_types[bq->num_query_types] = query_type;
+ *result_index = bq->num_query_types++;
+ return true;
+}
+
+void
+hud_batch_query_cleanup(struct hud_batch_query_context **pbq)
+{
+ struct hud_batch_query_context *bq = *pbq;
+ unsigned idx;
+
+ if (!bq)
+ return;
+
+ *pbq = NULL;
+
+ if (bq->query[bq->head] && !bq->failed)
+ bq->pipe->end_query(bq->pipe, bq->query[bq->head]);
+
+ for (idx = 0; idx < NUM_QUERIES; ++idx) {
+ if (bq->query[idx])
+ bq->pipe->destroy_query(bq->pipe, bq->query[idx]);
+ FREE(bq->result[idx]);
+ }
+
+ FREE(bq->query_types);
+ FREE(bq);
+}
+
struct query_info {
struct pipe_context *pipe;
+ struct hud_batch_query_context *batch;
unsigned query_type;
unsigned result_index; /* unit depends on query_type */
enum pipe_driver_query_result_type result_type;
@@ -48,7 +199,6 @@ struct query_info {
/* Ring of queries. If a query is busy, we use another slot. */
struct pipe_query *query[NUM_QUERIES];
unsigned head, tail;
- unsigned num_queries;
uint64_t last_time;
uint64_t results_cumulative;
@@ -56,11 +206,26 @@ struct query_info {
};
static void
-query_new_value(struct hud_graph *gr)
+query_new_value_batch(struct query_info *info)
+{
+ struct hud_batch_query_context *bq = info->batch;
+ unsigned result_index = info->result_index;
+ unsigned idx = (bq->head - bq->pending) % NUM_QUERIES;
+ unsigned results = bq->results;
+
+ while (results) {
+ info->results_cumulative += bq->result[idx]->batch[result_index].u64;
+ ++info->num_results;
+
+ --results;
+ idx = (idx - 1) % NUM_QUERIES;
+ }
+}
+
+static void
+query_new_value_normal(struct query_info *info)
{
- struct query_info *info = gr->query_data;
struct pipe_context *pipe = info->pipe;
- uint64_t now = os_time_get();
if (info->last_time) {
if (info->query[info->head])
@@ -107,30 +272,9 @@ query_new_value(struct hud_graph *gr)
break;
}
}
-
- if (info->num_results && info->last_time + gr->pane->period <= now) {
- uint64_t value;
-
- switch (info->result_type) {
- default:
- case PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE:
- value = info->results_cumulative / info->num_results;
- break;
- case PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE:
- value = info->results_cumulative;
- break;
- }
-
- hud_graph_add_value(gr, value);
-
- info->last_time = now;
- info->results_cumulative = 0;
- info->num_results = 0;
- }
}
else {
/* initialize */
- info->last_time = now;
info->query[info->head] = pipe->create_query(pipe, info->query_type, 0);
}
@@ -139,11 +283,49 @@ query_new_value(struct hud_graph *gr)
}
static void
+query_new_value(struct hud_graph *gr)
+{
+ struct query_info *info = gr->query_data;
+ uint64_t now = os_time_get();
+
+ if (info->batch) {
+ query_new_value_batch(info);
+ } else {
+ query_new_value_normal(info);
+ }
+
+ if (!info->last_time) {
+ info->last_time = now;
+ return;
+ }
+
+ if (info->num_results && info->last_time + gr->pane->period <= now) {
+ uint64_t value;
+
+ switch (info->result_type) {
+ default:
+ case PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE:
+ value = info->results_cumulative / info->num_results;
+ break;
+ case PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE:
+ value = info->results_cumulative;
+ break;
+ }
+
+ hud_graph_add_value(gr, value);
+
+ info->last_time = now;
+ info->results_cumulative = 0;
+ info->num_results = 0;
+ }
+}
+
+static void
free_query_info(void *ptr)
{
struct query_info *info = ptr;
- if (info->last_time) {
+ if (!info->batch && info->last_time) {
struct pipe_context *pipe = info->pipe;
int i;
@@ -159,11 +341,13 @@ free_query_info(void *ptr)
}
void
-hud_pipe_query_install(struct hud_pane *pane, struct pipe_context *pipe,
+hud_pipe_query_install(struct hud_batch_query_context **pbq,
+ struct hud_pane *pane, struct pipe_context *pipe,
const char *name, unsigned query_type,
unsigned result_index,
uint64_t max_value, enum pipe_driver_query_type type,
- enum pipe_driver_query_result_type result_type)
+ enum pipe_driver_query_result_type result_type,
+ unsigned flags)
{
struct hud_graph *gr;
struct query_info *info;
@@ -175,28 +359,40 @@ hud_pipe_query_install(struct hud_pane *pane, struct pipe_context *pipe,
strncpy(gr->name, name, sizeof(gr->name));
gr->name[sizeof(gr->name) - 1] = '\0';
gr->query_data = CALLOC_STRUCT(query_info);
- if (!gr->query_data) {
- FREE(gr);
- return;
- }
+ if (!gr->query_data)
+ goto fail_gr;
gr->query_new_value = query_new_value;
gr->free_query_data = free_query_info;
info = gr->query_data;
info->pipe = pipe;
- info->query_type = query_type;
- info->result_index = result_index;
info->result_type = result_type;
+ if (flags & PIPE_DRIVER_QUERY_FLAG_BATCH) {
+ if (!batch_query_add(pbq, pipe, query_type, &info->result_index))
+ goto fail_info;
+ info->batch = *pbq;
+ } else {
+ info->query_type = query_type;
+ info->result_index = result_index;
+ }
+
hud_pane_add_graph(pane, gr);
if (pane->max_value < max_value)
hud_pane_set_max_value(pane, max_value);
pane->type = type;
+ return;
+
+fail_info:
+ FREE(info);
+fail_gr:
+ FREE(gr);
}
boolean
-hud_driver_query_install(struct hud_pane *pane, struct pipe_context *pipe,
+hud_driver_query_install(struct hud_batch_query_context **pbq,
+ struct hud_pane *pane, struct pipe_context *pipe,
const char *name)
{
struct pipe_screen *screen = pipe->screen;
@@ -220,8 +416,9 @@ hud_driver_query_install(struct hud_pane *pane, struct pipe_context *pipe,
if (!found)
return FALSE;
- hud_pipe_query_install(pane, pipe, query.name, query.query_type, 0,
- query.max_value.u64, query.type, query.result_type);
+ hud_pipe_query_install(pbq, pane, pipe, query.name, query.query_type, 0,
+ query.max_value.u64, query.type, query.result_type,
+ query.flags);
return TRUE;
}
diff --git a/src/gallium/auxiliary/hud/hud_private.h b/src/gallium/auxiliary/hud/hud_private.h
index 01caf7b8b2c..4a788bba456 100644
--- a/src/gallium/auxiliary/hud/hud_private.h
+++ b/src/gallium/auxiliary/hud/hud_private.h
@@ -80,19 +80,26 @@ void hud_pane_set_max_value(struct hud_pane *pane, uint64_t value);
void hud_graph_add_value(struct hud_graph *gr, uint64_t value);
/* graphs/queries */
+struct hud_batch_query_context;
+
#define ALL_CPUS ~0 /* optionally set as cpu_index */
int hud_get_num_cpus(void);
void hud_fps_graph_install(struct hud_pane *pane);
void hud_cpu_graph_install(struct hud_pane *pane, unsigned cpu_index);
-void hud_pipe_query_install(struct hud_pane *pane, struct pipe_context *pipe,
+void hud_pipe_query_install(struct hud_batch_query_context **pbq,
+ struct hud_pane *pane, struct pipe_context *pipe,
const char *name, unsigned query_type,
unsigned result_index,
uint64_t max_value,
enum pipe_driver_query_type type,
- enum pipe_driver_query_result_type result_type);
-boolean hud_driver_query_install(struct hud_pane *pane,
+ enum pipe_driver_query_result_type result_type,
+ unsigned flags);
+boolean hud_driver_query_install(struct hud_batch_query_context **pbq,
+ struct hud_pane *pane,
struct pipe_context *pipe, const char *name);
+void hud_batch_query_update(struct hud_batch_query_context *bq);
+void hud_batch_query_cleanup(struct hud_batch_query_context **pbq);
#endif
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 0539cfc16a1..86c2ffadbc8 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -295,7 +295,7 @@ ttn_emit_declaration(struct ttn_compile *c)
type = nir_type_int;
break;
case TGSI_RETURN_TYPE_UINT:
- type = nir_type_unsigned;
+ type = nir_type_uint;
break;
case TGSI_RETURN_TYPE_FLOAT:
default:
@@ -1239,6 +1239,11 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
op = nir_texop_tex;
num_srcs = 1;
break;
+ case TGSI_OPCODE_TEX2:
+ op = nir_texop_tex;
+ num_srcs = 1;
+ samp = 2;
+ break;
case TGSI_OPCODE_TXP:
op = nir_texop_tex;
num_srcs = 2;
@@ -1275,6 +1280,10 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
num_srcs = 3;
samp = 3;
break;
+ case TGSI_OPCODE_LODQ:
+ op = nir_texop_lod;
+ num_srcs = 1;
+ break;
default:
fprintf(stderr, "unknown TGSI tex op %d\n", tgsi_inst->Instruction.Opcode);
@@ -1327,7 +1336,9 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
*/
sview = instr->sampler_index;
- if (sview < c->num_samp_types) {
+ if (op == nir_texop_lod) {
+ instr->dest_type = nir_type_float;
+ } else if (sview < c->num_samp_types) {
instr->dest_type = c->samp_types[sview];
} else {
instr->dest_type = nir_type_float;
@@ -1394,10 +1405,12 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
}
if (instr->is_shadow) {
- if (instr->coord_components < 3)
- instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[0], Z));
- else
+ if (instr->coord_components == 4)
+ instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[1], X));
+ else if (instr->coord_components == 3)
instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[0], W));
+ else
+ instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[0], Z));
instr->src[src_number].src_type = nir_tex_src_comparitor;
src_number++;
@@ -1641,7 +1654,7 @@ static const nir_op op_trans[TGSI_OPCODE_LAST] = {
[TGSI_OPCODE_UMUL_HI] = nir_op_umul_high,
[TGSI_OPCODE_TG4] = 0,
- [TGSI_OPCODE_LODQ] = 0, /* XXX */
+ [TGSI_OPCODE_LODQ] = 0,
[TGSI_OPCODE_IBFE] = nir_op_ibitfield_extract,
[TGSI_OPCODE_UBFE] = nir_op_ubitfield_extract,
@@ -1650,7 +1663,7 @@ static const nir_op op_trans[TGSI_OPCODE_LAST] = {
[TGSI_OPCODE_POPC] = nir_op_bit_count,
[TGSI_OPCODE_LSB] = nir_op_find_lsb,
[TGSI_OPCODE_IMSB] = nir_op_ifind_msb,
- [TGSI_OPCODE_UMSB] = nir_op_ifind_msb, /* XXX: signed vs unsigned */
+ [TGSI_OPCODE_UMSB] = nir_op_ufind_msb,
[TGSI_OPCODE_INTERP_CENTROID] = 0, /* XXX */
[TGSI_OPCODE_INTERP_SAMPLE] = 0, /* XXX */
@@ -1803,11 +1816,13 @@ ttn_emit_instruction(struct ttn_compile *c)
case TGSI_OPCODE_TXL:
case TGSI_OPCODE_TXB:
case TGSI_OPCODE_TXD:
+ case TGSI_OPCODE_TEX2:
case TGSI_OPCODE_TXL2:
case TGSI_OPCODE_TXB2:
case TGSI_OPCODE_TXQ_LZ:
case TGSI_OPCODE_TXF:
case TGSI_OPCODE_TG4:
+ case TGSI_OPCODE_LODQ:
ttn_tex(c, dest, src);
break;
diff --git a/src/gallium/auxiliary/os/os_process.c b/src/gallium/auxiliary/os/os_process.c
index a6262283d87..d2dcd0d7fbc 100644
--- a/src/gallium/auxiliary/os/os_process.c
+++ b/src/gallium/auxiliary/os/os_process.c
@@ -54,37 +54,48 @@ boolean
os_get_process_name(char *procname, size_t size)
{
const char *name;
+
+ /* First, check if the GALLIUM_PROCESS_NAME env var is set to
+ * override the normal process name query.
+ */
+ name = os_get_option("GALLIUM_PROCESS_NAME");
+
+ if (!name) {
+ /* do normal query */
+
#if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
- char szProcessPath[MAX_PATH];
- char *lpProcessName;
- char *lpProcessExt;
+ char szProcessPath[MAX_PATH];
+ char *lpProcessName;
+ char *lpProcessExt;
- GetModuleFileNameA(NULL, szProcessPath, Elements(szProcessPath));
+ GetModuleFileNameA(NULL, szProcessPath, Elements(szProcessPath));
- lpProcessName = strrchr(szProcessPath, '\\');
- lpProcessName = lpProcessName ? lpProcessName + 1 : szProcessPath;
+ lpProcessName = strrchr(szProcessPath, '\\');
+ lpProcessName = lpProcessName ? lpProcessName + 1 : szProcessPath;
- lpProcessExt = strrchr(lpProcessName, '.');
- if (lpProcessExt) {
- *lpProcessExt = '\0';
- }
+ lpProcessExt = strrchr(lpProcessName, '.');
+ if (lpProcessExt) {
+ *lpProcessExt = '\0';
+ }
- name = lpProcessName;
+ name = lpProcessName;
#elif defined(__GLIBC__) || defined(__CYGWIN__)
- name = program_invocation_short_name;
+ name = program_invocation_short_name;
#elif defined(PIPE_OS_BSD) || defined(PIPE_OS_APPLE)
- /* *BSD and OS X */
- name = getprogname();
+ /* *BSD and OS X */
+ name = getprogname();
#elif defined(PIPE_OS_HAIKU)
- image_info info;
- get_image_info(B_CURRENT_TEAM, &info);
- name = info.name;
+ image_info info;
+ get_image_info(B_CURRENT_TEAM, &info);
+ name = info.name;
#else
#warning unexpected platform in os_process.c
- return FALSE;
+ return FALSE;
#endif
+ }
+
assert(size > 0);
assert(procname);
diff --git a/src/gallium/auxiliary/pipe-loader/Android.mk b/src/gallium/auxiliary/pipe-loader/Android.mk
new file mode 100644
index 00000000000..27893137a1a
--- /dev/null
+++ b/src/gallium/auxiliary/pipe-loader/Android.mk
@@ -0,0 +1,49 @@
+# Mesa 3-D graphics library
+#
+# Copyright (C) 2015 Emil Velikov <[email protected]>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+# NOTE: Currently we build only a 'static' pipe-loader
+LOCAL_PATH := $(call my-dir)
+
+# get COMMON_SOURCES and DRM_SOURCES
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(CLEAR_VARS)
+
+LOCAL_CFLAGS := \
+ -DHAVE_PIPE_LOADER_DRI \
+ -DDROP_PIPE_LOADER_MISC \
+ -DGALLIUM_STATIC_TARGETS
+
+LOCAL_SRC_FILES := $(COMMON_SOURCES)
+
+LOCAL_MODULE := libmesa_pipe_loader
+
+ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
+LOCAL_CFLAGS += -DHAVE_LIBDRM
+LOCAL_SRC_FILES += $(DRM_SOURCES)
+
+LOCAL_SHARED_LIBRARIES := libdrm
+LOCAL_STATIC_LIBRARIES := libmesa_loader
+endif
+
+include $(GALLIUM_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gallium/auxiliary/pipe-loader/Makefile.am b/src/gallium/auxiliary/pipe-loader/Makefile.am
index 8c837996539..8039a957b1b 100644
--- a/src/gallium/auxiliary/pipe-loader/Makefile.am
+++ b/src/gallium/auxiliary/pipe-loader/Makefile.am
@@ -9,20 +9,40 @@ AM_CFLAGS = \
$(GALLIUM_CFLAGS) \
$(VISIBILITY_CFLAGS)
-noinst_LTLIBRARIES = libpipe_loader.la
+noinst_LTLIBRARIES = \
+ libpipe_loader_static.la \
+ libpipe_loader_dynamic.la
-libpipe_loader_la_SOURCES = \
+libpipe_loader_static_la_CFLAGS = \
+ $(AM_CFLAGS) \
+ -DGALLIUM_STATIC_TARGETS=1
+
+libpipe_loader_dynamic_la_CFLAGS = \
+ $(AM_CFLAGS) \
+ -DPIPE_SEARCH_DIR=\"$(libdir)/gallium-pipe\"
+
+libpipe_loader_static_la_SOURCES = \
$(COMMON_SOURCES)
-if HAVE_DRM_LOADER_GALLIUM
+libpipe_loader_dynamic_la_SOURCES = \
+ $(COMMON_SOURCES)
+
+if HAVE_LIBDRM
AM_CFLAGS += \
$(LIBDRM_CFLAGS)
-libpipe_loader_la_SOURCES += \
+libpipe_loader_static_la_SOURCES += \
$(DRM_SOURCES)
-libpipe_loader_la_LIBADD = \
- $(top_builddir)/src/loader/libloader.la
+libpipe_loader_dynamic_la_SOURCES += \
+ $(DRM_SOURCES)
endif
+libpipe_loader_static_la_LIBADD = \
+ $(top_builddir)/src/loader/libloader.la
+
+libpipe_loader_dynamic_la_LIBADD = \
+ $(top_builddir)/src/loader/libloader.la
+
+EXTRA_DIST = SConscript
diff --git a/src/gallium/auxiliary/pipe-loader/SConscript b/src/gallium/auxiliary/pipe-loader/SConscript
new file mode 100644
index 00000000000..c611fb892f8
--- /dev/null
+++ b/src/gallium/auxiliary/pipe-loader/SConscript
@@ -0,0 +1,33 @@
+Import('*')
+
+env = env.Clone()
+
+env.MSVC2008Compat()
+
+env.Append(CPPPATH = [
+ '#/src/loader',
+ '#/src/gallium/winsys',
+])
+
+env.Append(CPPDEFINES = [
+ ('HAVE_PIPE_LOADER_DRI', '1'),
+ ('DROP_PIPE_LOADER_MISC', '1'),
+ ('GALLIUM_STATIC_TARGETS', '1'),
+])
+
+source = env.ParseSourceList('Makefile.sources', 'COMMON_SOURCES')
+
+if env['HAVE_DRM']:
+ source += env.ParseSourceList('Makefile.sources', 'DRM_SOURCES')
+
+ env.PkgUseModules('DRM')
+ env.Append(LIBS = [libloader])
+
+pipe_loader = env.ConvenienceLibrary(
+ target = 'pipe_loader',
+ source = source,
+)
+
+env.Alias('pipe_loader', pipe_loader)
+
+Export('pipe_loader')
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader.c b/src/gallium/auxiliary/pipe-loader/pipe_loader.c
index 8e79f853b0a..aef996c4617 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader.c
@@ -32,10 +32,15 @@
#include "util/u_string.h"
#include "util/u_dl.h"
+#ifdef _MSC_VER
+#include <stdlib.h>
+#define PATH_MAX _MAX_PATH
+#endif
+
#define MODULE_PREFIX "pipe_"
static int (*backends[])(struct pipe_loader_device **, int) = {
-#ifdef HAVE_PIPE_LOADER_DRM
+#ifdef HAVE_LIBDRM
&pipe_loader_drm_probe,
#endif
&pipe_loader_sw_probe
@@ -69,10 +74,9 @@ pipe_loader_configuration(struct pipe_loader_device *dev,
}
struct pipe_screen *
-pipe_loader_create_screen(struct pipe_loader_device *dev,
- const char *library_paths)
+pipe_loader_create_screen(struct pipe_loader_device *dev)
{
- return dev->ops->create_screen(dev, library_paths);
+ return dev->ops->create_screen(dev);
}
struct util_dl_library *
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader.h b/src/gallium/auxiliary/pipe-loader/pipe_loader.h
index 9b8712666bb..690d088ed82 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader.h
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader.h
@@ -82,13 +82,9 @@ pipe_loader_probe(struct pipe_loader_device **devs, int ndev);
* Create a pipe_screen for the specified device.
*
* \param dev Device the screen will be created for.
- * \param library_paths Colon-separated list of filesystem paths that
- * will be used to look for the pipe driver
- * module that handles this device.
*/
struct pipe_screen *
-pipe_loader_create_screen(struct pipe_loader_device *dev,
- const char *library_paths);
+pipe_loader_create_screen(struct pipe_loader_device *dev);
/**
* Query the configuration parameters for the specified device.
@@ -112,8 +108,6 @@ pipe_loader_configuration(struct pipe_loader_device *dev,
void
pipe_loader_release(struct pipe_loader_device **devs, int ndev);
-#ifdef HAVE_PIPE_LOADER_DRI
-
/**
* Initialize sw dri device give the drisw_loader_funcs.
*
@@ -125,7 +119,15 @@ bool
pipe_loader_sw_probe_dri(struct pipe_loader_device **devs,
struct drisw_loader_funcs *drisw_lf);
-#endif
+/**
+ * Initialize a kms backed sw device given an fd.
+ *
+ * This function is platform-specific.
+ *
+ * \sa pipe_loader_probe
+ */
+bool
+pipe_loader_sw_probe_kms(struct pipe_loader_device **devs, int fd);
/**
* Initialize a null sw device.
@@ -158,8 +160,6 @@ boolean
pipe_loader_sw_probe_wrapped(struct pipe_loader_device **dev,
struct pipe_screen *screen);
-#ifdef HAVE_PIPE_LOADER_DRM
-
/**
* Get a list of known DRM devices.
*
@@ -180,8 +180,6 @@ pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev);
bool
pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd);
-#endif
-
#ifdef __cplusplus
}
#endif
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
index 1799df7e4c5..994a284385c 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
@@ -36,6 +36,7 @@
#include <unistd.h>
#include "loader.h"
+#include "target-helpers/drm_helper_public.h"
#include "state_tracker/drm_driver.h"
#include "pipe_loader_priv.h"
@@ -50,13 +51,123 @@
struct pipe_loader_drm_device {
struct pipe_loader_device base;
+ const struct drm_driver_descriptor *dd;
+#ifndef GALLIUM_STATIC_TARGETS
struct util_dl_library *lib;
+#endif
int fd;
};
#define pipe_loader_drm_device(dev) ((struct pipe_loader_drm_device *)dev)
-static struct pipe_loader_ops pipe_loader_drm_ops;
+static const struct pipe_loader_ops pipe_loader_drm_ops;
+
+#ifdef GALLIUM_STATIC_TARGETS
+static const struct drm_conf_ret throttle_ret = {
+ DRM_CONF_INT,
+ {2},
+};
+
+static const struct drm_conf_ret share_fd_ret = {
+ DRM_CONF_BOOL,
+ {true},
+};
+
+static inline const struct drm_conf_ret *
+configuration_query(enum drm_conf conf)
+{
+ switch (conf) {
+ case DRM_CONF_THROTTLE:
+ return &throttle_ret;
+ case DRM_CONF_SHARE_FD:
+ return &share_fd_ret;
+ default:
+ break;
+ }
+ return NULL;
+}
+
+static const struct drm_driver_descriptor driver_descriptors[] = {
+ {
+ .name = "i915",
+ .driver_name = "i915",
+ .create_screen = pipe_i915_create_screen,
+ .configuration = configuration_query,
+ },
+#ifdef USE_VC4_SIMULATOR
+ /* VC4 simulator and ILO (i965) are mutually exclusive (error at
+ * configure). As the latter is unconditionally added, keep this one above
+ * it.
+ */
+ {
+ .name = "i965",
+ .driver_name = "vc4",
+ .create_screen = pipe_vc4_create_screen,
+ .configuration = configuration_query,
+ },
+#endif
+ {
+ .name = "i965",
+ .driver_name = "i915",
+ .create_screen = pipe_ilo_create_screen,
+ .configuration = configuration_query,
+ },
+ {
+ .name = "nouveau",
+ .driver_name = "nouveau",
+ .create_screen = pipe_nouveau_create_screen,
+ .configuration = configuration_query,
+ },
+ {
+ .name = "r300",
+ .driver_name = "radeon",
+ .create_screen = pipe_r300_create_screen,
+ .configuration = configuration_query,
+ },
+ {
+ .name = "r600",
+ .driver_name = "radeon",
+ .create_screen = pipe_r600_create_screen,
+ .configuration = configuration_query,
+ },
+ {
+ .name = "radeonsi",
+ .driver_name = "radeon",
+ .create_screen = pipe_radeonsi_create_screen,
+ .configuration = configuration_query,
+ },
+ {
+ .name = "vmwgfx",
+ .driver_name = "vmwgfx",
+ .create_screen = pipe_vmwgfx_create_screen,
+ .configuration = configuration_query,
+ },
+ {
+ .name = "kgsl",
+ .driver_name = "freedreno",
+ .create_screen = pipe_freedreno_create_screen,
+ .configuration = configuration_query,
+ },
+ {
+ .name = "msm",
+ .driver_name = "freedreno",
+ .create_screen = pipe_freedreno_create_screen,
+ .configuration = configuration_query,
+ },
+ {
+ .name = "virtio_gpu",
+ .driver_name = "virtio-gpu",
+ .create_screen = pipe_virgl_create_screen,
+ .configuration = configuration_query,
+ },
+ {
+ .name = "vc4",
+ .driver_name = "vc4",
+ .create_screen = pipe_vc4_create_screen,
+ .configuration = configuration_query,
+ },
+};
+#endif
bool
pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd)
@@ -81,10 +192,36 @@ pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd)
if (!ddev->base.driver_name)
goto fail;
+#ifdef GALLIUM_STATIC_TARGETS
+ for (int i = 0; i < ARRAY_SIZE(driver_descriptors); i++) {
+ if (strcmp(driver_descriptors[i].name, ddev->base.driver_name) == 0) {
+ ddev->dd = &driver_descriptors[i];
+ break;
+ }
+ }
+ if (!ddev->dd)
+ goto fail;
+#else
+ ddev->lib = pipe_loader_find_module(&ddev->base, PIPE_SEARCH_DIR);
+ if (!ddev->lib)
+ goto fail;
+
+ ddev->dd = (const struct drm_driver_descriptor *)
+ util_dl_get_proc_address(ddev->lib, "driver_descriptor");
+
+ /* sanity check on the name */
+ if (!ddev->dd || strcmp(ddev->dd->name, ddev->base.driver_name) != 0)
+ goto fail;
+#endif
+
*dev = &ddev->base;
return true;
fail:
+#ifndef GALLIUM_STATIC_TARGETS
+ if (ddev->lib)
+ util_dl_close(ddev->lib);
+#endif
FREE(ddev);
return false;
}
@@ -105,8 +242,9 @@ pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev)
for (i = DRM_RENDER_NODE_MIN_MINOR, j = 0;
i <= DRM_RENDER_NODE_MAX_MINOR; i++) {
- fd = open_drm_render_node_minor(i);
struct pipe_loader_device *dev;
+
+ fd = open_drm_render_node_minor(i);
if (fd < 0)
continue;
@@ -132,8 +270,10 @@ pipe_loader_drm_release(struct pipe_loader_device **dev)
{
struct pipe_loader_drm_device *ddev = pipe_loader_drm_device(*dev);
+#ifndef GALLIUM_STATIC_TARGETS
if (ddev->lib)
util_dl_close(ddev->lib);
+#endif
close(ddev->fd);
FREE(ddev->base.driver_name);
@@ -146,47 +286,22 @@ pipe_loader_drm_configuration(struct pipe_loader_device *dev,
enum drm_conf conf)
{
struct pipe_loader_drm_device *ddev = pipe_loader_drm_device(dev);
- const struct drm_driver_descriptor *dd;
-
- if (!ddev->lib)
- return NULL;
-
- dd = (const struct drm_driver_descriptor *)
- util_dl_get_proc_address(ddev->lib, "driver_descriptor");
- /* sanity check on the name */
- if (!dd || strcmp(dd->name, ddev->base.driver_name) != 0)
+ if (!ddev->dd->configuration)
return NULL;
- if (!dd->configuration)
- return NULL;
-
- return dd->configuration(conf);
+ return ddev->dd->configuration(conf);
}
static struct pipe_screen *
-pipe_loader_drm_create_screen(struct pipe_loader_device *dev,
- const char *library_paths)
+pipe_loader_drm_create_screen(struct pipe_loader_device *dev)
{
struct pipe_loader_drm_device *ddev = pipe_loader_drm_device(dev);
- const struct drm_driver_descriptor *dd;
-
- if (!ddev->lib)
- ddev->lib = pipe_loader_find_module(dev, library_paths);
- if (!ddev->lib)
- return NULL;
-
- dd = (const struct drm_driver_descriptor *)
- util_dl_get_proc_address(ddev->lib, "driver_descriptor");
-
- /* sanity check on the name */
- if (!dd || strcmp(dd->name, ddev->base.driver_name) != 0)
- return NULL;
- return dd->create_screen(ddev->fd);
+ return ddev->dd->create_screen(ddev->fd);
}
-static struct pipe_loader_ops pipe_loader_drm_ops = {
+static const struct pipe_loader_ops pipe_loader_drm_ops = {
.create_screen = pipe_loader_drm_create_screen,
.configuration = pipe_loader_drm_configuration,
.release = pipe_loader_drm_release
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_priv.h b/src/gallium/auxiliary/pipe-loader/pipe_loader_priv.h
index d3b025221c5..da2ca8c6e1f 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_priv.h
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_priv.h
@@ -31,8 +31,7 @@
#include "pipe_loader.h"
struct pipe_loader_ops {
- struct pipe_screen *(*create_screen)(struct pipe_loader_device *dev,
- const char *library_paths);
+ struct pipe_screen *(*create_screen)(struct pipe_loader_device *dev);
const struct drm_conf_ret *(*configuration)(struct pipe_loader_device *dev,
enum drm_conf conf);
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
index 6794930193d..5539a730b4c 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
@@ -30,45 +30,160 @@
#include "util/u_memory.h"
#include "util/u_dl.h"
#include "sw/dri/dri_sw_winsys.h"
+#include "sw/kms-dri/kms_dri_sw_winsys.h"
#include "sw/null/null_sw_winsys.h"
#include "sw/wrapper/wrapper_sw_winsys.h"
#include "target-helpers/inline_sw_helper.h"
#include "state_tracker/drisw_api.h"
+#include "state_tracker/sw_driver.h"
struct pipe_loader_sw_device {
struct pipe_loader_device base;
+ const struct sw_driver_descriptor *dd;
+#ifndef GALLIUM_STATIC_TARGETS
struct util_dl_library *lib;
+#endif
struct sw_winsys *ws;
};
#define pipe_loader_sw_device(dev) ((struct pipe_loader_sw_device *)dev)
-static struct pipe_loader_ops pipe_loader_sw_ops;
+static const struct pipe_loader_ops pipe_loader_sw_ops;
-static struct sw_winsys *(*backends[])() = {
- null_sw_create
+#ifdef GALLIUM_STATIC_TARGETS
+static const struct sw_driver_descriptor driver_descriptors = {
+ .create_screen = sw_screen_create,
+ .winsys = {
+#ifdef HAVE_PIPE_LOADER_DRI
+ {
+ .name = "dri",
+ .create_winsys = dri_create_sw_winsys,
+ },
+#endif
+#ifdef HAVE_PIPE_LOADER_KMS
+ {
+ .name = "kms_dri",
+ .create_winsys = kms_dri_create_winsys,
+ },
+#endif
+/**
+ * XXX: Do not include these two for non-autotools builds.
+ * They don't have either opencl or nine, where these are used.
+ */
+#ifndef DROP_PIPE_LOADER_MISC
+ {
+ .name = "null",
+ .create_winsys = null_sw_create,
+ },
+ {
+ .name = "wrapped",
+ .create_winsys = wrapper_sw_winsys_wrap_pipe_screen,
+ },
+#endif
+ { 0 },
+ }
};
+#endif
+
+static bool
+pipe_loader_sw_probe_init_common(struct pipe_loader_sw_device *sdev)
+{
+ sdev->base.type = PIPE_LOADER_DEVICE_SOFTWARE;
+ sdev->base.driver_name = "swrast";
+ sdev->base.ops = &pipe_loader_sw_ops;
+
+#ifdef GALLIUM_STATIC_TARGETS
+ sdev->dd = &driver_descriptors;
+ if (!sdev->dd)
+ return false;
+#else
+ sdev->lib = pipe_loader_find_module(&sdev->base, PIPE_SEARCH_DIR);
+ if (!sdev->lib)
+ return false;
+
+ sdev->dd = (const struct sw_driver_descriptor *)
+ util_dl_get_proc_address(sdev->lib, "swrast_driver_descriptor");
+
+ if (!sdev->dd){
+ util_dl_close(sdev->lib);
+ sdev->lib = NULL;
+ return false;
+ }
+#endif
+
+ return true;
+}
+
+static void
+pipe_loader_sw_probe_teardown_common(struct pipe_loader_sw_device *sdev)
+{
+#ifndef GALLIUM_STATIC_TARGETS
+ if (sdev->lib)
+ util_dl_close(sdev->lib);
+#endif
+}
#ifdef HAVE_PIPE_LOADER_DRI
bool
pipe_loader_sw_probe_dri(struct pipe_loader_device **devs, struct drisw_loader_funcs *drisw_lf)
{
struct pipe_loader_sw_device *sdev = CALLOC_STRUCT(pipe_loader_sw_device);
+ int i;
if (!sdev)
return false;
- sdev->base.type = PIPE_LOADER_DEVICE_SOFTWARE;
- sdev->base.driver_name = "swrast";
- sdev->base.ops = &pipe_loader_sw_ops;
- sdev->ws = dri_create_sw_winsys(drisw_lf);
- if (!sdev->ws) {
- FREE(sdev);
- return false;
+ if (!pipe_loader_sw_probe_init_common(sdev))
+ goto fail;
+
+ for (i = 0; sdev->dd->winsys[i].name; i++) {
+ if (strcmp(sdev->dd->winsys[i].name, "dri") == 0) {
+ sdev->ws = sdev->dd->winsys[i].create_winsys(drisw_lf);
+ break;
+ }
}
+ if (!sdev->ws)
+ goto fail;
+
*devs = &sdev->base;
+ return true;
+
+fail:
+ pipe_loader_sw_probe_teardown_common(sdev);
+ FREE(sdev);
+ return false;
+}
+#endif
+
+#ifdef HAVE_PIPE_LOADER_KMS
+bool
+pipe_loader_sw_probe_kms(struct pipe_loader_device **devs, int fd)
+{
+ struct pipe_loader_sw_device *sdev = CALLOC_STRUCT(pipe_loader_sw_device);
+ int i;
+ if (!sdev)
+ return false;
+
+ if (!pipe_loader_sw_probe_init_common(sdev))
+ goto fail;
+
+ for (i = 0; sdev->dd->winsys[i].name; i++) {
+ if (strcmp(sdev->dd->winsys[i].name, "kms_dri") == 0) {
+ sdev->ws = sdev->dd->winsys[i].create_winsys(fd);
+ break;
+ }
+ }
+ if (!sdev->ws)
+ goto fail;
+
+ *devs = &sdev->base;
return true;
+
+fail:
+ pipe_loader_sw_probe_teardown_common(sdev);
+ FREE(sdev);
+ return false;
}
#endif
@@ -76,38 +191,40 @@ bool
pipe_loader_sw_probe_null(struct pipe_loader_device **devs)
{
struct pipe_loader_sw_device *sdev = CALLOC_STRUCT(pipe_loader_sw_device);
+ int i;
if (!sdev)
return false;
- sdev->base.type = PIPE_LOADER_DEVICE_SOFTWARE;
- sdev->base.driver_name = "swrast";
- sdev->base.ops = &pipe_loader_sw_ops;
- sdev->ws = null_sw_create();
- if (!sdev->ws) {
- FREE(sdev);
- return false;
+ if (!pipe_loader_sw_probe_init_common(sdev))
+ goto fail;
+
+ for (i = 0; sdev->dd->winsys[i].name; i++) {
+ if (strcmp(sdev->dd->winsys[i].name, "null") == 0) {
+ sdev->ws = sdev->dd->winsys[i].create_winsys();
+ break;
+ }
}
- *devs = &sdev->base;
+ if (!sdev->ws)
+ goto fail;
+ *devs = &sdev->base;
return true;
+
+fail:
+ pipe_loader_sw_probe_teardown_common(sdev);
+ FREE(sdev);
+ return false;
}
int
pipe_loader_sw_probe(struct pipe_loader_device **devs, int ndev)
{
- int i;
-
- for (i = 0; i < Elements(backends); i++) {
- if (i < ndev) {
- struct pipe_loader_sw_device *sdev = CALLOC_STRUCT(pipe_loader_sw_device);
- /* TODO: handle CALLOC_STRUCT failure */
+ int i = 1;
- sdev->base.type = PIPE_LOADER_DEVICE_SOFTWARE;
- sdev->base.driver_name = "swrast";
- sdev->base.ops = &pipe_loader_sw_ops;
- sdev->ws = backends[i]();
- devs[i] = &sdev->base;
+ if (i < ndev) {
+ if (!pipe_loader_sw_probe_null(devs)) {
+ i--;
}
}
@@ -119,21 +236,30 @@ pipe_loader_sw_probe_wrapped(struct pipe_loader_device **dev,
struct pipe_screen *screen)
{
struct pipe_loader_sw_device *sdev = CALLOC_STRUCT(pipe_loader_sw_device);
+ int i;
if (!sdev)
return false;
- sdev->base.type = PIPE_LOADER_DEVICE_SOFTWARE;
- sdev->base.driver_name = "swrast";
- sdev->base.ops = &pipe_loader_sw_ops;
- sdev->ws = wrapper_sw_winsys_wrap_pipe_screen(screen);
+ if (!pipe_loader_sw_probe_init_common(sdev))
+ goto fail;
- if (!sdev->ws) {
- FREE(sdev);
- return false;
+ for (i = 0; sdev->dd->winsys[i].name; i++) {
+ if (strcmp(sdev->dd->winsys[i].name, "wrapped") == 0) {
+ sdev->ws = sdev->dd->winsys[i].create_winsys(screen);
+ break;
+ }
}
+ if (!sdev->ws)
+ goto fail;
+
*dev = &sdev->base;
return true;
+
+fail:
+ pipe_loader_sw_probe_teardown_common(sdev);
+ FREE(sdev);
+ return false;
}
static void
@@ -141,8 +267,10 @@ pipe_loader_sw_release(struct pipe_loader_device **dev)
{
struct pipe_loader_sw_device *sdev = pipe_loader_sw_device(*dev);
+#ifndef GALLIUM_STATIC_TARGETS
if (sdev->lib)
util_dl_close(sdev->lib);
+#endif
FREE(sdev);
*dev = NULL;
@@ -156,28 +284,19 @@ pipe_loader_sw_configuration(struct pipe_loader_device *dev,
}
static struct pipe_screen *
-pipe_loader_sw_create_screen(struct pipe_loader_device *dev,
- const char *library_paths)
+pipe_loader_sw_create_screen(struct pipe_loader_device *dev)
{
struct pipe_loader_sw_device *sdev = pipe_loader_sw_device(dev);
- struct pipe_screen *(*init)(struct sw_winsys *);
+ struct pipe_screen *screen;
- if (!sdev->lib)
- sdev->lib = pipe_loader_find_module(dev, library_paths);
- if (!sdev->lib)
- return NULL;
-
- init = (void *)util_dl_get_proc_address(sdev->lib, "swrast_create_screen");
- if (!init){
- util_dl_close(sdev->lib);
- sdev->lib = NULL;
- return NULL;
- }
+ screen = sdev->dd->create_screen(sdev->ws);
+ if (!screen)
+ sdev->ws->destroy(sdev->ws);
- return init(sdev->ws);
+ return screen;
}
-static struct pipe_loader_ops pipe_loader_sw_ops = {
+static const struct pipe_loader_ops pipe_loader_sw_ops = {
.create_screen = pipe_loader_sw_create_screen,
.configuration = pipe_loader_sw_configuration,
.release = pipe_loader_sw_release
diff --git a/src/gallium/auxiliary/target-helpers/drm_helper.h b/src/gallium/auxiliary/target-helpers/drm_helper.h
new file mode 100644
index 00000000000..332b1cba984
--- /dev/null
+++ b/src/gallium/auxiliary/target-helpers/drm_helper.h
@@ -0,0 +1,275 @@
+#ifndef DRM_HELPER_H
+#define DRM_HELPER_H
+
+#include <stdio.h>
+#include "target-helpers/inline_debug_helper.h"
+#include "target-helpers/drm_helper_public.h"
+
+#ifdef GALLIUM_I915
+#include "i915/drm/i915_drm_public.h"
+#include "i915/i915_public.h"
+
+struct pipe_screen *
+pipe_i915_create_screen(int fd)
+{
+ struct i915_winsys *iws;
+ struct pipe_screen *screen;
+
+ iws = i915_drm_winsys_create(fd);
+ if (!iws)
+ return NULL;
+
+ screen = i915_screen_create(iws);
+ return screen ? debug_screen_wrap(screen) : NULL;
+}
+
+#else
+
+struct pipe_screen *
+pipe_i915_create_screen(int fd)
+{
+ fprintf(stderr, "i915g: driver missing\n");
+ return NULL;
+}
+
+#endif
+
+#ifdef GALLIUM_ILO
+#include "intel/drm/intel_drm_public.h"
+#include "ilo/ilo_public.h"
+
+struct pipe_screen *
+pipe_ilo_create_screen(int fd)
+{
+ struct intel_winsys *iws;
+ struct pipe_screen *screen;
+
+ iws = intel_winsys_create_for_fd(fd);
+ if (!iws)
+ return NULL;
+
+ screen = ilo_screen_create(iws);
+ return screen ? debug_screen_wrap(screen) : NULL;
+}
+
+#else
+
+struct pipe_screen *
+pipe_ilo_create_screen(int fd)
+{
+ fprintf(stderr, "ilo: driver missing\n");
+ return NULL;
+}
+
+#endif
+
+#ifdef GALLIUM_NOUVEAU
+#include "nouveau/drm/nouveau_drm_public.h"
+
+struct pipe_screen *
+pipe_nouveau_create_screen(int fd)
+{
+ struct pipe_screen *screen;
+
+ screen = nouveau_drm_screen_create(fd);
+ return screen ? debug_screen_wrap(screen) : NULL;
+}
+
+#else
+
+struct pipe_screen *
+pipe_nouveau_create_screen(int fd)
+{
+ fprintf(stderr, "nouveau: driver missing\n");
+ return NULL;
+}
+
+#endif
+
+#ifdef GALLIUM_R300
+#include "radeon/radeon_winsys.h"
+#include "radeon/drm/radeon_drm_public.h"
+#include "r300/r300_public.h"
+
+struct pipe_screen *
+pipe_r300_create_screen(int fd)
+{
+ struct radeon_winsys *rw;
+
+ rw = radeon_drm_winsys_create(fd, r300_screen_create);
+ return rw ? debug_screen_wrap(rw->screen) : NULL;
+}
+
+#else
+
+struct pipe_screen *
+pipe_r300_create_screen(int fd)
+{
+ fprintf(stderr, "r300: driver missing\n");
+ return NULL;
+}
+
+#endif
+
+#ifdef GALLIUM_R600
+#include "radeon/radeon_winsys.h"
+#include "radeon/drm/radeon_drm_public.h"
+#include "r600/r600_public.h"
+
+struct pipe_screen *
+pipe_r600_create_screen(int fd)
+{
+ struct radeon_winsys *rw;
+
+ rw = radeon_drm_winsys_create(fd, r600_screen_create);
+ return rw ? debug_screen_wrap(rw->screen) : NULL;
+}
+
+#else
+
+struct pipe_screen *
+pipe_r600_create_screen(int fd)
+{
+ fprintf(stderr, "r600: driver missing\n");
+ return NULL;
+}
+
+#endif
+
+#ifdef GALLIUM_RADEONSI
+#include "radeon/radeon_winsys.h"
+#include "radeon/drm/radeon_drm_public.h"
+#include "amdgpu/drm/amdgpu_public.h"
+#include "radeonsi/si_public.h"
+
+struct pipe_screen *
+pipe_radeonsi_create_screen(int fd)
+{
+ struct radeon_winsys *rw;
+
+ /* First, try amdgpu. */
+ rw = amdgpu_winsys_create(fd, radeonsi_screen_create);
+
+ if (!rw)
+ rw = radeon_drm_winsys_create(fd, radeonsi_screen_create);
+
+ return rw ? debug_screen_wrap(rw->screen) : NULL;
+}
+
+#else
+
+struct pipe_screen *
+pipe_radeonsi_create_screen(int fd)
+{
+ fprintf(stderr, "radeonsi: driver missing\n");
+ return NULL;
+}
+
+#endif
+
+#ifdef GALLIUM_VMWGFX
+#include "svga/drm/svga_drm_public.h"
+#include "svga/svga_public.h"
+
+struct pipe_screen *
+pipe_vmwgfx_create_screen(int fd)
+{
+ struct svga_winsys_screen *sws;
+ struct pipe_screen *screen;
+
+ sws = svga_drm_winsys_screen_create(fd);
+ if (!sws)
+ return NULL;
+
+ screen = svga_screen_create(sws);
+ return screen ? debug_screen_wrap(screen) : NULL;
+}
+
+#else
+
+struct pipe_screen *
+pipe_vmwgfx_create_screen(int fd)
+{
+ fprintf(stderr, "svga: driver missing\n");
+ return NULL;
+}
+
+#endif
+
+#ifdef GALLIUM_FREEDRENO
+#include "freedreno/drm/freedreno_drm_public.h"
+
+struct pipe_screen *
+pipe_freedreno_create_screen(int fd)
+{
+ struct pipe_screen *screen;
+
+ screen = fd_drm_screen_create(fd);
+ return screen ? debug_screen_wrap(screen) : NULL;
+}
+
+#else
+
+struct pipe_screen *
+pipe_freedreno_create_screen(int fd)
+{
+ fprintf(stderr, "freedreno: driver missing\n");
+ return NULL;
+}
+
+#endif
+
+#ifdef GALLIUM_VIRGL
+#include "virgl/drm/virgl_drm_public.h"
+#include "virgl/virgl_public.h"
+
+struct pipe_screen *
+pipe_virgl_create_screen(int fd)
+{
+ struct virgl_winsys *vws;
+ struct pipe_screen *screen;
+
+ vws = virgl_drm_winsys_create(fd);
+ if (!vws)
+ return NULL;
+
+ screen = virgl_create_screen(vws);
+ return screen ? debug_screen_wrap(screen) : NULL;
+}
+
+#else
+
+struct pipe_screen *
+pipe_virgl_create_screen(int fd)
+{
+ fprintf(stderr, "virgl: driver missing\n");
+ return NULL;
+}
+
+#endif
+
+#ifdef GALLIUM_VC4
+#include "vc4/drm/vc4_drm_public.h"
+
+struct pipe_screen *
+pipe_vc4_create_screen(int fd)
+{
+ struct pipe_screen *screen;
+
+ screen = vc4_drm_screen_create(fd);
+ return screen ? debug_screen_wrap(screen) : NULL;
+}
+
+#else
+
+struct pipe_screen *
+pipe_vc4_create_screen(int fd)
+{
+ fprintf(stderr, "vc4: driver missing\n");
+ return NULL;
+}
+
+#endif
+
+
+#endif /* DRM_HELPER_H */
diff --git a/src/gallium/auxiliary/target-helpers/drm_helper_public.h b/src/gallium/auxiliary/target-helpers/drm_helper_public.h
new file mode 100644
index 00000000000..d1f9382a6f9
--- /dev/null
+++ b/src/gallium/auxiliary/target-helpers/drm_helper_public.h
@@ -0,0 +1,37 @@
+#ifndef _DRM_HELPER_PUBLIC_H
+#define _DRM_HELPER_PUBLIC_H
+
+
+struct pipe_screen;
+
+struct pipe_screen *
+pipe_i915_create_screen(int fd);
+
+struct pipe_screen *
+pipe_ilo_create_screen(int fd);
+
+struct pipe_screen *
+pipe_nouveau_create_screen(int fd);
+
+struct pipe_screen *
+pipe_r300_create_screen(int fd);
+
+struct pipe_screen *
+pipe_r600_create_screen(int fd);
+
+struct pipe_screen *
+pipe_radeonsi_create_screen(int fd);
+
+struct pipe_screen *
+pipe_vmwgfx_create_screen(int fd);
+
+struct pipe_screen *
+pipe_freedreno_create_screen(int fd);
+
+struct pipe_screen *
+pipe_virgl_create_screen(int fd);
+
+struct pipe_screen *
+pipe_vc4_create_screen(int fd);
+
+#endif /* _DRM_HELPER_PUBLIC_H */
diff --git a/src/gallium/auxiliary/target-helpers/inline_drm_helper.h b/src/gallium/auxiliary/target-helpers/inline_drm_helper.h
deleted file mode 100644
index 6ca4dc8136c..00000000000
--- a/src/gallium/auxiliary/target-helpers/inline_drm_helper.h
+++ /dev/null
@@ -1,531 +0,0 @@
-#ifndef INLINE_DRM_HELPER_H
-#define INLINE_DRM_HELPER_H
-
-#include "state_tracker/drm_driver.h"
-#include "target-helpers/inline_debug_helper.h"
-#include "loader.h"
-#if defined(DRI_TARGET)
-#include "dri_screen.h"
-#endif
-
-#if GALLIUM_SOFTPIPE
-#include "target-helpers/inline_sw_helper.h"
-#include "sw/kms-dri/kms_dri_sw_winsys.h"
-#endif
-
-#if GALLIUM_I915
-#include "i915/drm/i915_drm_public.h"
-#include "i915/i915_public.h"
-#endif
-
-#if GALLIUM_ILO
-#include "intel/drm/intel_drm_public.h"
-#include "ilo/ilo_public.h"
-#endif
-
-#if GALLIUM_NOUVEAU
-#include "nouveau/drm/nouveau_drm_public.h"
-#endif
-
-#if GALLIUM_R300
-#include "radeon/radeon_winsys.h"
-#include "radeon/drm/radeon_drm_public.h"
-#include "r300/r300_public.h"
-#endif
-
-#if GALLIUM_R600
-#include "radeon/radeon_winsys.h"
-#include "radeon/drm/radeon_drm_public.h"
-#include "r600/r600_public.h"
-#endif
-
-#if GALLIUM_RADEONSI
-#include "radeon/radeon_winsys.h"
-#include "radeon/drm/radeon_drm_public.h"
-#include "amdgpu/drm/amdgpu_public.h"
-#include "radeonsi/si_public.h"
-#endif
-
-#if GALLIUM_VMWGFX
-#include "svga/drm/svga_drm_public.h"
-#include "svga/svga_public.h"
-#endif
-
-#if GALLIUM_FREEDRENO
-#include "freedreno/drm/freedreno_drm_public.h"
-#endif
-
-#if GALLIUM_VC4
-#include "vc4/drm/vc4_drm_public.h"
-#endif
-
-#if GALLIUM_VIRGL
-#include "virgl/drm/virgl_drm_public.h"
-#include "virgl/virgl_public.h"
-#endif
-
-static char* driver_name = NULL;
-
-/* XXX: We need to teardown the winsys if *screen_create() fails. */
-
-#if defined(GALLIUM_SOFTPIPE)
-#if defined(DRI_TARGET)
-#if defined(HAVE_LIBDRM)
-
-const __DRIextension **__driDriverGetExtensions_kms_swrast(void);
-
-PUBLIC const __DRIextension **__driDriverGetExtensions_kms_swrast(void)
-{
- globalDriverAPI = &dri_kms_driver_api;
- return galliumdrm_driver_extensions;
-}
-
-struct pipe_screen *
-kms_swrast_create_screen(int fd)
-{
- struct sw_winsys *sws;
- struct pipe_screen *screen;
-
- sws = kms_dri_create_winsys(fd);
- if (!sws)
- return NULL;
-
- screen = sw_screen_create(sws);
- return screen ? debug_screen_wrap(screen) : NULL;
-}
-#endif
-#endif
-#endif
-
-#if defined(GALLIUM_I915)
-#if defined(DRI_TARGET)
-
-const __DRIextension **__driDriverGetExtensions_i915(void);
-
-PUBLIC const __DRIextension **__driDriverGetExtensions_i915(void)
-{
- globalDriverAPI = &galliumdrm_driver_api;
- return galliumdrm_driver_extensions;
-}
-#endif
-
-static struct pipe_screen *
-pipe_i915_create_screen(int fd)
-{
- struct i915_winsys *iws;
- struct pipe_screen *screen;
-
- iws = i915_drm_winsys_create(fd);
- if (!iws)
- return NULL;
-
- screen = i915_screen_create(iws);
- return screen ? debug_screen_wrap(screen) : NULL;
-}
-#endif
-
-#if defined(GALLIUM_ILO)
-#if defined(DRI_TARGET)
-
-const __DRIextension **__driDriverGetExtensions_i965(void);
-
-PUBLIC const __DRIextension **__driDriverGetExtensions_i965(void)
-{
- globalDriverAPI = &galliumdrm_driver_api;
- return galliumdrm_driver_extensions;
-}
-#endif
-
-static struct pipe_screen *
-pipe_ilo_create_screen(int fd)
-{
- struct intel_winsys *iws;
- struct pipe_screen *screen;
-
- iws = intel_winsys_create_for_fd(fd);
- if (!iws)
- return NULL;
-
- screen = ilo_screen_create(iws);
- return screen ? debug_screen_wrap(screen) : NULL;
-}
-#endif
-
-#if defined(GALLIUM_NOUVEAU)
-#if defined(DRI_TARGET)
-
-const __DRIextension **__driDriverGetExtensions_nouveau(void);
-
-PUBLIC const __DRIextension **__driDriverGetExtensions_nouveau(void)
-{
- globalDriverAPI = &galliumdrm_driver_api;
- return galliumdrm_driver_extensions;
-}
-#endif
-
-static struct pipe_screen *
-pipe_nouveau_create_screen(int fd)
-{
- struct pipe_screen *screen;
-
- screen = nouveau_drm_screen_create(fd);
- return screen ? debug_screen_wrap(screen) : NULL;
-}
-#endif
-
-#if defined(GALLIUM_R300)
-#if defined(DRI_TARGET)
-
-const __DRIextension **__driDriverGetExtensions_r300(void);
-
-PUBLIC const __DRIextension **__driDriverGetExtensions_r300(void)
-{
- globalDriverAPI = &galliumdrm_driver_api;
- return galliumdrm_driver_extensions;
-}
-#endif
-
-static struct pipe_screen *
-pipe_r300_create_screen(int fd)
-{
- struct radeon_winsys *rw;
-
- rw = radeon_drm_winsys_create(fd, r300_screen_create);
- return rw ? debug_screen_wrap(rw->screen) : NULL;
-}
-#endif
-
-#if defined(GALLIUM_R600)
-#if defined(DRI_TARGET)
-
-const __DRIextension **__driDriverGetExtensions_r600(void);
-
-PUBLIC const __DRIextension **__driDriverGetExtensions_r600(void)
-{
- globalDriverAPI = &galliumdrm_driver_api;
- return galliumdrm_driver_extensions;
-}
-#endif
-
-static struct pipe_screen *
-pipe_r600_create_screen(int fd)
-{
- struct radeon_winsys *rw;
-
- rw = radeon_drm_winsys_create(fd, r600_screen_create);
- return rw ? debug_screen_wrap(rw->screen) : NULL;
-}
-#endif
-
-#if defined(GALLIUM_RADEONSI)
-#if defined(DRI_TARGET)
-
-const __DRIextension **__driDriverGetExtensions_radeonsi(void);
-
-PUBLIC const __DRIextension **__driDriverGetExtensions_radeonsi(void)
-{
- globalDriverAPI = &galliumdrm_driver_api;
- return galliumdrm_driver_extensions;
-}
-#endif
-
-static struct pipe_screen *
-pipe_radeonsi_create_screen(int fd)
-{
- struct radeon_winsys *rw;
-
- /* First, try amdgpu. */
- rw = amdgpu_winsys_create(fd, radeonsi_screen_create);
-
- if (!rw)
- rw = radeon_drm_winsys_create(fd, radeonsi_screen_create);
-
- return rw ? debug_screen_wrap(rw->screen) : NULL;
-}
-#endif
-
-#if defined(GALLIUM_VMWGFX)
-#if defined(DRI_TARGET)
-
-const __DRIextension **__driDriverGetExtensions_vmwgfx(void);
-
-PUBLIC const __DRIextension **__driDriverGetExtensions_vmwgfx(void)
-{
- globalDriverAPI = &galliumdrm_driver_api;
- return galliumdrm_driver_extensions;
-}
-#endif
-
-static struct pipe_screen *
-pipe_vmwgfx_create_screen(int fd)
-{
- struct svga_winsys_screen *sws;
- struct pipe_screen *screen;
-
- sws = svga_drm_winsys_screen_create(fd);
- if (!sws)
- return NULL;
-
- screen = svga_screen_create(sws);
- return screen ? debug_screen_wrap(screen) : NULL;
-}
-#endif
-
-#if defined(GALLIUM_FREEDRENO)
-#if defined(DRI_TARGET)
-
-const __DRIextension **__driDriverGetExtensions_msm(void);
-
-PUBLIC const __DRIextension **__driDriverGetExtensions_msm(void)
-{
- globalDriverAPI = &galliumdrm_driver_api;
- return galliumdrm_driver_extensions;
-}
-
-const __DRIextension **__driDriverGetExtensions_kgsl(void);
-
-PUBLIC const __DRIextension **__driDriverGetExtensions_kgsl(void)
-{
- globalDriverAPI = &galliumdrm_driver_api;
- return galliumdrm_driver_extensions;
-}
-#endif
-
-static struct pipe_screen *
-pipe_freedreno_create_screen(int fd)
-{
- struct pipe_screen *screen;
-
- screen = fd_drm_screen_create(fd);
- return screen ? debug_screen_wrap(screen) : NULL;
-}
-#endif
-
-#if defined(GALLIUM_VIRGL)
-#if defined(DRI_TARGET)
-
-const __DRIextension **__driDriverGetExtensions_virtio_gpu(void);
-
-PUBLIC const __DRIextension **__driDriverGetExtensions_virtio_gpu(void)
-{
- globalDriverAPI = &galliumdrm_driver_api;
- return galliumdrm_driver_extensions;
-}
-#endif
-
-static struct pipe_screen *
-pipe_virgl_create_screen(int fd)
-{
- struct virgl_winsys *vws;
- struct pipe_screen *screen;
-
- vws = virgl_drm_winsys_create(fd);
- if (!vws)
- return NULL;
-
- screen = virgl_create_screen(vws);
- return screen ? debug_screen_wrap(screen) : NULL;
-}
-#endif
-
-#if defined(GALLIUM_VC4)
-#if defined(DRI_TARGET)
-
-const __DRIextension **__driDriverGetExtensions_vc4(void);
-
-PUBLIC const __DRIextension **__driDriverGetExtensions_vc4(void)
-{
- globalDriverAPI = &galliumdrm_driver_api;
- return galliumdrm_driver_extensions;
-}
-
-#if defined(USE_VC4_SIMULATOR)
-const __DRIextension **__driDriverGetExtensions_i965(void);
-
-/**
- * When building using the simulator (on x86), we advertise ourselves as the
- * i965 driver so that you can just make a directory with a link from
- * i965_dri.so to the built vc4_dri.so, and point LIBGL_DRIVERS_PATH to that
- * on your i965-using host to run the driver under simulation.
- *
- * This is, of course, incompatible with building with the ilo driver, but you
- * shouldn't be building that anyway.
- */
-PUBLIC const __DRIextension **__driDriverGetExtensions_i965(void)
-{
- globalDriverAPI = &galliumdrm_driver_api;
- return galliumdrm_driver_extensions;
-}
-#endif
-
-#endif
-
-static struct pipe_screen *
-pipe_vc4_create_screen(int fd)
-{
- struct pipe_screen *screen;
-
- screen = vc4_drm_screen_create(fd);
- return screen ? debug_screen_wrap(screen) : NULL;
-}
-#endif
-
-inline struct pipe_screen *
-dd_create_screen(int fd)
-{
- driver_name = loader_get_driver_for_fd(fd, _LOADER_GALLIUM);
- if (!driver_name)
- return NULL;
-
-#if defined(GALLIUM_I915)
- if (strcmp(driver_name, "i915") == 0)
- return pipe_i915_create_screen(fd);
- else
-#endif
-#if defined(GALLIUM_ILO)
- if (strcmp(driver_name, "i965") == 0)
- return pipe_ilo_create_screen(fd);
- else
-#endif
-#if defined(GALLIUM_NOUVEAU)
- if (strcmp(driver_name, "nouveau") == 0)
- return pipe_nouveau_create_screen(fd);
- else
-#endif
-#if defined(GALLIUM_R300)
- if (strcmp(driver_name, "r300") == 0)
- return pipe_r300_create_screen(fd);
- else
-#endif
-#if defined(GALLIUM_R600)
- if (strcmp(driver_name, "r600") == 0)
- return pipe_r600_create_screen(fd);
- else
-#endif
-#if defined(GALLIUM_RADEONSI)
- if (strcmp(driver_name, "radeonsi") == 0)
- return pipe_radeonsi_create_screen(fd);
- else
-#endif
-#if defined(GALLIUM_VMWGFX)
- if (strcmp(driver_name, "vmwgfx") == 0)
- return pipe_vmwgfx_create_screen(fd);
- else
-#endif
-#if defined(GALLIUM_FREEDRENO)
- if ((strcmp(driver_name, "kgsl") == 0) || (strcmp(driver_name, "msm") == 0))
- return pipe_freedreno_create_screen(fd);
- else
-#endif
-#if defined(GALLIUM_VIRGL)
- if ((strcmp(driver_name, "virtio_gpu") == 0))
- return pipe_virgl_create_screen(fd);
- else
-#endif
-#if defined(GALLIUM_VC4)
- if (strcmp(driver_name, "vc4") == 0)
- return pipe_vc4_create_screen(fd);
- else
-#if defined(USE_VC4_SIMULATOR)
- if (strcmp(driver_name, "i965") == 0)
- return pipe_vc4_create_screen(fd);
- else
-#endif
-#endif
- return NULL;
-}
-
-inline const char *
-dd_driver_name(void)
-{
- return driver_name;
-}
-
-static const struct drm_conf_ret throttle_ret = {
- DRM_CONF_INT,
- {2},
-};
-
-static const struct drm_conf_ret share_fd_ret = {
- DRM_CONF_BOOL,
- {true},
-};
-
-static inline const struct drm_conf_ret *
-configuration_query(enum drm_conf conf)
-{
- switch (conf) {
- case DRM_CONF_THROTTLE:
- return &throttle_ret;
- case DRM_CONF_SHARE_FD:
- return &share_fd_ret;
- default:
- break;
- }
- return NULL;
-}
-
-inline const struct drm_conf_ret *
-dd_configuration(enum drm_conf conf)
-{
- if (!driver_name)
- return NULL;
-
-#if defined(GALLIUM_I915)
- if (strcmp(driver_name, "i915") == 0)
- return configuration_query(conf);
- else
-#endif
-#if defined(GALLIUM_ILO)
- if (strcmp(driver_name, "i965") == 0)
- return configuration_query(conf);
- else
-#endif
-#if defined(GALLIUM_NOUVEAU)
- if (strcmp(driver_name, "nouveau") == 0)
- return configuration_query(conf);
- else
-#endif
-#if defined(GALLIUM_R300)
- if (strcmp(driver_name, "r300") == 0)
- return configuration_query(conf);
- else
-#endif
-#if defined(GALLIUM_R600)
- if (strcmp(driver_name, "r600") == 0)
- return configuration_query(conf);
- else
-#endif
-#if defined(GALLIUM_RADEONSI)
- if (strcmp(driver_name, "radeonsi") == 0)
- return configuration_query(conf);
- else
-#endif
-#if defined(GALLIUM_VMWGFX)
- if (strcmp(driver_name, "vmwgfx") == 0)
- return configuration_query(conf);
- else
-#endif
-#if defined(GALLIUM_FREEDRENO)
- if ((strcmp(driver_name, "kgsl") == 0) || (strcmp(driver_name, "msm") == 0))
- return configuration_query(conf);
- else
-#endif
-#if defined(GALLIUM_VIRGL)
- if ((strcmp(driver_name, "virtio_gpu") == 0))
- return configuration_query(conf);
- else
-#endif
-#if defined(GALLIUM_VC4)
- if (strcmp(driver_name, "vc4") == 0)
- return configuration_query(conf);
- else
-#if defined(USE_VC4_SIMULATOR)
- if (strcmp(driver_name, "i965") == 0)
- return configuration_query(conf);
- else
-#endif
-#endif
- return NULL;
-}
-#endif /* INLINE_DRM_HELPER_H */
diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
index f3693fb1f39..a9ab16f2b54 100644
--- a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
@@ -69,69 +69,4 @@ sw_screen_create(struct sw_winsys *winsys)
return sw_screen_create_named(winsys, driver);
}
-#if defined(GALLIUM_SOFTPIPE)
-#if defined(DRI_TARGET)
-#include "target-helpers/inline_debug_helper.h"
-#include "sw/dri/dri_sw_winsys.h"
-#include "dri_screen.h"
-
-const __DRIextension **__driDriverGetExtensions_swrast(void);
-
-PUBLIC const __DRIextension **__driDriverGetExtensions_swrast(void)
-{
- globalDriverAPI = &galliumsw_driver_api;
- return galliumsw_driver_extensions;
-}
-
-inline struct pipe_screen *
-drisw_create_screen(struct drisw_loader_funcs *lf)
-{
- struct sw_winsys *winsys = NULL;
- struct pipe_screen *screen = NULL;
-
- winsys = dri_create_sw_winsys(lf);
- if (winsys == NULL)
- return NULL;
-
- screen = sw_screen_create(winsys);
- if (screen == NULL) {
- winsys->destroy(winsys);
- return NULL;
- }
-
- screen = debug_screen_wrap(screen);
- return screen;
-}
-#endif // DRI_TARGET
-
-#if defined(NINE_TARGET)
-#include "sw/wrapper/wrapper_sw_winsys.h"
-#include "target-helpers/inline_debug_helper.h"
-
-extern struct pipe_screen *ninesw_create_screen(struct pipe_screen *screen);
-
-inline struct pipe_screen *
-ninesw_create_screen(struct pipe_screen *pscreen)
-{
- struct sw_winsys *winsys = NULL;
- struct pipe_screen *screen = NULL;
-
- winsys = wrapper_sw_winsys_wrap_pipe_screen(pscreen);
- if (winsys == NULL)
- return NULL;
-
- screen = sw_screen_create(winsys);
- if (screen == NULL) {
- winsys->destroy(winsys);
- return NULL;
- }
-
- screen = debug_screen_wrap(screen);
- return screen;
-}
-#endif // NINE_TARGET
-
-#endif // GALLIUM_SOFTPIPE
-
-
#endif
diff --git a/src/gallium/auxiliary/util/u_dl.c b/src/gallium/auxiliary/util/u_dl.c
index aca435d6cad..9b97d8dc4b9 100644
--- a/src/gallium/auxiliary/util/u_dl.c
+++ b/src/gallium/auxiliary/util/u_dl.c
@@ -45,7 +45,7 @@ struct util_dl_library *
util_dl_open(const char *filename)
{
#if defined(PIPE_OS_UNIX)
- return (struct util_dl_library *)dlopen(filename, RTLD_LAZY | RTLD_GLOBAL);
+ return (struct util_dl_library *)dlopen(filename, RTLD_LAZY | RTLD_LOCAL);
#elif defined(PIPE_OS_WINDOWS)
return (struct util_dl_library *)LoadLibraryA(filename);
#else
diff --git a/src/gallium/auxiliary/util/u_format.csv b/src/gallium/auxiliary/util/u_format.csv
index d3b77e6b99b..c26d7331d4c 100644
--- a/src/gallium/auxiliary/util/u_format.csv
+++ b/src/gallium/auxiliary/util/u_format.csv
@@ -202,6 +202,36 @@ PIPE_FORMAT_BPTC_SRGBA , bptc, 4, 4, x128, , , , xyzw, sr
PIPE_FORMAT_BPTC_RGB_FLOAT , bptc, 4, 4, x128, , , , xyz1, rgb
PIPE_FORMAT_BPTC_RGB_UFLOAT , bptc, 4, 4, x128, , , , xyz1, rgb
+PIPE_FORMAT_ASTC_4x4 , astc, 4, 4, x128, , , , xyzw, rgb
+PIPE_FORMAT_ASTC_5x4 , astc, 5, 4, x128, , , , xyzw, rgb
+PIPE_FORMAT_ASTC_5x5 , astc, 5, 5, x128, , , , xyzw, rgb
+PIPE_FORMAT_ASTC_6x5 , astc, 6, 5, x128, , , , xyzw, rgb
+PIPE_FORMAT_ASTC_6x6 , astc, 6, 6, x128, , , , xyzw, rgb
+PIPE_FORMAT_ASTC_8x5 , astc, 8, 5, x128, , , , xyzw, rgb
+PIPE_FORMAT_ASTC_8x6 , astc, 8, 6, x128, , , , xyzw, rgb
+PIPE_FORMAT_ASTC_8x8 , astc, 8, 8, x128, , , , xyzw, rgb
+PIPE_FORMAT_ASTC_10x5 , astc,10, 5, x128, , , , xyzw, rgb
+PIPE_FORMAT_ASTC_10x6 , astc,10, 6, x128, , , , xyzw, rgb
+PIPE_FORMAT_ASTC_10x8 , astc,10, 8, x128, , , , xyzw, rgb
+PIPE_FORMAT_ASTC_10x10 , astc,10,10, x128, , , , xyzw, rgb
+PIPE_FORMAT_ASTC_12x10 , astc,12,10, x128, , , , xyzw, rgb
+PIPE_FORMAT_ASTC_12x12 , astc,12,12, x128, , , , xyzw, rgb
+
+PIPE_FORMAT_ASTC_4x4_SRGB , astc, 4, 4, x128, , , , xyzw, srgb
+PIPE_FORMAT_ASTC_5x4_SRGB , astc, 5, 4, x128, , , , xyzw, srgb
+PIPE_FORMAT_ASTC_5x5_SRGB , astc, 5, 5, x128, , , , xyzw, srgb
+PIPE_FORMAT_ASTC_6x5_SRGB , astc, 6, 5, x128, , , , xyzw, srgb
+PIPE_FORMAT_ASTC_6x6_SRGB , astc, 6, 6, x128, , , , xyzw, srgb
+PIPE_FORMAT_ASTC_8x5_SRGB , astc, 8, 5, x128, , , , xyzw, srgb
+PIPE_FORMAT_ASTC_8x6_SRGB , astc, 8, 6, x128, , , , xyzw, srgb
+PIPE_FORMAT_ASTC_8x8_SRGB , astc, 8, 8, x128, , , , xyzw, srgb
+PIPE_FORMAT_ASTC_10x5_SRGB , astc,10, 5, x128, , , , xyzw, srgb
+PIPE_FORMAT_ASTC_10x6_SRGB , astc,10, 6, x128, , , , xyzw, srgb
+PIPE_FORMAT_ASTC_10x8_SRGB , astc,10, 8, x128, , , , xyzw, srgb
+PIPE_FORMAT_ASTC_10x10_SRGB , astc,10,10, x128, , , , xyzw, srgb
+PIPE_FORMAT_ASTC_12x10_SRGB , astc,12,10, x128, , , , xyzw, srgb
+PIPE_FORMAT_ASTC_12x12_SRGB , astc,12,12, x128, , , , xyzw, srgb
+
# Straightforward D3D10-like formats (also used for
# vertex buffer element description)
#
diff --git a/src/gallium/auxiliary/util/u_format.h b/src/gallium/auxiliary/util/u_format.h
index a1b1b28fa41..ffdb864fa83 100644
--- a/src/gallium/auxiliary/util/u_format.h
+++ b/src/gallium/auxiliary/util/u_format.h
@@ -84,9 +84,14 @@ enum util_format_layout {
UTIL_FORMAT_LAYOUT_BPTC = 7,
/**
+ * ASTC
+ */
+ UTIL_FORMAT_LAYOUT_ASTC = 8,
+
+ /**
* Everything else that doesn't fit in any of the above layouts.
*/
- UTIL_FORMAT_LAYOUT_OTHER = 8
+ UTIL_FORMAT_LAYOUT_OTHER = 9
};
@@ -481,6 +486,7 @@ util_format_is_compressed(enum pipe_format format)
case UTIL_FORMAT_LAYOUT_RGTC:
case UTIL_FORMAT_LAYOUT_ETC:
case UTIL_FORMAT_LAYOUT_BPTC:
+ case UTIL_FORMAT_LAYOUT_ASTC:
/* XXX add other formats in the future */
return TRUE;
default:
@@ -924,6 +930,35 @@ util_format_srgb(enum pipe_format format)
return PIPE_FORMAT_B5G6R5_SRGB;
case PIPE_FORMAT_BPTC_RGBA_UNORM:
return PIPE_FORMAT_BPTC_SRGBA;
+ case PIPE_FORMAT_ASTC_4x4:
+ return PIPE_FORMAT_ASTC_4x4_SRGB;
+ case PIPE_FORMAT_ASTC_5x4:
+ return PIPE_FORMAT_ASTC_5x4_SRGB;
+ case PIPE_FORMAT_ASTC_5x5:
+ return PIPE_FORMAT_ASTC_5x5_SRGB;
+ case PIPE_FORMAT_ASTC_6x5:
+ return PIPE_FORMAT_ASTC_6x5_SRGB;
+ case PIPE_FORMAT_ASTC_6x6:
+ return PIPE_FORMAT_ASTC_6x6_SRGB;
+ case PIPE_FORMAT_ASTC_8x5:
+ return PIPE_FORMAT_ASTC_8x5_SRGB;
+ case PIPE_FORMAT_ASTC_8x6:
+ return PIPE_FORMAT_ASTC_8x6_SRGB;
+ case PIPE_FORMAT_ASTC_8x8:
+ return PIPE_FORMAT_ASTC_8x8_SRGB;
+ case PIPE_FORMAT_ASTC_10x5:
+ return PIPE_FORMAT_ASTC_10x5_SRGB;
+ case PIPE_FORMAT_ASTC_10x6:
+ return PIPE_FORMAT_ASTC_10x6_SRGB;
+ case PIPE_FORMAT_ASTC_10x8:
+ return PIPE_FORMAT_ASTC_10x8_SRGB;
+ case PIPE_FORMAT_ASTC_10x10:
+ return PIPE_FORMAT_ASTC_10x10_SRGB;
+ case PIPE_FORMAT_ASTC_12x10:
+ return PIPE_FORMAT_ASTC_12x10_SRGB;
+ case PIPE_FORMAT_ASTC_12x12:
+ return PIPE_FORMAT_ASTC_12x12_SRGB;
+
default:
return PIPE_FORMAT_NONE;
}
@@ -971,6 +1006,34 @@ util_format_linear(enum pipe_format format)
return PIPE_FORMAT_B5G6R5_UNORM;
case PIPE_FORMAT_BPTC_SRGBA:
return PIPE_FORMAT_BPTC_RGBA_UNORM;
+ case PIPE_FORMAT_ASTC_4x4_SRGB:
+ return PIPE_FORMAT_ASTC_4x4;
+ case PIPE_FORMAT_ASTC_5x4_SRGB:
+ return PIPE_FORMAT_ASTC_5x4;
+ case PIPE_FORMAT_ASTC_5x5_SRGB:
+ return PIPE_FORMAT_ASTC_5x5;
+ case PIPE_FORMAT_ASTC_6x5_SRGB:
+ return PIPE_FORMAT_ASTC_6x5;
+ case PIPE_FORMAT_ASTC_6x6_SRGB:
+ return PIPE_FORMAT_ASTC_6x6;
+ case PIPE_FORMAT_ASTC_8x5_SRGB:
+ return PIPE_FORMAT_ASTC_8x5;
+ case PIPE_FORMAT_ASTC_8x6_SRGB:
+ return PIPE_FORMAT_ASTC_8x6;
+ case PIPE_FORMAT_ASTC_8x8_SRGB:
+ return PIPE_FORMAT_ASTC_8x8;
+ case PIPE_FORMAT_ASTC_10x5_SRGB:
+ return PIPE_FORMAT_ASTC_10x5;
+ case PIPE_FORMAT_ASTC_10x6_SRGB:
+ return PIPE_FORMAT_ASTC_10x6;
+ case PIPE_FORMAT_ASTC_10x8_SRGB:
+ return PIPE_FORMAT_ASTC_10x8;
+ case PIPE_FORMAT_ASTC_10x10_SRGB:
+ return PIPE_FORMAT_ASTC_10x10;
+ case PIPE_FORMAT_ASTC_12x10_SRGB:
+ return PIPE_FORMAT_ASTC_12x10;
+ case PIPE_FORMAT_ASTC_12x12_SRGB:
+ return PIPE_FORMAT_ASTC_12x12;
default:
return format;
}
diff --git a/src/gallium/auxiliary/util/u_format_fake.c b/src/gallium/auxiliary/util/u_format_fake.c
deleted file mode 100644
index 77e896d27bd..00000000000
--- a/src/gallium/auxiliary/util/u_format_fake.c
+++ /dev/null
@@ -1,37 +0,0 @@
-#include "u_format.h"
-#include "u_format_fake.h"
-
-#define fake(format) \
-void \
-util_format_##format##_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) {assert(0);} \
-\
-void \
-util_format_##format##_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) {assert(0);} \
-\
-void \
-util_format_##format##_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) {assert(0);} \
-\
-void \
-util_format_##format##_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) {assert(0);} \
-\
-void \
-util_format_##format##_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height) {assert(0);} \
-\
-void \
-util_format_##format##_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) {assert(0);}
-
-fake(bptc_rgba_unorm)
-fake(bptc_srgba)
-fake(bptc_rgb_float)
-fake(bptc_rgb_ufloat)
-
-fake(etc2_rgb8)
-fake(etc2_srgb8)
-fake(etc2_rgb8a1)
-fake(etc2_srgb8a1)
-fake(etc2_rgba8)
-fake(etc2_srgba8)
-fake(etc2_r11_unorm)
-fake(etc2_r11_snorm)
-fake(etc2_rg11_unorm)
-fake(etc2_rg11_snorm)
diff --git a/src/gallium/auxiliary/util/u_format_fake.h b/src/gallium/auxiliary/util/u_format_fake.h
deleted file mode 100644
index e6bfd4e1594..00000000000
--- a/src/gallium/auxiliary/util/u_format_fake.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2011 Red Hat Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- **************************************************************************/
-
-#ifndef U_FORMAT_FAKE_H_
-#define U_FORMAT_FAKE_H_
-
-#define __format_fake(format) \
-void \
-util_format_##format##_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j); \
-\
-void \
-util_format_##format##_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); \
-\
-void \
-util_format_##format##_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); \
-\
-void \
-util_format_##format##_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); \
-\
-void \
-util_format_##format##_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); \
-\
-void \
-util_format_##format##_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j);
-
-__format_fake(bptc_rgba_unorm)
-__format_fake(bptc_srgba)
-__format_fake(bptc_rgb_float)
-__format_fake(bptc_rgb_ufloat)
-
-__format_fake(etc2_rgb8)
-__format_fake(etc2_srgb8)
-__format_fake(etc2_rgb8a1)
-__format_fake(etc2_srgb8a1)
-__format_fake(etc2_rgba8)
-__format_fake(etc2_srgba8)
-__format_fake(etc2_r11_unorm)
-__format_fake(etc2_r11_snorm)
-__format_fake(etc2_rg11_unorm)
-__format_fake(etc2_rg11_snorm)
-
-#endif
diff --git a/src/gallium/auxiliary/util/u_format_pack.py b/src/gallium/auxiliary/util/u_format_pack.py
index fb42de723c4..d4bb1de4cb5 100644
--- a/src/gallium/auxiliary/util/u_format_pack.py
+++ b/src/gallium/auxiliary/util/u_format_pack.py
@@ -686,7 +686,7 @@ def generate_format_fetch(format, dst_channel, dst_native_type, dst_suffix):
def is_format_hand_written(format):
- return format.layout in ('s3tc', 'rgtc', 'etc', 'bptc', 'subsampled', 'other') or format.colorspace == ZS
+ return format.layout in ('s3tc', 'rgtc', 'etc', 'bptc', 'astc', 'subsampled', 'other') or format.colorspace == ZS
def generate(formats):
diff --git a/src/gallium/auxiliary/util/u_format_table.py b/src/gallium/auxiliary/util/u_format_table.py
index aceb0caf7e1..879d10ff01d 100755
--- a/src/gallium/auxiliary/util/u_format_table.py
+++ b/src/gallium/auxiliary/util/u_format_table.py
@@ -90,7 +90,6 @@ def write_format_table(formats):
print '#include "u_format_rgtc.h"'
print '#include "u_format_latc.h"'
print '#include "u_format_etc.h"'
- print '#include "u_format_fake.h"'
print
u_format_pack.generate(formats)
@@ -139,10 +138,15 @@ def write_format_table(formats):
u_format_pack.print_channels(format, do_channel_array)
u_format_pack.print_channels(format, do_swizzle_array)
print " %s," % (colorspace_map(format.colorspace),)
- if format.colorspace != ZS and not format.is_pure_color():
+ access = True
+ if format.layout in ('bptc', 'astc'):
+ access = False
+ if format.layout == 'etc' and format.short_name() != 'etc1_rgb8':
+ access = False
+ if format.colorspace != ZS and not format.is_pure_color() and access:
print " &util_format_%s_unpack_rgba_8unorm," % format.short_name()
print " &util_format_%s_pack_rgba_8unorm," % format.short_name()
- if format.layout == 's3tc' or format.layout == 'rgtc' or format.layout == 'bptc':
+ if format.layout == 's3tc' or format.layout == 'rgtc':
print " &util_format_%s_fetch_rgba_8unorm," % format.short_name()
else:
print " NULL, /* fetch_rgba_8unorm */"
diff --git a/src/gallium/auxiliary/vl/vl_winsys.h b/src/gallium/auxiliary/vl/vl_winsys.h
index df01917466f..1af7653d650 100644
--- a/src/gallium/auxiliary/vl/vl_winsys.h
+++ b/src/gallium/auxiliary/vl/vl_winsys.h
@@ -42,34 +42,31 @@ struct pipe_loader_device;
struct vl_screen
{
- struct pipe_screen *pscreen;
- struct pipe_loader_device *dev;
-};
+ void (*destroy)(struct vl_screen *vscreen);
-struct vl_screen*
-vl_screen_create(Display *display, int screen);
+ struct pipe_resource *
+ (*texture_from_drawable)(struct vl_screen *vscreen, void *drawable);
-void vl_screen_destroy(struct vl_screen *vscreen);
+ struct u_rect *
+ (*get_dirty_area)(struct vl_screen *vscreen);
-struct pipe_resource*
-vl_screen_texture_from_drawable(struct vl_screen *vscreen, Drawable drawable);
+ uint64_t
+ (*get_timestamp)(struct vl_screen *vscreen, void *drawable);
-struct u_rect *
-vl_screen_get_dirty_area(struct vl_screen *vscreen);
+ void
+ (*set_next_timestamp)(struct vl_screen *vscreen, uint64_t stamp);
-uint64_t
-vl_screen_get_timestamp(struct vl_screen *vscreen, Drawable drawable);
+ void *
+ (*get_private)(struct vl_screen *vscreen);
-void
-vl_screen_set_next_timestamp(struct vl_screen *vscreen, uint64_t stamp);
+ struct pipe_screen *pscreen;
+ struct pipe_loader_device *dev;
+};
-void*
-vl_screen_get_private(struct vl_screen *vscreen);
+struct vl_screen *
+vl_dri2_screen_create(Display *display, int screen);
-struct vl_screen*
+struct vl_screen *
vl_drm_screen_create(int fd);
-void
-vl_drm_screen_destroy(struct vl_screen *vscreen);
-
#endif
diff --git a/src/gallium/auxiliary/vl/vl_winsys_dri.c b/src/gallium/auxiliary/vl/vl_winsys_dri.c
index 3b1b87f9523..ae0d4cdee1b 100644
--- a/src/gallium/auxiliary/vl/vl_winsys_dri.c
+++ b/src/gallium/auxiliary/vl/vl_winsys_dri.c
@@ -73,24 +73,27 @@ struct vl_dri_screen
int64_t last_ust, ns_frame, last_msc, next_msc;
};
-static const unsigned int attachments[1] = { XCB_DRI2_ATTACHMENT_BUFFER_BACK_LEFT };
+static const unsigned attachments[1] = { XCB_DRI2_ATTACHMENT_BUFFER_BACK_LEFT };
+
+static void vl_dri2_screen_destroy(struct vl_screen *vscreen);
static void
-vl_dri2_handle_stamps(struct vl_dri_screen* scrn,
+vl_dri2_handle_stamps(struct vl_dri_screen *scrn,
uint32_t ust_hi, uint32_t ust_lo,
uint32_t msc_hi, uint32_t msc_lo)
{
int64_t ust = ((((uint64_t)ust_hi) << 32) | ust_lo) * 1000;
int64_t msc = (((uint64_t)msc_hi) << 32) | msc_lo;
- if (scrn->last_ust && scrn->last_msc && (ust > scrn->last_ust) && (msc > scrn->last_msc))
+ if (scrn->last_ust && (ust > scrn->last_ust) &&
+ scrn->last_msc && (msc > scrn->last_msc))
scrn->ns_frame = (ust - scrn->last_ust) / (msc - scrn->last_msc);
scrn->last_ust = ust;
scrn->last_msc = msc;
}
-static xcb_dri2_get_buffers_reply_t*
+static xcb_dri2_get_buffers_reply_t *
vl_dri2_get_flush_reply(struct vl_dri_screen *scrn)
{
xcb_dri2_wait_sbc_reply_t *wait_sbc_reply;
@@ -120,7 +123,7 @@ vl_dri2_flush_frontbuffer(struct pipe_screen *screen,
unsigned level, unsigned layer,
void *context_private, struct pipe_box *sub_box)
{
- struct vl_dri_screen *scrn = (struct vl_dri_screen*)context_private;
+ struct vl_dri_screen *scrn = (struct vl_dri_screen *)context_private;
uint32_t msc_hi, msc_lo;
assert(screen);
@@ -132,9 +135,11 @@ vl_dri2_flush_frontbuffer(struct pipe_screen *screen,
msc_hi = scrn->next_msc >> 32;
msc_lo = scrn->next_msc & 0xFFFFFFFF;
- scrn->swap_cookie = xcb_dri2_swap_buffers_unchecked(scrn->conn, scrn->drawable, msc_hi, msc_lo, 0, 0, 0, 0);
+ scrn->swap_cookie = xcb_dri2_swap_buffers_unchecked(scrn->conn, scrn->drawable,
+ msc_hi, msc_lo, 0, 0, 0, 0);
scrn->wait_cookie = xcb_dri2_wait_sbc_unchecked(scrn->conn, scrn->drawable, 0, 0);
- scrn->buffers_cookie = xcb_dri2_get_buffers_unchecked(scrn->conn, scrn->drawable, 1, 1, attachments);
+ scrn->buffers_cookie = xcb_dri2_get_buffers_unchecked(scrn->conn, scrn->drawable,
+ 1, 1, attachments);
scrn->flushed = true;
scrn->current_buffer = !scrn->current_buffer;
@@ -170,10 +175,10 @@ vl_dri2_set_drawable(struct vl_dri_screen *scrn, Drawable drawable)
scrn->drawable = drawable;
}
-struct pipe_resource*
-vl_screen_texture_from_drawable(struct vl_screen *vscreen, Drawable drawable)
+static struct pipe_resource *
+vl_dri2_screen_texture_from_drawable(struct vl_screen *vscreen, void *drawable)
{
- struct vl_dri_screen *scrn = (struct vl_dri_screen*)vscreen;
+ struct vl_dri_screen *scrn = (struct vl_dri_screen *)vscreen;
struct winsys_handle dri2_handle;
struct pipe_resource template, *tex;
@@ -185,11 +190,12 @@ vl_screen_texture_from_drawable(struct vl_screen *vscreen, Drawable drawable)
assert(scrn);
- vl_dri2_set_drawable(scrn, drawable);
+ vl_dri2_set_drawable(scrn, (Drawable)drawable);
reply = vl_dri2_get_flush_reply(scrn);
if (!reply) {
xcb_dri2_get_buffers_cookie_t cookie;
- cookie = xcb_dri2_get_buffers_unchecked(scrn->conn, drawable, 1, 1, attachments);
+ cookie = xcb_dri2_get_buffers_unchecked(scrn->conn, (Drawable)drawable,
+ 1, 1, attachments);
reply = xcb_dri2_get_buffers_reply(scrn->conn, cookie, NULL);
}
if (!reply)
@@ -241,32 +247,33 @@ vl_screen_texture_from_drawable(struct vl_screen *vscreen, Drawable drawable)
template.bind = PIPE_BIND_RENDER_TARGET;
template.flags = 0;
- tex = scrn->base.pscreen->resource_from_handle(scrn->base.pscreen, &template, &dri2_handle);
+ tex = scrn->base.pscreen->resource_from_handle(scrn->base.pscreen, &template,
+ &dri2_handle);
free(reply);
return tex;
}
-struct u_rect *
-vl_screen_get_dirty_area(struct vl_screen *vscreen)
+static struct u_rect *
+vl_dri2_screen_get_dirty_area(struct vl_screen *vscreen)
{
- struct vl_dri_screen *scrn = (struct vl_dri_screen*)vscreen;
+ struct vl_dri_screen *scrn = (struct vl_dri_screen *)vscreen;
assert(scrn);
return &scrn->dirty_areas[scrn->current_buffer];
}
-uint64_t
-vl_screen_get_timestamp(struct vl_screen *vscreen, Drawable drawable)
+static uint64_t
+vl_dri2_screen_get_timestamp(struct vl_screen *vscreen, void *drawable)
{
- struct vl_dri_screen *scrn = (struct vl_dri_screen*)vscreen;
+ struct vl_dri_screen *scrn = (struct vl_dri_screen *)vscreen;
xcb_dri2_get_msc_cookie_t cookie;
xcb_dri2_get_msc_reply_t *reply;
assert(scrn);
- vl_dri2_set_drawable(scrn, drawable);
+ vl_dri2_set_drawable(scrn, (Drawable)drawable);
if (!scrn->last_ust) {
- cookie = xcb_dri2_get_msc_unchecked(scrn->conn, drawable);
+ cookie = xcb_dri2_get_msc_unchecked(scrn->conn, (Drawable)drawable);
reply = xcb_dri2_get_msc_reply(scrn->conn, cookie, NULL);
if (reply) {
@@ -278,19 +285,20 @@ vl_screen_get_timestamp(struct vl_screen *vscreen, Drawable drawable)
return scrn->last_ust;
}
-void
-vl_screen_set_next_timestamp(struct vl_screen *vscreen, uint64_t stamp)
+static void
+vl_dri2_screen_set_next_timestamp(struct vl_screen *vscreen, uint64_t stamp)
{
- struct vl_dri_screen *scrn = (struct vl_dri_screen*)vscreen;
+ struct vl_dri_screen *scrn = (struct vl_dri_screen *)vscreen;
assert(scrn);
if (stamp && scrn->last_ust && scrn->ns_frame && scrn->last_msc)
- scrn->next_msc = ((int64_t)stamp - scrn->last_ust + scrn->ns_frame/2) / scrn->ns_frame + scrn->last_msc;
+ scrn->next_msc = ((int64_t)stamp - scrn->last_ust + scrn->ns_frame/2) /
+ scrn->ns_frame + scrn->last_msc;
else
scrn->next_msc = 0;
}
-void*
-vl_screen_get_private(struct vl_screen *vscreen)
+static void *
+vl_dri2_screen_get_private(struct vl_screen *vscreen)
{
return vscreen;
}
@@ -305,8 +313,8 @@ get_xcb_screen(xcb_screen_iterator_t iter, int screen)
return NULL;
}
-struct vl_screen*
-vl_screen_create(Display *display, int screen)
+struct vl_screen *
+vl_dri2_screen_create(Display *display, int screen)
{
struct vl_dri_screen *scrn;
const xcb_query_extension_reply_t *extension;
@@ -320,7 +328,7 @@ vl_screen_create(Display *display, int screen)
xcb_generic_error_t *error = NULL;
char *device_name;
int fd, device_name_length;
- unsigned int driverType;
+ unsigned driverType;
drm_magic_t magic;
@@ -340,7 +348,9 @@ vl_screen_create(Display *display, int screen)
if (!(extension && extension->present))
goto free_screen;
- dri2_query_cookie = xcb_dri2_query_version (scrn->conn, XCB_DRI2_MAJOR_VERSION, XCB_DRI2_MINOR_VERSION);
+ dri2_query_cookie = xcb_dri2_query_version (scrn->conn,
+ XCB_DRI2_MAJOR_VERSION,
+ XCB_DRI2_MINOR_VERSION);
dri2_query = xcb_dri2_query_version_reply (scrn->conn, dri2_query_cookie, &error);
if (dri2_query == NULL || error != NULL || dri2_query->minor_version < 2)
goto free_query;
@@ -352,7 +362,7 @@ vl_screen_create(Display *display, int screen)
{
char *prime = getenv("DRI_PRIME");
if (prime) {
- unsigned int primeid;
+ unsigned primeid;
errno = 0;
primeid = strtoul(prime, NULL, 0);
if (errno == 0)
@@ -362,9 +372,12 @@ vl_screen_create(Display *display, int screen)
}
#endif
- connect_cookie = xcb_dri2_connect_unchecked(scrn->conn, get_xcb_screen(s, screen)->root, driverType);
+ connect_cookie = xcb_dri2_connect_unchecked(scrn->conn,
+ get_xcb_screen(s, screen)->root,
+ driverType);
connect = xcb_dri2_connect_reply(scrn->conn, connect_cookie, NULL);
- if (connect == NULL || connect->driver_name_length + connect->device_name_length == 0)
+ if (connect == NULL ||
+ connect->driver_name_length + connect->device_name_length == 0)
goto free_connect;
device_name_length = xcb_dri2_connect_device_name_length(connect);
@@ -381,22 +394,26 @@ vl_screen_create(Display *display, int screen)
if (drmGetMagic(fd, &magic))
goto free_connect;
- authenticate_cookie = xcb_dri2_authenticate_unchecked(scrn->conn, get_xcb_screen(s, screen)->root, magic);
+ authenticate_cookie = xcb_dri2_authenticate_unchecked(scrn->conn,
+ get_xcb_screen(s, screen)->root,
+ magic);
authenticate = xcb_dri2_authenticate_reply(scrn->conn, authenticate_cookie, NULL);
if (authenticate == NULL || !authenticate->authenticated)
goto free_authenticate;
-#if GALLIUM_STATIC_TARGETS
- scrn->base.pscreen = dd_create_screen(fd);
-#else
- if (pipe_loader_drm_probe_fd(&scrn->base.dev, fd))
- scrn->base.pscreen = pipe_loader_create_screen(scrn->base.dev, PIPE_SEARCH_DIR);
-#endif // GALLIUM_STATIC_TARGETS
+ if (pipe_loader_drm_probe_fd(&scrn->base.dev, dup(fd)))
+ scrn->base.pscreen = pipe_loader_create_screen(scrn->base.dev);
if (!scrn->base.pscreen)
goto release_pipe;
+ scrn->base.destroy = vl_dri2_screen_destroy;
+ scrn->base.texture_from_drawable = vl_dri2_screen_texture_from_drawable;
+ scrn->base.get_dirty_area = vl_dri2_screen_get_dirty_area;
+ scrn->base.get_timestamp = vl_dri2_screen_get_timestamp;
+ scrn->base.set_next_timestamp = vl_dri2_screen_set_next_timestamp;
+ scrn->base.get_private = vl_dri2_screen_get_private;
scrn->base.pscreen->flush_frontbuffer = vl_dri2_flush_frontbuffer;
vl_compositor_reset_dirty_area(&scrn->dirty_areas[0]);
vl_compositor_reset_dirty_area(&scrn->dirty_areas[1]);
@@ -409,10 +426,8 @@ vl_screen_create(Display *display, int screen)
return &scrn->base;
release_pipe:
-#if !GALLIUM_STATIC_TARGETS
if (scrn->base.dev)
pipe_loader_release(&scrn->base.dev, 1);
-#endif // !GALLIUM_STATIC_TARGETS
free_authenticate:
free(authenticate);
free_connect:
@@ -426,9 +441,10 @@ free_screen:
return NULL;
}
-void vl_screen_destroy(struct vl_screen *vscreen)
+static void
+vl_dri2_screen_destroy(struct vl_screen *vscreen)
{
- struct vl_dri_screen *scrn = (struct vl_dri_screen*)vscreen;
+ struct vl_dri_screen *scrn = (struct vl_dri_screen *)vscreen;
assert(vscreen);
@@ -440,8 +456,6 @@ void vl_screen_destroy(struct vl_screen *vscreen)
vl_dri2_destroy_drawable(scrn);
scrn->base.pscreen->destroy(scrn->base.pscreen);
-#if !GALLIUM_STATIC_TARGETS
pipe_loader_release(&scrn->base.dev, 1);
-#endif // !GALLIUM_STATIC_TARGETS
FREE(scrn);
}
diff --git a/src/gallium/auxiliary/vl/vl_winsys_drm.c b/src/gallium/auxiliary/vl/vl_winsys_drm.c
index 1167fcf6a90..f993e2c7727 100644
--- a/src/gallium/auxiliary/vl/vl_winsys_drm.c
+++ b/src/gallium/auxiliary/vl/vl_winsys_drm.c
@@ -34,7 +34,10 @@
#include "util/u_memory.h"
#include "vl/vl_winsys.h"
-struct vl_screen*
+static void
+vl_drm_screen_destroy(struct vl_screen *vscreen);
+
+struct vl_screen *
vl_drm_screen_create(int fd)
{
struct vl_screen *vscreen;
@@ -43,35 +46,34 @@ vl_drm_screen_create(int fd)
if (!vscreen)
return NULL;
-#if GALLIUM_STATIC_TARGETS
- vscreen->pscreen = dd_create_screen(fd);
-#else
- if (pipe_loader_drm_probe_fd(&vscreen->dev, dup(fd))) {
- vscreen->pscreen =
- pipe_loader_create_screen(vscreen->dev, PIPE_SEARCH_DIR);
- if (!vscreen->pscreen)
- pipe_loader_release(&vscreen->dev, 1);
- }
-#endif
+ if (pipe_loader_drm_probe_fd(&vscreen->dev, dup(fd)))
+ vscreen->pscreen = pipe_loader_create_screen(vscreen->dev);
- if (!vscreen->pscreen) {
- FREE(vscreen);
- return NULL;
- }
+ if (!vscreen->pscreen)
+ goto error;
+ vscreen->destroy = vl_drm_screen_destroy;
+ vscreen->texture_from_drawable = NULL;
+ vscreen->get_dirty_area = NULL;
+ vscreen->get_timestamp = NULL;
+ vscreen->set_next_timestamp = NULL;
+ vscreen->get_private = NULL;
return vscreen;
+
+error:
+ if (vscreen->dev)
+ pipe_loader_release(&vscreen->dev, 1);
+
+ FREE(vscreen);
+ return NULL;
}
-void
+static void
vl_drm_screen_destroy(struct vl_screen *vscreen)
{
assert(vscreen);
vscreen->pscreen->destroy(vscreen->pscreen);
-
-#if !GALLIUM_STATIC_TARGETS
pipe_loader_release(&vscreen->dev, 1);
-#endif
-
FREE(vscreen);
}
diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
index ef235734755..77f708f449c 100644
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 68291 bytes, from 2015-11-17 16:39:59)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 64038 bytes, from 2015-11-17 16:37:36)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2015 by the following authors:
diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
index b5e1ddadde0..a6940dfefea 100644
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 68291 bytes, from 2015-11-17 16:39:59)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 64038 bytes, from 2015-11-17 16:37:36)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2015 by the following authors:
@@ -111,10 +111,14 @@ enum a3xx_vtx_fmt {
VFMT_8_8_SNORM = 53,
VFMT_8_8_8_SNORM = 54,
VFMT_8_8_8_8_SNORM = 55,
- VFMT_10_10_10_2_UINT = 60,
- VFMT_10_10_10_2_UNORM = 61,
- VFMT_10_10_10_2_SINT = 62,
- VFMT_10_10_10_2_SNORM = 63,
+ VFMT_10_10_10_2_UINT = 56,
+ VFMT_10_10_10_2_UNORM = 57,
+ VFMT_10_10_10_2_SINT = 58,
+ VFMT_10_10_10_2_SNORM = 59,
+ VFMT_2_10_10_10_UINT = 60,
+ VFMT_2_10_10_10_UNORM = 61,
+ VFMT_2_10_10_10_SINT = 62,
+ VFMT_2_10_10_10_SNORM = 63,
};
enum a3xx_tex_fmt {
@@ -138,10 +142,12 @@ enum a3xx_tex_fmt {
TFMT_DXT1 = 36,
TFMT_DXT3 = 37,
TFMT_DXT5 = 38,
+ TFMT_2_10_10_10_UNORM = 40,
TFMT_10_10_10_2_UNORM = 41,
TFMT_9_9_9_E5_FLOAT = 42,
TFMT_11_11_10_FLOAT = 43,
TFMT_A8_UNORM = 44,
+ TFMT_L8_UNORM = 45,
TFMT_L8_A8_UNORM = 47,
TFMT_8_UNORM = 48,
TFMT_8_8_UNORM = 49,
@@ -183,6 +189,8 @@ enum a3xx_tex_fmt {
TFMT_32_SINT = 92,
TFMT_32_32_SINT = 93,
TFMT_32_32_32_32_SINT = 95,
+ TFMT_2_10_10_10_UINT = 96,
+ TFMT_10_10_10_2_UINT = 97,
TFMT_ETC2_RG11_SNORM = 112,
TFMT_ETC2_RG11_UNORM = 113,
TFMT_ETC2_R11_SNORM = 114,
@@ -215,6 +223,9 @@ enum a3xx_color_fmt {
RB_R8_UINT = 14,
RB_R8_SINT = 15,
RB_R10G10B10A2_UNORM = 16,
+ RB_A2R10G10B10_UNORM = 17,
+ RB_R10G10B10A2_UINT = 18,
+ RB_A2R10G10B10_UINT = 19,
RB_A8_UNORM = 20,
RB_R8_UNORM = 21,
RB_R16_FLOAT = 24,
@@ -251,25 +262,6 @@ enum a3xx_sp_perfcounter_select {
SP_ALU_ACTIVE_CYCLES = 29,
};
-enum a3xx_rop_code {
- ROP_CLEAR = 0,
- ROP_NOR = 1,
- ROP_AND_INVERTED = 2,
- ROP_COPY_INVERTED = 3,
- ROP_AND_REVERSE = 4,
- ROP_INVERT = 5,
- ROP_XOR = 6,
- ROP_NAND = 7,
- ROP_AND = 8,
- ROP_EQUIV = 9,
- ROP_NOOP = 10,
- ROP_OR_INVERTED = 11,
- ROP_COPY = 12,
- ROP_OR_REVERSE = 13,
- ROP_OR = 14,
- ROP_SET = 15,
-};
-
enum a3xx_rb_blend_opcode {
BLEND_DST_PLUS_SRC = 0,
BLEND_SRC_MINUS_DST = 1,
@@ -1620,12 +1612,24 @@ static inline uint32_t A3XX_VFD_CONTROL_0_STRMFETCHINSTRCNT(uint32_t val)
}
#define REG_A3XX_VFD_CONTROL_1 0x00002241
-#define A3XX_VFD_CONTROL_1_MAXSTORAGE__MASK 0x0000ffff
+#define A3XX_VFD_CONTROL_1_MAXSTORAGE__MASK 0x0000000f
#define A3XX_VFD_CONTROL_1_MAXSTORAGE__SHIFT 0
static inline uint32_t A3XX_VFD_CONTROL_1_MAXSTORAGE(uint32_t val)
{
return ((val) << A3XX_VFD_CONTROL_1_MAXSTORAGE__SHIFT) & A3XX_VFD_CONTROL_1_MAXSTORAGE__MASK;
}
+#define A3XX_VFD_CONTROL_1_MAXTHRESHOLD__MASK 0x000000f0
+#define A3XX_VFD_CONTROL_1_MAXTHRESHOLD__SHIFT 4
+static inline uint32_t A3XX_VFD_CONTROL_1_MAXTHRESHOLD(uint32_t val)
+{
+ return ((val) << A3XX_VFD_CONTROL_1_MAXTHRESHOLD__SHIFT) & A3XX_VFD_CONTROL_1_MAXTHRESHOLD__MASK;
+}
+#define A3XX_VFD_CONTROL_1_MINTHRESHOLD__MASK 0x00000f00
+#define A3XX_VFD_CONTROL_1_MINTHRESHOLD__SHIFT 8
+static inline uint32_t A3XX_VFD_CONTROL_1_MINTHRESHOLD(uint32_t val)
+{
+ return ((val) << A3XX_VFD_CONTROL_1_MINTHRESHOLD__SHIFT) & A3XX_VFD_CONTROL_1_MINTHRESHOLD__MASK;
+}
#define A3XX_VFD_CONTROL_1_REGID4VTX__MASK 0x00ff0000
#define A3XX_VFD_CONTROL_1_REGID4VTX__SHIFT 16
static inline uint32_t A3XX_VFD_CONTROL_1_REGID4VTX(uint32_t val)
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index 3906c9b996e..b8a31d84b3f 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -81,7 +81,9 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */
info->restart_index : 0xffffffff);
+ /* points + psize -> spritelist: */
if (ctx->rasterizer->point_size_per_vertex &&
+ fd3_emit_get_vp(emit)->writes_psize &&
(info->mode == PIPE_PRIM_POINTS))
primtype = DI_PT_POINTLIST_PSIZE;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 8f9c8b0623c..24afbc9e956 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -209,13 +209,19 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
fd3_pipe_sampler_view(tex->textures[i]) :
&dummy_view;
struct fd_resource *rsc = fd_resource(view->base.texture);
- unsigned start = fd_sampler_first_level(&view->base);
- unsigned end = fd_sampler_last_level(&view->base);;
+ if (rsc && rsc->base.b.target == PIPE_BUFFER) {
+ OUT_RELOC(ring, rsc->bo, view->base.u.buf.first_element *
+ util_format_get_blocksize(view->base.format), 0, 0);
+ j = 1;
+ } else {
+ unsigned start = fd_sampler_first_level(&view->base);
+			unsigned end = fd_sampler_last_level(&view->base);
- for (j = 0; j < (end - start + 1); j++) {
- struct fd_resource_slice *slice =
+ for (j = 0; j < (end - start + 1); j++) {
+ struct fd_resource_slice *slice =
fd_resource_slice(rsc, j + start);
- OUT_RELOC(ring, rsc->bo, slice->offset, 0, 0);
+ OUT_RELOC(ring, rsc->bo, slice->offset, 0, 0);
+ }
}
/* pad the remaining entries w/ null: */
@@ -350,7 +356,10 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
unsigned instance_regid = regid(63, 0);
unsigned vtxcnt_regid = regid(63, 0);
+ /* Note that sysvals come *after* normal inputs: */
for (i = 0; i < vp->inputs_count; i++) {
+ if (!vp->inputs[i].compmask)
+ continue;
if (vp->inputs[i].sysval) {
switch(vp->inputs[i].slot) {
case SYSTEM_VALUE_BASE_VERTEX:
@@ -369,18 +378,11 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
unreachable("invalid system value");
break;
}
- } else if (i < vtx->vtx->num_elements && vp->inputs[i].compmask) {
+ } else if (i < vtx->vtx->num_elements) {
last = i;
}
}
- /* hw doesn't like to be configured for zero vbo's, it seems: */
- if ((vtx->vtx->num_elements == 0) &&
- (vertex_regid == regid(63, 0)) &&
- (instance_regid == regid(63, 0)) &&
- (vtxcnt_regid == regid(63, 0)))
- return;
-
for (i = 0, j = 0; i <= last; i++) {
assert(!vp->inputs[i].sysval);
if (vp->inputs[i].compmask) {
@@ -424,6 +426,38 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
}
}
+ /* hw doesn't like to be configured for zero vbo's, it seems: */
+ if (last < 0) {
+ /* just recycle the shader bo, we just need to point to *something*
+ * valid:
+ */
+ struct fd_bo *dummy_vbo = vp->bo;
+ bool switchnext = (vertex_regid != regid(63, 0)) ||
+ (instance_regid != regid(63, 0)) ||
+ (vtxcnt_regid != regid(63, 0));
+
+ OUT_PKT0(ring, REG_A3XX_VFD_FETCH(0), 2);
+ OUT_RING(ring, A3XX_VFD_FETCH_INSTR_0_FETCHSIZE(0) |
+ A3XX_VFD_FETCH_INSTR_0_BUFSTRIDE(0) |
+ COND(switchnext, A3XX_VFD_FETCH_INSTR_0_SWITCHNEXT) |
+ A3XX_VFD_FETCH_INSTR_0_INDEXCODE(0) |
+ A3XX_VFD_FETCH_INSTR_0_STEPRATE(1));
+ OUT_RELOC(ring, dummy_vbo, 0, 0, 0);
+
+ OUT_PKT0(ring, REG_A3XX_VFD_DECODE_INSTR(0), 1);
+ OUT_RING(ring, A3XX_VFD_DECODE_INSTR_CONSTFILL |
+ A3XX_VFD_DECODE_INSTR_WRITEMASK(0x1) |
+ A3XX_VFD_DECODE_INSTR_FORMAT(VFMT_8_UNORM) |
+ A3XX_VFD_DECODE_INSTR_SWAP(XYZW) |
+ A3XX_VFD_DECODE_INSTR_REGID(regid(0,0)) |
+ A3XX_VFD_DECODE_INSTR_SHIFTCNT(1) |
+ A3XX_VFD_DECODE_INSTR_LASTCOMPVALID |
+ COND(switchnext, A3XX_VFD_DECODE_INSTR_SWITCHNEXT));
+
+ total_in = 1;
+ j = 1;
+ }
+
OUT_PKT0(ring, REG_A3XX_VFD_CONTROL_0, 2);
OUT_RING(ring, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(total_in) |
A3XX_VFD_CONTROL_0_PACKETSIZE(2) |
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_format.c b/src/gallium/drivers/freedreno/a3xx/fd3_format.c
index 857d156c869..52ea9444517 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_format.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_format.c
@@ -188,9 +188,13 @@ static struct fd3_format formats[PIPE_FORMAT_COUNT] = {
VT(B10G10R10A2_UNORM, 10_10_10_2_UNORM, R10G10B10A2_UNORM, WXYZ),
_T(B10G10R10X2_UNORM, 10_10_10_2_UNORM, R10G10B10A2_UNORM, WXYZ),
V_(R10G10B10A2_SNORM, 10_10_10_2_SNORM, NONE, WZYX),
+ V_(B10G10R10A2_SNORM, 10_10_10_2_SNORM, NONE, WXYZ),
V_(R10G10B10A2_UINT, 10_10_10_2_UINT, NONE, WZYX),
+ V_(B10G10R10A2_UINT, 10_10_10_2_UINT, NONE, WXYZ),
V_(R10G10B10A2_USCALED, 10_10_10_2_UINT, NONE, WZYX),
+ V_(B10G10R10A2_USCALED, 10_10_10_2_UINT, NONE, WXYZ),
V_(R10G10B10A2_SSCALED, 10_10_10_2_SINT, NONE, WZYX),
+ V_(B10G10R10A2_SSCALED, 10_10_10_2_SINT, NONE, WXYZ),
_T(R11G11B10_FLOAT, 11_11_10_FLOAT, R11G11B10_FLOAT, WZYX),
_T(R9G9B9E5_FLOAT, 9_9_9_E5_FLOAT, NONE, WZYX),
@@ -271,6 +275,16 @@ static struct fd3_format formats[PIPE_FORMAT_COUNT] = {
_T(DXT3_SRGBA, DXT3, NONE, WZYX),
_T(DXT5_RGBA, DXT5, NONE, WZYX),
_T(DXT5_SRGBA, DXT5, NONE, WZYX),
+
+ /* faked */
+ _T(RGTC1_UNORM, 8_8_8_8_UNORM, NONE, WZYX),
+ _T(RGTC1_SNORM, 8_8_8_8_SNORM, NONE, WZYX),
+ _T(RGTC2_UNORM, 8_8_8_8_UNORM, NONE, WZYX),
+ _T(RGTC2_SNORM, 8_8_8_8_SNORM, NONE, WZYX),
+ _T(LATC1_UNORM, 8_8_8_8_UNORM, NONE, WZYX),
+ _T(LATC1_SNORM, 8_8_8_8_SNORM, NONE, WZYX),
+ _T(LATC2_UNORM, 8_8_8_8_UNORM, NONE, WZYX),
+ _T(LATC2_SNORM, 8_8_8_8_SNORM, NONE, WZYX),
};
enum a3xx_vtx_fmt
@@ -310,6 +324,8 @@ fd3_pipe2fetchsize(enum pipe_format format)
{
if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
format = PIPE_FORMAT_Z32_FLOAT;
+ else if (util_format_description(format)->layout == UTIL_FORMAT_LAYOUT_RGTC)
+ format = PIPE_FORMAT_R8G8B8A8_UNORM;
switch (util_format_get_blocksizebits(format) / util_format_get_blockwidth(format)) {
case 8: return TFETCH_1_BYTE;
case 16: return TFETCH_2_BYTE;
@@ -324,6 +340,14 @@ fd3_pipe2fetchsize(enum pipe_format format)
}
}
+unsigned
+fd3_pipe2nblocksx(enum pipe_format format, unsigned width)
+{
+ if (util_format_description(format)->layout == UTIL_FORMAT_LAYOUT_RGTC)
+ format = PIPE_FORMAT_R8G8B8A8_UNORM;
+ return util_format_get_nblocksx(format, width);
+}
+
/* we need to special case a bit the depth/stencil restore, because we are
* using the texture sampler to blit into the depth/stencil buffer, *not*
* into a color buffer. Otherwise fd3_tex_swiz() will do the wrong thing,
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_format.h b/src/gallium/drivers/freedreno/a3xx/fd3_format.h
index 05c5ea3d247..48c503e9a82 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_format.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_format.h
@@ -37,6 +37,7 @@ enum a3xx_color_fmt fd3_pipe2color(enum pipe_format format);
enum pipe_format fd3_gmem_restore_format(enum pipe_format format);
enum a3xx_color_fmt fd3_fs_output_format(enum pipe_format format);
enum a3xx_color_swap fd3_pipe2swap(enum pipe_format format);
+unsigned fd3_pipe2nblocksx(enum pipe_format format, unsigned width);
uint32_t fd3_tex_swiz(enum pipe_format format, unsigned swizzle_r,
unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a);
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
index 2d6ecb2c050..99ae99ea0c1 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
@@ -211,8 +211,7 @@ fd3_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
{
struct fd3_pipe_sampler_view *so = CALLOC_STRUCT(fd3_pipe_sampler_view);
struct fd_resource *rsc = fd_resource(prsc);
- unsigned lvl = fd_sampler_first_level(cso);
- unsigned miplevels = fd_sampler_last_level(cso) - lvl;
+ unsigned lvl;
uint32_t sz2 = 0;
if (!so)
@@ -227,20 +226,34 @@ fd3_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
so->texconst0 =
A3XX_TEX_CONST_0_TYPE(tex_type(prsc->target)) |
A3XX_TEX_CONST_0_FMT(fd3_pipe2tex(cso->format)) |
- A3XX_TEX_CONST_0_MIPLVLS(miplevels) |
fd3_tex_swiz(cso->format, cso->swizzle_r, cso->swizzle_g,
cso->swizzle_b, cso->swizzle_a);
if (util_format_is_srgb(cso->format))
so->texconst0 |= A3XX_TEX_CONST_0_SRGB;
- so->texconst1 =
+ if (prsc->target == PIPE_BUFFER) {
+ lvl = 0;
+ so->texconst1 =
+ A3XX_TEX_CONST_1_FETCHSIZE(fd3_pipe2fetchsize(cso->format)) |
+ A3XX_TEX_CONST_1_WIDTH(cso->u.buf.last_element -
+ cso->u.buf.first_element + 1) |
+ A3XX_TEX_CONST_1_HEIGHT(1);
+ } else {
+ unsigned miplevels;
+
+ lvl = fd_sampler_first_level(cso);
+ miplevels = fd_sampler_last_level(cso) - lvl;
+
+ so->texconst0 |= A3XX_TEX_CONST_0_MIPLVLS(miplevels);
+ so->texconst1 =
A3XX_TEX_CONST_1_FETCHSIZE(fd3_pipe2fetchsize(cso->format)) |
A3XX_TEX_CONST_1_WIDTH(u_minify(prsc->width0, lvl)) |
A3XX_TEX_CONST_1_HEIGHT(u_minify(prsc->height0, lvl));
+ }
/* when emitted, A3XX_TEX_CONST_2_INDX() must be OR'd in: */
so->texconst2 =
- A3XX_TEX_CONST_2_PITCH(util_format_get_nblocksx(cso->format, rsc->slices[lvl].pitch) * rsc->cpp);
+ A3XX_TEX_CONST_2_PITCH(fd3_pipe2nblocksx(cso->format, rsc->slices[lvl].pitch) * rsc->cpp);
switch (prsc->target) {
case PIPE_TEXTURE_1D_ARRAY:
case PIPE_TEXTURE_2D_ARRAY:
diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
index 9f970365464..a450379e98d 100644
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 68291 bytes, from 2015-11-17 16:39:59)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 64038 bytes, from 2015-11-17 16:37:36)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2015 by the following authors:
@@ -47,11 +47,13 @@ enum a4xx_color_fmt {
RB4_R8_UNORM = 2,
RB4_R4G4B4A4_UNORM = 8,
RB4_R5G5B5A1_UNORM = 10,
- RB4_R5G6R5_UNORM = 14,
+ RB4_R5G6B5_UNORM = 14,
RB4_R8G8_UNORM = 15,
RB4_R8G8_SNORM = 16,
RB4_R8G8_UINT = 17,
RB4_R8G8_SINT = 18,
+ RB4_R16_UNORM = 19,
+ RB4_R16_SNORM = 20,
RB4_R16_FLOAT = 21,
RB4_R16_UINT = 22,
RB4_R16_SINT = 23,
@@ -63,12 +65,16 @@ enum a4xx_color_fmt {
RB4_R10G10B10A2_UNORM = 31,
RB4_R10G10B10A2_UINT = 34,
RB4_R11G11B10_FLOAT = 39,
+ RB4_R16G16_UNORM = 40,
+ RB4_R16G16_SNORM = 41,
RB4_R16G16_FLOAT = 42,
RB4_R16G16_UINT = 43,
RB4_R16G16_SINT = 44,
RB4_R32_FLOAT = 45,
RB4_R32_UINT = 46,
RB4_R32_SINT = 47,
+ RB4_R16G16B16A16_UNORM = 52,
+ RB4_R16G16B16A16_SNORM = 53,
RB4_R16G16B16A16_FLOAT = 54,
RB4_R16G16B16A16_UINT = 55,
RB4_R16G16B16A16_SINT = 56,
@@ -106,6 +112,7 @@ enum a4xx_vtx_fmt {
VFMT4_32_32_FIXED = 10,
VFMT4_32_32_32_FIXED = 11,
VFMT4_32_32_32_32_FIXED = 12,
+ VFMT4_11_11_10_FLOAT = 13,
VFMT4_16_SINT = 16,
VFMT4_16_16_SINT = 17,
VFMT4_16_16_16_SINT = 18,
@@ -146,18 +153,19 @@ enum a4xx_vtx_fmt {
VFMT4_8_8_SNORM = 53,
VFMT4_8_8_8_SNORM = 54,
VFMT4_8_8_8_8_SNORM = 55,
- VFMT4_10_10_10_2_UINT = 60,
- VFMT4_10_10_10_2_UNORM = 61,
- VFMT4_10_10_10_2_SINT = 62,
- VFMT4_10_10_10_2_SNORM = 63,
+ VFMT4_10_10_10_2_UINT = 56,
+ VFMT4_10_10_10_2_UNORM = 57,
+ VFMT4_10_10_10_2_SINT = 58,
+ VFMT4_10_10_10_2_SNORM = 59,
};
enum a4xx_tex_fmt {
TFMT4_5_6_5_UNORM = 11,
- TFMT4_5_5_5_1_UNORM = 10,
+ TFMT4_5_5_5_1_UNORM = 9,
TFMT4_4_4_4_4_UNORM = 8,
TFMT4_X8Z24_UNORM = 71,
TFMT4_10_10_10_2_UNORM = 33,
+ TFMT4_10_10_10_2_UINT = 34,
TFMT4_A8_UNORM = 3,
TFMT4_L8_A8_UNORM = 13,
TFMT4_8_UNORM = 4,
@@ -172,6 +180,12 @@ enum a4xx_tex_fmt {
TFMT4_8_SINT = 7,
TFMT4_8_8_SINT = 17,
TFMT4_8_8_8_8_SINT = 31,
+ TFMT4_16_UNORM = 18,
+ TFMT4_16_16_UNORM = 38,
+ TFMT4_16_16_16_16_UNORM = 51,
+ TFMT4_16_SNORM = 19,
+ TFMT4_16_16_SNORM = 39,
+ TFMT4_16_16_16_16_SNORM = 52,
TFMT4_16_UINT = 21,
TFMT4_16_16_UINT = 41,
TFMT4_16_16_16_16_UINT = 54,
@@ -190,8 +204,21 @@ enum a4xx_tex_fmt {
TFMT4_32_FLOAT = 43,
TFMT4_32_32_FLOAT = 56,
TFMT4_32_32_32_32_FLOAT = 63,
+ TFMT4_32_32_32_FLOAT = 59,
+ TFMT4_32_32_32_UINT = 60,
+ TFMT4_32_32_32_SINT = 61,
TFMT4_9_9_9_E5_FLOAT = 32,
TFMT4_11_11_10_FLOAT = 37,
+ TFMT4_DXT1 = 86,
+ TFMT4_DXT3 = 87,
+ TFMT4_DXT5 = 88,
+ TFMT4_RGTC1_UNORM = 90,
+ TFMT4_RGTC1_SNORM = 91,
+ TFMT4_RGTC2_UNORM = 94,
+ TFMT4_RGTC2_SNORM = 95,
+ TFMT4_BPTC_UFLOAT = 97,
+ TFMT4_BPTC_FLOAT = 98,
+ TFMT4_BPTC = 99,
TFMT4_ATC_RGB = 100,
TFMT4_ATC_RGBA_EXPLICIT = 101,
TFMT4_ATC_RGBA_INTERPOLATED = 102,
@@ -400,8 +427,13 @@ static inline uint32_t REG_A4XX_RB_MRT_CONTROL(uint32_t i0) { return 0x000020a4
#define A4XX_RB_MRT_CONTROL_READ_DEST_ENABLE 0x00000008
#define A4XX_RB_MRT_CONTROL_BLEND 0x00000010
#define A4XX_RB_MRT_CONTROL_BLEND2 0x00000020
-#define A4XX_RB_MRT_CONTROL_FASTCLEAR 0x00000400
-#define A4XX_RB_MRT_CONTROL_B11 0x00000800
+#define A4XX_RB_MRT_CONTROL_ROP_ENABLE 0x00000040
+#define A4XX_RB_MRT_CONTROL_ROP_CODE__MASK 0x00000f00
+#define A4XX_RB_MRT_CONTROL_ROP_CODE__SHIFT 8
+static inline uint32_t A4XX_RB_MRT_CONTROL_ROP_CODE(enum a3xx_rop_code val)
+{
+ return ((val) << A4XX_RB_MRT_CONTROL_ROP_CODE__SHIFT) & A4XX_RB_MRT_CONTROL_ROP_CODE__MASK;
+}
#define A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK 0x0f000000
#define A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE__SHIFT 24
static inline uint32_t A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(uint32_t val)
@@ -600,7 +632,7 @@ static inline uint32_t A4XX_RB_FS_OUTPUT_ENABLE_BLEND(uint32_t val)
{
return ((val) << A4XX_RB_FS_OUTPUT_ENABLE_BLEND__SHIFT) & A4XX_RB_FS_OUTPUT_ENABLE_BLEND__MASK;
}
-#define A4XX_RB_FS_OUTPUT_FAST_CLEAR 0x00000100
+#define A4XX_RB_FS_OUTPUT_INDEPENDENT_BLEND 0x00000100
#define A4XX_RB_FS_OUTPUT_SAMPLE_MASK__MASK 0xffff0000
#define A4XX_RB_FS_OUTPUT_SAMPLE_MASK__SHIFT 16
static inline uint32_t A4XX_RB_FS_OUTPUT_SAMPLE_MASK(uint32_t val)
@@ -2056,6 +2088,8 @@ static inline uint32_t A4XX_TPL1_TP_TEX_COUNT_GS(uint32_t val)
#define REG_A4XX_GRAS_PERFCTR_TSE_SEL_3 0x00000c8b
#define REG_A4XX_GRAS_CL_CLIP_CNTL 0x00002000
+#define A4XX_GRAS_CL_CLIP_CNTL_CLIP_DISABLE 0x00008000
+#define A4XX_GRAS_CL_CLIP_CNTL_ZERO_GB_SCALE_Z 0x00400000
#define REG_A4XX_GRAS_CLEAR_CNTL 0x00002003
#define A4XX_GRAS_CLEAR_CNTL_NOT_FASTCLEAR 0x00000001
@@ -2596,7 +2630,20 @@ static inline uint32_t A4XX_PC_PRIM_VTX_CNTL_VAROUT(uint32_t val)
#define A4XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST 0x02000000
#define A4XX_PC_PRIM_VTX_CNTL_PSIZE 0x04000000
-#define REG_A4XX_UNKNOWN_21C5 0x000021c5
+#define REG_A4XX_PC_PRIM_VTX_CNTL2 0x000021c5
+#define A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_FRONT_PTYPE__MASK 0x00000007
+#define A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_FRONT_PTYPE__SHIFT 0
+static inline uint32_t A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_FRONT_PTYPE(enum adreno_pa_su_sc_draw val)
+{
+ return ((val) << A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_FRONT_PTYPE__SHIFT) & A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_FRONT_PTYPE__MASK;
+}
+#define A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_BACK_PTYPE__MASK 0x00000038
+#define A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_BACK_PTYPE__SHIFT 3
+static inline uint32_t A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_BACK_PTYPE(enum adreno_pa_su_sc_draw val)
+{
+ return ((val) << A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_BACK_PTYPE__SHIFT) & A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_BACK_PTYPE__MASK;
+}
+#define A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_ENABLE 0x00000040
#define REG_A4XX_PC_RESTART_INDEX 0x000021c6
@@ -2738,6 +2785,12 @@ static inline uint32_t A4XX_TEX_SAMP_0_ANISO(enum a4xx_tex_aniso val)
{
return ((val) << A4XX_TEX_SAMP_0_ANISO__SHIFT) & A4XX_TEX_SAMP_0_ANISO__MASK;
}
+#define A4XX_TEX_SAMP_0_LOD_BIAS__MASK 0xfff80000
+#define A4XX_TEX_SAMP_0_LOD_BIAS__SHIFT 19
+static inline uint32_t A4XX_TEX_SAMP_0_LOD_BIAS(float val)
+{
+ return ((((int32_t)(val * 256.0))) << A4XX_TEX_SAMP_0_LOD_BIAS__SHIFT) & A4XX_TEX_SAMP_0_LOD_BIAS__MASK;
+}
#define REG_A4XX_TEX_SAMP_1 0x00000001
#define A4XX_TEX_SAMP_1_COMPARE_FUNC__MASK 0x0000000e
@@ -2746,6 +2799,7 @@ static inline uint32_t A4XX_TEX_SAMP_1_COMPARE_FUNC(enum adreno_compare_func val
{
return ((val) << A4XX_TEX_SAMP_1_COMPARE_FUNC__SHIFT) & A4XX_TEX_SAMP_1_COMPARE_FUNC__MASK;
}
+#define A4XX_TEX_SAMP_1_CUBEMAPSEAMLESSFILTOFF 0x00000010
#define A4XX_TEX_SAMP_1_UNNORM_COORDS 0x00000020
#define A4XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR 0x00000040
#define A4XX_TEX_SAMP_1_MAX_LOD__MASK 0x000fff00
@@ -2814,7 +2868,7 @@ static inline uint32_t A4XX_TEX_CONST_1_HEIGHT(uint32_t val)
{
return ((val) << A4XX_TEX_CONST_1_HEIGHT__SHIFT) & A4XX_TEX_CONST_1_HEIGHT__MASK;
}
-#define A4XX_TEX_CONST_1_WIDTH__MASK 0x1fff8000
+#define A4XX_TEX_CONST_1_WIDTH__MASK 0x3fff8000
#define A4XX_TEX_CONST_1_WIDTH__SHIFT 15
static inline uint32_t A4XX_TEX_CONST_1_WIDTH(uint32_t val)
{
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_blend.c b/src/gallium/drivers/freedreno/a4xx/fd4_blend.c
index d5e823ef69d..f19702280e0 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_blend.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_blend.c
@@ -27,6 +27,7 @@
*/
#include "pipe/p_state.h"
+#include "util/u_blend.h"
#include "util/u_string.h"
#include "util/u_memory.h"
@@ -59,12 +60,12 @@ fd4_blend_state_create(struct pipe_context *pctx,
const struct pipe_blend_state *cso)
{
struct fd4_blend_stateobj *so;
-// enum a3xx_rop_code rop = ROP_COPY;
+ enum a3xx_rop_code rop = ROP_COPY;
bool reads_dest = false;
unsigned i, mrt_blend = 0;
if (cso->logicop_enable) {
-// rop = cso->logicop_func; /* maps 1:1 */
+ rop = cso->logicop_func; /* maps 1:1 */
switch (cso->logicop_func) {
case PIPE_LOGICOP_NOR:
@@ -98,16 +99,25 @@ fd4_blend_state_create(struct pipe_context *pctx,
else
rt = &cso->rt[0];
- so->rb_mrt[i].blend_control =
+ so->rb_mrt[i].blend_control_rgb =
A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(fd_blend_factor(rt->rgb_src_factor)) |
A4XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(blend_func(rt->rgb_func)) |
- A4XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(fd_blend_factor(rt->rgb_dst_factor)) |
+ A4XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(fd_blend_factor(rt->rgb_dst_factor));
+
+ so->rb_mrt[i].blend_control_alpha =
A4XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(fd_blend_factor(rt->alpha_src_factor)) |
A4XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(blend_func(rt->alpha_func)) |
A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(fd_blend_factor(rt->alpha_dst_factor));
+ so->rb_mrt[i].blend_control_no_alpha_rgb =
+ A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(fd_blend_factor(util_blend_dst_alpha_to_one(rt->rgb_src_factor))) |
+ A4XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(blend_func(rt->rgb_func)) |
+ A4XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(fd_blend_factor(util_blend_dst_alpha_to_one(rt->rgb_dst_factor)));
+
+
so->rb_mrt[i].control =
- 0xc00 | /* XXX ROP_CODE ?? */
+ A4XX_RB_MRT_CONTROL_ROP_CODE(rop) |
+ COND(cso->logicop_enable, A4XX_RB_MRT_CONTROL_ROP_ENABLE) |
A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(rt->colormask);
if (rt->blend_enable) {
@@ -118,14 +128,17 @@ fd4_blend_state_create(struct pipe_context *pctx,
mrt_blend |= (1 << i);
}
- if (reads_dest)
+ if (reads_dest) {
so->rb_mrt[i].control |= A4XX_RB_MRT_CONTROL_READ_DEST_ENABLE;
+ mrt_blend |= (1 << i);
+ }
if (cso->dither)
so->rb_mrt[i].buf_info |= A4XX_RB_MRT_BUF_INFO_DITHER_MODE(DITHER_ALWAYS);
}
- so->rb_fs_output = A4XX_RB_FS_OUTPUT_ENABLE_BLEND(mrt_blend);
+ so->rb_fs_output = A4XX_RB_FS_OUTPUT_ENABLE_BLEND(mrt_blend) |
+ COND(cso->independent_blend_enable, A4XX_RB_FS_OUTPUT_INDEPENDENT_BLEND);
return so;
}
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_blend.h b/src/gallium/drivers/freedreno/a4xx/fd4_blend.h
index 7620d00a625..6230fa7a50e 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_blend.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_blend.h
@@ -39,7 +39,12 @@ struct fd4_blend_stateobj {
struct {
uint32_t control;
uint32_t buf_info;
- uint32_t blend_control;
+ /* Blend control bits for color if there is an alpha channel */
+ uint32_t blend_control_rgb;
+ /* Blend control bits for color if there is no alpha channel */
+ uint32_t blend_control_no_alpha_rgb;
+ /* Blend control bits for alpha channel */
+ uint32_t blend_control_alpha;
} rb_mrt[A4XX_MAX_RENDER_TARGETS];
uint32_t rb_fs_output;
};
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
index 7bd5163529a..8cbe68d5790 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
@@ -47,6 +47,7 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
struct fd4_emit *emit)
{
const struct pipe_draw_info *info = emit->info;
+ enum pc_di_primtype primtype = ctx->primtypes[info->mode];
if (!(fd4_emit_get_vp(emit) && fd4_emit_get_fp(emit)))
return;
@@ -64,7 +65,14 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */
info->restart_index : 0xffffffff);
+ /* points + psize -> spritelist: */
+ if (ctx->rasterizer->point_size_per_vertex &&
+ fd4_emit_get_vp(emit)->writes_psize &&
+ (info->mode == PIPE_PRIM_POINTS))
+ primtype = DI_PT_POINTLIST_PSIZE;
+
fd4_draw_emit(ctx, ring,
+ primtype,
emit->key.binning_pass ? IGNORE_VISIBILITY : USE_VISIBILITY,
info);
}
@@ -263,8 +271,7 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
mrt_comp[i] = (buffers & (PIPE_CLEAR_COLOR0 << i)) ? 0xf : 0x0;
OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1);
- OUT_RING(ring, A4XX_RB_MRT_CONTROL_FASTCLEAR |
- A4XX_RB_MRT_CONTROL_B11 |
+ OUT_RING(ring, A4XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY) |
A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0xf));
OUT_PKT0(ring, REG_A4XX_RB_MRT_BLEND_CONTROL(i), 1);
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
index b89a30a7c4b..a6c56404a8a 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
@@ -101,12 +101,12 @@ fd4_size2indextype(unsigned index_size)
}
static inline void
fd4_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
+ enum pc_di_primtype primtype,
enum pc_di_vis_cull_mode vismode,
const struct pipe_draw_info *info)
{
struct pipe_index_buffer *idx = &ctx->indexbuf;
struct fd_bo *idx_bo = NULL;
- enum pc_di_primtype primtype = ctx->primtypes[info->mode];
enum a4xx_index_size idx_type;
enum pc_di_src_sel src_sel;
uint32_t idx_size, idx_offset;
@@ -127,11 +127,6 @@ fd4_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
src_sel = DI_SRC_SEL_AUTO_INDEX;
}
- /* points + psize -> spritelist: */
- if (ctx->rasterizer && ctx->rasterizer->point_size_per_vertex &&
- (info->mode == PIPE_PRIM_POINTS))
- primtype = DI_PT_POINTLIST_PSIZE;
-
fd4_draw(ctx, ring, primtype, vismode, src_sel,
info->count, info->instance_count,
idx_type, idx_size, idx_offset, idx_bo);
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index 26b58718cd8..f220fc7ac1f 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -185,7 +185,6 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
const struct fd4_pipe_sampler_view *view = tex->textures[i] ?
fd4_pipe_sampler_view(tex->textures[i]) :
&dummy_view;
- unsigned start = fd_sampler_first_level(&view->base);
OUT_RING(ring, view->texconst0);
OUT_RING(ring, view->texconst1);
@@ -193,8 +192,7 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
OUT_RING(ring, view->texconst3);
if (view->base.texture) {
struct fd_resource *rsc = fd_resource(view->base.texture);
- uint32_t offset = fd_resource_offset(rsc, start, 0);
- OUT_RELOC(ring, rsc->bo, offset, view->textconst4, 0);
+ OUT_RELOC(ring, rsc->bo, view->offset, view->texconst4, 0);
} else {
OUT_RING(ring, 0x00000000);
}
@@ -286,7 +284,8 @@ fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, unsigned nr_bufs,
PIPE_SWIZZLE_BLUE, PIPE_SWIZZLE_ALPHA));
OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(bufs[i]->width) |
A4XX_TEX_CONST_1_HEIGHT(bufs[i]->height));
- OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(slice->pitch * rsc->cpp));
+ OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(slice->pitch * rsc->cpp) |
+ A4XX_TEX_CONST_2_FETCHSIZE(fd4_pipe2fetchsize(format)));
OUT_RING(ring, 0x00000000);
OUT_RELOC(ring, rsc->bo, offset, 0, 0);
OUT_RING(ring, 0x00000000);
@@ -332,7 +331,10 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
unsigned instance_regid = regid(63, 0);
unsigned vtxcnt_regid = regid(63, 0);
+ /* Note that sysvals come *after* normal inputs: */
for (i = 0; i < vp->inputs_count; i++) {
+ if (!vp->inputs[i].compmask)
+ continue;
if (vp->inputs[i].sysval) {
switch(vp->inputs[i].slot) {
case SYSTEM_VALUE_BASE_VERTEX:
@@ -351,19 +353,11 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
unreachable("invalid system value");
break;
}
- } else if (i < vtx->vtx->num_elements && vp->inputs[i].compmask) {
+ } else if (i < vtx->vtx->num_elements) {
last = i;
}
}
-
- /* hw doesn't like to be configured for zero vbo's, it seems: */
- if ((vtx->vtx->num_elements == 0) &&
- (vertex_regid == regid(63, 0)) &&
- (instance_regid == regid(63, 0)) &&
- (vtxcnt_regid == regid(63, 0)))
- return;
-
for (i = 0, j = 0; i <= last; i++) {
assert(!vp->inputs[i].sysval);
if (vp->inputs[i].compmask) {
@@ -408,6 +402,38 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
}
}
+ /* hw doesn't like to be configured for zero vbo's, it seems: */
+ if (last < 0) {
+ /* just recycle the shader bo, we just need to point to *something*
+ * valid:
+ */
+ struct fd_bo *dummy_vbo = vp->bo;
+ bool switchnext = (vertex_regid != regid(63, 0)) ||
+ (instance_regid != regid(63, 0)) ||
+ (vtxcnt_regid != regid(63, 0));
+
+ OUT_PKT0(ring, REG_A4XX_VFD_FETCH(0), 4);
+ OUT_RING(ring, A4XX_VFD_FETCH_INSTR_0_FETCHSIZE(0) |
+ A4XX_VFD_FETCH_INSTR_0_BUFSTRIDE(0) |
+ COND(switchnext, A4XX_VFD_FETCH_INSTR_0_SWITCHNEXT));
+ OUT_RELOC(ring, dummy_vbo, 0, 0, 0);
+ OUT_RING(ring, A4XX_VFD_FETCH_INSTR_2_SIZE(1));
+ OUT_RING(ring, A4XX_VFD_FETCH_INSTR_3_STEPRATE(1));
+
+ OUT_PKT0(ring, REG_A4XX_VFD_DECODE_INSTR(0), 1);
+ OUT_RING(ring, A4XX_VFD_DECODE_INSTR_CONSTFILL |
+ A4XX_VFD_DECODE_INSTR_WRITEMASK(0x1) |
+ A4XX_VFD_DECODE_INSTR_FORMAT(VFMT4_8_UNORM) |
+ A4XX_VFD_DECODE_INSTR_SWAP(XYZW) |
+ A4XX_VFD_DECODE_INSTR_REGID(regid(0,0)) |
+ A4XX_VFD_DECODE_INSTR_SHIFTCNT(1) |
+ A4XX_VFD_DECODE_INSTR_LASTCOMPVALID |
+ COND(switchnext, A4XX_VFD_DECODE_INSTR_SWITCHNEXT));
+
+ total_in = 1;
+ j = 1;
+ }
+
OUT_PKT0(ring, REG_A4XX_VFD_CONTROL_0, 5);
OUT_RING(ring, A4XX_VFD_CONTROL_0_TOTALATTRTOVS(total_in) |
0xa0000 | /* XXX */
@@ -470,11 +496,16 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
OUT_RINGP(ring, val, &fd4_context(ctx)->rbrc_patches);
}
- if (dirty & FD_DIRTY_ZSA) {
+ if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_FRAMEBUFFER)) {
struct fd4_zsa_stateobj *zsa = fd4_zsa_stateobj(ctx->zsa);
+ struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+ uint32_t rb_alpha_control = zsa->rb_alpha_control;
+
+ if (util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0])))
+ rb_alpha_control &= ~A4XX_RB_ALPHA_CONTROL_ALPHA_TEST;
OUT_PKT0(ring, REG_A4XX_RB_ALPHA_CONTROL, 1);
- OUT_RING(ring, zsa->rb_alpha_control);
+ OUT_RING(ring, rb_alpha_control);
OUT_PKT0(ring, REG_A4XX_RB_STENCIL_CONTROL, 2);
OUT_RING(ring, zsa->rb_stencil_control);
@@ -535,8 +566,9 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
*/
if (emit->info) {
const struct pipe_draw_info *info = emit->info;
- uint32_t val = fd4_rasterizer_stateobj(ctx->rasterizer)
- ->pc_prim_vtx_cntl;
+ struct fd4_rasterizer_stateobj *rast =
+ fd4_rasterizer_stateobj(ctx->rasterizer);
+ uint32_t val = rast->pc_prim_vtx_cntl;
if (info->indexed && info->primitive_restart)
val |= A4XX_PC_PRIM_VTX_CNTL_PRIMITIVE_RESTART;
@@ -552,7 +584,7 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
OUT_PKT0(ring, REG_A4XX_PC_PRIM_VTX_CNTL, 2);
OUT_RING(ring, val);
- OUT_RING(ring, 0x12); /* XXX UNKNOWN_21C5 */
+ OUT_RING(ring, rast->pc_prim_vtx_cntl2);
}
if (dirty & FD_DIRTY_SCISSOR) {
@@ -581,7 +613,7 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZSCALE_0(ctx->viewport.scale[2]));
}
- if (dirty & FD_DIRTY_PROG) {
+ if (dirty & (FD_DIRTY_PROG | FD_DIRTY_FRAMEBUFFER)) {
struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
fd4_program_emit(ring, emit, pfb->nr_cbufs, pfb->cbufs);
}
@@ -599,11 +631,30 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
uint32_t i;
for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
+ enum pipe_format format = pipe_surface_format(
+ ctx->framebuffer.cbufs[i]);
+ bool is_int = util_format_is_pure_integer(format);
+ bool has_alpha = util_format_has_alpha(format);
+ uint32_t control = blend->rb_mrt[i].control;
+ uint32_t blend_control = blend->rb_mrt[i].blend_control_alpha;
+
+ if (is_int) {
+ control &= A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK;
+ control |= A4XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY);
+ }
+
+ if (has_alpha) {
+ blend_control |= blend->rb_mrt[i].blend_control_rgb;
+ } else {
+ blend_control |= blend->rb_mrt[i].blend_control_no_alpha_rgb;
+ control &= ~A4XX_RB_MRT_CONTROL_BLEND2;
+ }
+
OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1);
- OUT_RING(ring, blend->rb_mrt[i].control);
+ OUT_RING(ring, control);
OUT_PKT0(ring, REG_A4XX_RB_MRT_BLEND_CONTROL(i), 1);
- OUT_RING(ring, blend->rb_mrt[i].blend_control);
+ OUT_RING(ring, blend_control);
}
OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT, 1);
@@ -611,19 +662,48 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
A4XX_RB_FS_OUTPUT_SAMPLE_MASK(0xffff));
}
- if (dirty & FD_DIRTY_BLEND_COLOR) {
+ if (dirty & (FD_DIRTY_BLEND_COLOR | FD_DIRTY_FRAMEBUFFER)) {
struct pipe_blend_color *bcolor = &ctx->blend_color;
+ struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+ float factor = 65535.0;
+ int i;
+
+ for (i = 0; i < pfb->nr_cbufs; i++) {
+ enum pipe_format format = pipe_surface_format(pfb->cbufs[i]);
+ const struct util_format_description *desc =
+ util_format_description(format);
+ int j;
+
+ if (desc->is_mixed)
+ continue;
+
+ j = util_format_get_first_non_void_channel(format);
+ if (j == -1)
+ continue;
+
+ if (desc->channel[j].size > 8 || !desc->channel[j].normalized ||
+ desc->channel[j].pure_integer)
+ continue;
+
+ /* Just use the first unorm8/snorm8 render buffer. Can't keep
+ * everyone happy.
+ */
+ if (desc->channel[j].type == UTIL_FORMAT_TYPE_SIGNED)
+ factor = 32767.0;
+ break;
+ }
+
OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 8);
- OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * 65535.0) |
+ OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * factor) |
A4XX_RB_BLEND_RED_FLOAT(bcolor->color[0]));
OUT_RING(ring, A4XX_RB_BLEND_RED_F32(bcolor->color[0]));
- OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 65535.0) |
+ OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * factor) |
A4XX_RB_BLEND_GREEN_FLOAT(bcolor->color[1]));
OUT_RING(ring, A4XX_RB_BLEND_GREEN_F32(bcolor->color[1]));
- OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 65535.0) |
+ OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * factor) |
A4XX_RB_BLEND_BLUE_FLOAT(bcolor->color[2]));
OUT_RING(ring, A4XX_RB_BLEND_BLUE_F32(bcolor->color[2]));
- OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 65535.0) |
+ OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * factor) |
A4XX_RB_BLEND_ALPHA_FLOAT(bcolor->color[3]));
OUT_RING(ring, A4XX_RB_BLEND_ALPHA_F32(bcolor->color[3]));
}
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_format.c b/src/gallium/drivers/freedreno/a4xx/fd4_format.c
index 847d4fb6d63..c240745cec1 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_format.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_format.c
@@ -99,20 +99,26 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
_T(S8_UINT, 8_UINT, R8_UNORM, WZYX),
/* 16-bit */
- V_(R16_UNORM, 16_UNORM, NONE, WZYX),
- V_(R16_SNORM, 16_SNORM, NONE, WZYX),
- VT(R16_UINT, 16_UINT, R16_UINT, WZYX),
- VT(R16_SINT, 16_SINT, R16_SINT, WZYX),
- V_(R16_USCALED, 16_UINT, NONE, WZYX),
- V_(R16_SSCALED, 16_UINT, NONE, WZYX),
- VT(R16_FLOAT, 16_FLOAT, R16_FLOAT,WZYX),
-
- _T(A16_UINT, 16_UINT, NONE, WZYX),
- _T(A16_SINT, 16_SINT, NONE, WZYX),
- _T(L16_UINT, 16_UINT, NONE, WZYX),
- _T(L16_SINT, 16_SINT, NONE, WZYX),
- _T(I16_UINT, 16_UINT, NONE, WZYX),
- _T(I16_SINT, 16_SINT, NONE, WZYX),
+ VT(R16_UNORM, 16_UNORM, R16_UNORM, WZYX),
+ VT(R16_SNORM, 16_SNORM, R16_SNORM, WZYX),
+ VT(R16_UINT, 16_UINT, R16_UINT, WZYX),
+ VT(R16_SINT, 16_SINT, R16_SINT, WZYX),
+ V_(R16_USCALED, 16_UINT, NONE, WZYX),
+ V_(R16_SSCALED, 16_UINT, NONE, WZYX),
+ VT(R16_FLOAT, 16_FLOAT, R16_FLOAT, WZYX),
+
+ _T(A16_UNORM, 16_UNORM, NONE, WZYX),
+ _T(A16_SNORM, 16_SNORM, NONE, WZYX),
+ _T(A16_UINT, 16_UINT, NONE, WZYX),
+ _T(A16_SINT, 16_SINT, NONE, WZYX),
+ _T(L16_UNORM, 16_UNORM, NONE, WZYX),
+ _T(L16_SNORM, 16_SNORM, NONE, WZYX),
+ _T(L16_UINT, 16_UINT, NONE, WZYX),
+ _T(L16_SINT, 16_SINT, NONE, WZYX),
+ _T(I16_UNORM, 16_UNORM, NONE, WZYX),
+ _T(I16_SNORM, 16_SNORM, NONE, WZYX),
+ _T(I16_UINT, 16_UINT, NONE, WZYX),
+ _T(I16_SINT, 16_SINT, NONE, WZYX),
VT(R8G8_UNORM, 8_8_UNORM, R8G8_UNORM, WZYX),
VT(R8G8_SNORM, 8_8_SNORM, R8G8_SNORM, WZYX),
@@ -124,6 +130,7 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
_T(L8A8_UINT, 8_8_UINT, NONE, WZYX),
_T(L8A8_SINT, 8_8_SINT, NONE, WZYX),
+ _T(B5G6R5_UNORM, 5_6_5_UNORM, R5G6B5_UNORM, WXYZ),
_T(B5G5R5A1_UNORM, 5_5_5_1_UNORM, R5G5B5A1_UNORM, WXYZ),
_T(B5G5R5X1_UNORM, 5_5_5_1_UNORM, R5G5B5A1_UNORM, WXYZ),
_T(B4G4R4A4_UNORM, 4_4_4_4_UNORM, R4G4B4A4_UNORM, WXYZ),
@@ -151,16 +158,18 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
_T(I32_UINT, 32_UINT, NONE, WZYX),
_T(I32_SINT, 32_SINT, NONE, WZYX),
- V_(R16G16_UNORM, 16_16_UNORM, NONE, WZYX),
- V_(R16G16_SNORM, 16_16_SNORM, NONE, WZYX),
- VT(R16G16_UINT, 16_16_UINT, R16G16_UINT, WZYX),
- VT(R16G16_SINT, 16_16_SINT, R16G16_SINT, WZYX),
- V_(R16G16_USCALED, 16_16_UINT, NONE, WZYX),
- V_(R16G16_SSCALED, 16_16_SINT, NONE, WZYX),
- VT(R16G16_FLOAT, 16_16_FLOAT, R16G16_FLOAT,WZYX),
+ VT(R16G16_UNORM, 16_16_UNORM, R16G16_UNORM, WZYX),
+ VT(R16G16_SNORM, 16_16_SNORM, R16G16_SNORM, WZYX),
+ VT(R16G16_UINT, 16_16_UINT, R16G16_UINT, WZYX),
+ VT(R16G16_SINT, 16_16_SINT, R16G16_SINT, WZYX),
+ V_(R16G16_USCALED, 16_16_UINT, NONE, WZYX),
+ V_(R16G16_SSCALED, 16_16_SINT, NONE, WZYX),
+ VT(R16G16_FLOAT, 16_16_FLOAT, R16G16_FLOAT, WZYX),
- _T(L16A16_UINT, 16_16_UINT, NONE, WZYX),
- _T(L16A16_SINT, 16_16_SINT, NONE, WZYX),
+ _T(L16A16_UNORM, 16_16_UNORM, NONE, WZYX),
+ _T(L16A16_SNORM, 16_16_SNORM, NONE, WZYX),
+ _T(L16A16_UINT, 16_16_UINT, NONE, WZYX),
+ _T(L16A16_SINT, 16_16_SINT, NONE, WZYX),
VT(R8G8B8A8_UNORM, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX),
_T(R8G8B8X8_UNORM, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX),
@@ -191,11 +200,15 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
VT(B10G10R10A2_UNORM, 10_10_10_2_UNORM, R10G10B10A2_UNORM, WXYZ),
_T(B10G10R10X2_UNORM, 10_10_10_2_UNORM, R10G10B10A2_UNORM, WXYZ),
V_(R10G10B10A2_SNORM, 10_10_10_2_SNORM, NONE, WZYX),
- V_(R10G10B10A2_UINT, 10_10_10_2_UINT, NONE, WZYX),
+ V_(B10G10R10A2_SNORM, 10_10_10_2_SNORM, NONE, WXYZ),
+ VT(R10G10B10A2_UINT, 10_10_10_2_UINT, R10G10B10A2_UINT, WZYX),
+ VT(B10G10R10A2_UINT, 10_10_10_2_UINT, R10G10B10A2_UINT, WXYZ),
V_(R10G10B10A2_USCALED, 10_10_10_2_UINT, NONE, WZYX),
+ V_(B10G10R10A2_USCALED, 10_10_10_2_UINT, NONE, WXYZ),
V_(R10G10B10A2_SSCALED, 10_10_10_2_SINT, NONE, WZYX),
+ V_(B10G10R10A2_SSCALED, 10_10_10_2_SINT, NONE, WXYZ),
- _T(R11G11B10_FLOAT, 11_11_10_FLOAT, R11G11B10_FLOAT, WZYX),
+ VT(R11G11B10_FLOAT, 11_11_10_FLOAT, R11G11B10_FLOAT, WZYX),
_T(R9G9B9E5_FLOAT, 9_9_9_E5_FLOAT, NONE, WZYX),
_T(Z24X8_UNORM, X8Z24_UNORM, R8G8B8A8_UNORM, WZYX),
@@ -213,8 +226,10 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
V_(R16G16B16_FLOAT, 16_16_16_FLOAT, NONE, WZYX),
/* 64-bit */
- V_(R16G16B16A16_UNORM, 16_16_16_16_UNORM, NONE, WZYX),
- V_(R16G16B16A16_SNORM, 16_16_16_16_SNORM, NONE, WZYX),
+ VT(R16G16B16A16_UNORM, 16_16_16_16_UNORM, R16G16B16A16_UNORM, WZYX),
+ VT(R16G16B16X16_UNORM, 16_16_16_16_UNORM, R16G16B16A16_UNORM, WZYX),
+ VT(R16G16B16A16_SNORM, 16_16_16_16_SNORM, R16G16B16A16_SNORM, WZYX),
+ VT(R16G16B16X16_SNORM, 16_16_16_16_SNORM, R16G16B16A16_SNORM, WZYX),
VT(R16G16B16A16_UINT, 16_16_16_16_UINT, R16G16B16A16_UINT, WZYX),
_T(R16G16B16X16_UINT, 16_16_16_16_UINT, R16G16B16A16_UINT, WZYX),
VT(R16G16B16A16_SINT, 16_16_16_16_SINT, R16G16B16A16_SINT, WZYX),
@@ -235,11 +250,11 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
_T(L32A32_SINT, 32_32_SINT, NONE, WZYX),
/* 96-bit */
- V_(R32G32B32_UINT, 32_32_32_UINT, NONE, WZYX),
- V_(R32G32B32_SINT, 32_32_32_SINT, NONE, WZYX),
+ VT(R32G32B32_UINT, 32_32_32_UINT, NONE, WZYX),
+ VT(R32G32B32_SINT, 32_32_32_SINT, NONE, WZYX),
V_(R32G32B32_USCALED, 32_32_32_UINT, NONE, WZYX),
V_(R32G32B32_SSCALED, 32_32_32_SINT, NONE, WZYX),
- V_(R32G32B32_FLOAT, 32_32_32_FLOAT, NONE, WZYX),
+ VT(R32G32B32_FLOAT, 32_32_32_FLOAT, NONE, WZYX),
V_(R32G32B32_FIXED, 32_32_32_FIXED, NONE, WZYX),
/* 128-bit */
@@ -252,6 +267,72 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
VT(R32G32B32A32_FLOAT, 32_32_32_32_FLOAT, R32G32B32A32_FLOAT, WZYX),
_T(R32G32B32X32_FLOAT, 32_32_32_32_FLOAT, R32G32B32A32_FLOAT, WZYX),
V_(R32G32B32A32_FIXED, 32_32_32_32_FIXED, NONE, WZYX),
+
+ /* compressed */
+ _T(ETC1_RGB8, ETC1, NONE, WZYX),
+ _T(ETC2_RGB8, ETC2_RGB8, NONE, WZYX),
+ _T(ETC2_SRGB8, ETC2_RGB8, NONE, WZYX),
+ _T(ETC2_RGB8A1, ETC2_RGB8A1, NONE, WZYX),
+ _T(ETC2_SRGB8A1, ETC2_RGB8A1, NONE, WZYX),
+ _T(ETC2_RGBA8, ETC2_RGBA8, NONE, WZYX),
+ _T(ETC2_SRGBA8, ETC2_RGBA8, NONE, WZYX),
+ _T(ETC2_R11_UNORM, ETC2_R11_UNORM, NONE, WZYX),
+ _T(ETC2_R11_SNORM, ETC2_R11_SNORM, NONE, WZYX),
+ _T(ETC2_RG11_UNORM, ETC2_RG11_UNORM, NONE, WZYX),
+ _T(ETC2_RG11_SNORM, ETC2_RG11_SNORM, NONE, WZYX),
+
+ _T(DXT1_RGB, DXT1, NONE, WZYX),
+ _T(DXT1_SRGB, DXT1, NONE, WZYX),
+ _T(DXT1_RGBA, DXT1, NONE, WZYX),
+ _T(DXT1_SRGBA, DXT1, NONE, WZYX),
+ _T(DXT3_RGBA, DXT3, NONE, WZYX),
+ _T(DXT3_SRGBA, DXT3, NONE, WZYX),
+ _T(DXT5_RGBA, DXT5, NONE, WZYX),
+ _T(DXT5_SRGBA, DXT5, NONE, WZYX),
+
+ _T(BPTC_RGBA_UNORM, BPTC, NONE, WZYX),
+ _T(BPTC_SRGBA, BPTC, NONE, WZYX),
+ _T(BPTC_RGB_FLOAT, BPTC_FLOAT, NONE, WZYX),
+ _T(BPTC_RGB_UFLOAT, BPTC_UFLOAT, NONE, WZYX),
+
+ _T(RGTC1_UNORM, RGTC1_UNORM, NONE, WZYX),
+ _T(RGTC1_SNORM, RGTC1_SNORM, NONE, WZYX),
+ _T(RGTC2_UNORM, RGTC2_UNORM, NONE, WZYX),
+ _T(RGTC2_SNORM, RGTC2_SNORM, NONE, WZYX),
+ _T(LATC1_UNORM, RGTC1_UNORM, NONE, WZYX),
+ _T(LATC1_SNORM, RGTC1_SNORM, NONE, WZYX),
+ _T(LATC2_UNORM, RGTC2_UNORM, NONE, WZYX),
+ _T(LATC2_SNORM, RGTC2_SNORM, NONE, WZYX),
+
+ _T(ASTC_4x4, ASTC_4x4, NONE, WZYX),
+ _T(ASTC_5x4, ASTC_5x4, NONE, WZYX),
+ _T(ASTC_5x5, ASTC_5x5, NONE, WZYX),
+ _T(ASTC_6x5, ASTC_6x5, NONE, WZYX),
+ _T(ASTC_6x6, ASTC_6x6, NONE, WZYX),
+ _T(ASTC_8x5, ASTC_8x5, NONE, WZYX),
+ _T(ASTC_8x6, ASTC_8x6, NONE, WZYX),
+ _T(ASTC_8x8, ASTC_8x8, NONE, WZYX),
+ _T(ASTC_10x5, ASTC_10x5, NONE, WZYX),
+ _T(ASTC_10x6, ASTC_10x6, NONE, WZYX),
+ _T(ASTC_10x8, ASTC_10x8, NONE, WZYX),
+ _T(ASTC_10x10, ASTC_10x10, NONE, WZYX),
+ _T(ASTC_12x10, ASTC_12x10, NONE, WZYX),
+ _T(ASTC_12x12, ASTC_12x12, NONE, WZYX),
+
+ _T(ASTC_4x4_SRGB, ASTC_4x4, NONE, WZYX),
+ _T(ASTC_5x4_SRGB, ASTC_5x4, NONE, WZYX),
+ _T(ASTC_5x5_SRGB, ASTC_5x5, NONE, WZYX),
+ _T(ASTC_6x5_SRGB, ASTC_6x5, NONE, WZYX),
+ _T(ASTC_6x6_SRGB, ASTC_6x6, NONE, WZYX),
+ _T(ASTC_8x5_SRGB, ASTC_8x5, NONE, WZYX),
+ _T(ASTC_8x6_SRGB, ASTC_8x6, NONE, WZYX),
+ _T(ASTC_8x8_SRGB, ASTC_8x8, NONE, WZYX),
+ _T(ASTC_10x5_SRGB, ASTC_10x5, NONE, WZYX),
+ _T(ASTC_10x6_SRGB, ASTC_10x6, NONE, WZYX),
+ _T(ASTC_10x8_SRGB, ASTC_10x8, NONE, WZYX),
+ _T(ASTC_10x10_SRGB, ASTC_10x10, NONE, WZYX),
+ _T(ASTC_12x10_SRGB, ASTC_12x10, NONE, WZYX),
+ _T(ASTC_12x12_SRGB, ASTC_12x12, NONE, WZYX),
};
/* convert pipe format to vertex buffer format: */
@@ -295,11 +376,15 @@ fd4_pipe2fetchsize(enum pipe_format format)
if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
format = PIPE_FORMAT_Z32_FLOAT;
- switch (util_format_get_blocksizebits(format)) {
+ if (util_format_description(format)->layout == UTIL_FORMAT_LAYOUT_ASTC)
+ return TFETCH4_16_BYTE;
+
+ switch (util_format_get_blocksizebits(format) / util_format_get_blockwidth(format)) {
case 8: return TFETCH4_1_BYTE;
case 16: return TFETCH4_2_BYTE;
case 32: return TFETCH4_4_BYTE;
case 64: return TFETCH4_8_BYTE;
+ case 96: return TFETCH4_1_BYTE; /* Does this matter? */
case 128: return TFETCH4_16_BYTE;
default:
debug_printf("Unknown block size for format %s: %d\n",
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
index 3f8bbf3a124..221608127b4 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
@@ -347,8 +347,7 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
mrt_comp[i] = ((i < pfb->nr_cbufs) && pfb->cbufs[i]) ? 0xf : 0;
OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1);
- OUT_RING(ring, A4XX_RB_MRT_CONTROL_FASTCLEAR |
- A4XX_RB_MRT_CONTROL_B11 |
+ OUT_RING(ring, A4XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY) |
A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0xf));
OUT_PKT0(ring, REG_A4XX_RB_MRT_BLEND_CONTROL(i), 1);
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
index e3d5dabab4c..3df13543148 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -245,13 +245,6 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
color_regid[7] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA7);
}
- /* adjust regids for alpha output formats. there is no alpha render
- * format, so it's just treated like red
- */
- for (i = 0; i < nr; i++)
- if (util_format_is_alpha(pipe_surface_format(bufs[i])))
- color_regid[i] += 3;
-
/* TODO get these dynamically: */
face_regid = s[FS].v->frag_face ? regid(0,0) : regid(63,0);
coord_regid = s[FS].v->frag_coord ? regid(0,0) : regid(63,0);
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c
index dc7e98b149d..7456c63febe 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c
@@ -77,6 +77,13 @@ fd4_rasterizer_state_create(struct pipe_context *pctx,
so->gras_su_mode_control =
A4XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(cso->line_width/2.0);
+ so->pc_prim_vtx_cntl2 =
+ A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_FRONT_PTYPE(fd_polygon_mode(cso->fill_front)) |
+ A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_BACK_PTYPE(fd_polygon_mode(cso->fill_back));
+
+ if (cso->fill_front != PIPE_POLYGON_MODE_FILL ||
+ cso->fill_back != PIPE_POLYGON_MODE_FILL)
+ so->pc_prim_vtx_cntl2 |= A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_ENABLE;
if (cso->cull_face & PIPE_FACE_FRONT)
so->gras_su_mode_control |= A4XX_GRAS_SU_MODE_CONTROL_CULL_FRONT;
@@ -90,5 +97,10 @@ fd4_rasterizer_state_create(struct pipe_context *pctx,
if (cso->offset_tri)
so->gras_su_mode_control |= A4XX_GRAS_SU_MODE_CONTROL_POLY_OFFSET;
+ if (!cso->depth_clip)
+ so->gras_cl_clip_cntl |= A4XX_GRAS_CL_CLIP_CNTL_CLIP_DISABLE;
+ if (cso->clip_halfz)
+ so->gras_cl_clip_cntl |= A4XX_GRAS_CL_CLIP_CNTL_ZERO_GB_SCALE_Z;
+
return so;
}
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h
index 64e81a9983b..b56a04da6a8 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h
@@ -42,6 +42,7 @@ struct fd4_rasterizer_stateobj {
uint32_t gras_su_mode_control;
uint32_t gras_cl_clip_cntl;
uint32_t pc_prim_vtx_cntl;
+ uint32_t pc_prim_vtx_cntl2;
};
static inline struct fd4_rasterizer_stateobj *
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
index d8ea414f300..b2a69cca56c 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
@@ -57,6 +57,8 @@ fd4_screen_is_format_supported(struct pipe_screen *pscreen,
}
if ((usage & PIPE_BIND_SAMPLER_VIEW) &&
+ (target == PIPE_BUFFER ||
+ util_format_get_blocksize(format) != 12) &&
(fd4_pipe2tex(format) != ~0)) {
retval |= PIPE_BIND_SAMPLER_VIEW;
}
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
index dbff5a738fd..0eba75577b0 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
@@ -124,9 +124,11 @@ fd4_sampler_state_create(struct pipe_context *pctx,
so->texsamp1 =
// COND(miplinear, A4XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR) |
+ COND(!cso->seamless_cube_map, A4XX_TEX_SAMP_1_CUBEMAPSEAMLESSFILTOFF) |
COND(!cso->normalized_coords, A4XX_TEX_SAMP_1_UNNORM_COORDS);
if (cso->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+ so->texsamp0 |= A4XX_TEX_SAMP_0_LOD_BIAS(cso->lod_bias);
so->texsamp1 |=
A4XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) |
A4XX_TEX_SAMP_1_MAX_LOD(cso->max_lod);
@@ -210,8 +212,8 @@ fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
{
struct fd4_pipe_sampler_view *so = CALLOC_STRUCT(fd4_pipe_sampler_view);
struct fd_resource *rsc = fd_resource(prsc);
- unsigned lvl = fd_sampler_first_level(cso);
- unsigned miplevels = fd_sampler_last_level(cso) - lvl;
+ unsigned lvl, layers;
+ uint32_t sz2 = 0;
if (!so)
return NULL;
@@ -223,39 +225,65 @@ fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
so->base.context = pctx;
so->texconst0 =
- A4XX_TEX_CONST_0_TYPE(tex_type(prsc->target)) |
+ A4XX_TEX_CONST_0_TYPE(tex_type(cso->target)) |
A4XX_TEX_CONST_0_FMT(fd4_pipe2tex(cso->format)) |
- A4XX_TEX_CONST_0_MIPLVLS(miplevels) |
fd4_tex_swiz(cso->format, cso->swizzle_r, cso->swizzle_g,
cso->swizzle_b, cso->swizzle_a);
if (util_format_is_srgb(cso->format))
so->texconst0 |= A4XX_TEX_CONST_0_SRGB;
- so->texconst1 =
- A4XX_TEX_CONST_1_WIDTH(u_minify(prsc->width0, lvl)) |
- A4XX_TEX_CONST_1_HEIGHT(u_minify(prsc->height0, lvl));
- so->texconst2 =
- A4XX_TEX_CONST_2_FETCHSIZE(fd4_pipe2fetchsize(cso->format)) |
- A4XX_TEX_CONST_2_PITCH(rsc->slices[lvl].pitch * rsc->cpp);
+ if (cso->target == PIPE_BUFFER) {
+ unsigned elements = cso->u.buf.last_element -
+ cso->u.buf.first_element + 1;
+ lvl = 0;
+ so->texconst1 =
+ A4XX_TEX_CONST_1_WIDTH(elements) |
+ A4XX_TEX_CONST_1_HEIGHT(1);
+ so->texconst2 =
+ A4XX_TEX_CONST_2_FETCHSIZE(fd4_pipe2fetchsize(cso->format)) |
+ A4XX_TEX_CONST_2_PITCH(elements * rsc->cpp);
+ so->offset = cso->u.buf.first_element *
+ util_format_get_blocksize(cso->format);
+ } else {
+ unsigned miplevels;
- switch (prsc->target) {
+ lvl = fd_sampler_first_level(cso);
+ miplevels = fd_sampler_last_level(cso) - lvl;
+ layers = cso->u.tex.last_layer - cso->u.tex.first_layer + 1;
+
+ so->texconst0 |= A4XX_TEX_CONST_0_MIPLVLS(miplevels);
+ so->texconst1 =
+ A4XX_TEX_CONST_1_WIDTH(u_minify(prsc->width0, lvl)) |
+ A4XX_TEX_CONST_1_HEIGHT(u_minify(prsc->height0, lvl));
+ so->texconst2 =
+ A4XX_TEX_CONST_2_FETCHSIZE(fd4_pipe2fetchsize(cso->format)) |
+ A4XX_TEX_CONST_2_PITCH(
+ util_format_get_nblocksx(
+ cso->format, rsc->slices[lvl].pitch) * rsc->cpp);
+ so->offset = fd_resource_offset(rsc, lvl, cso->u.tex.first_layer);
+ }
+
+ switch (cso->target) {
case PIPE_TEXTURE_1D_ARRAY:
case PIPE_TEXTURE_2D_ARRAY:
so->texconst3 =
- A4XX_TEX_CONST_3_DEPTH(prsc->array_size) |
+ A4XX_TEX_CONST_3_DEPTH(layers) |
A4XX_TEX_CONST_3_LAYERSZ(rsc->layer_size);
break;
case PIPE_TEXTURE_CUBE:
case PIPE_TEXTURE_CUBE_ARRAY:
so->texconst3 =
- A4XX_TEX_CONST_3_DEPTH(prsc->array_size / 6) |
+ A4XX_TEX_CONST_3_DEPTH(layers / 6) |
A4XX_TEX_CONST_3_LAYERSZ(rsc->layer_size);
break;
case PIPE_TEXTURE_3D:
so->texconst3 =
A4XX_TEX_CONST_3_DEPTH(u_minify(prsc->depth0, lvl)) |
- A4XX_TEX_CONST_3_LAYERSZ(rsc->slices[0].size0);
+ A4XX_TEX_CONST_3_LAYERSZ(rsc->slices[lvl].size0);
+ while (lvl < cso->u.tex.last_level && sz2 != rsc->slices[lvl+1].size0)
+ sz2 = rsc->slices[++lvl].size0;
+ so->texconst4 = A4XX_TEX_CONST_4_LAYERSZ(sz2);
break;
default:
so->texconst3 = 0x00000000;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.h b/src/gallium/drivers/freedreno/a4xx/fd4_texture.h
index 31955770a85..6ca34ade60d 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.h
@@ -51,7 +51,8 @@ fd4_sampler_stateobj(struct pipe_sampler_state *samp)
struct fd4_pipe_sampler_view {
struct pipe_sampler_view base;
- uint32_t texconst0, texconst1, texconst2, texconst3, textconst4;
+ uint32_t texconst0, texconst1, texconst2, texconst3, texconst4;
+ uint32_t offset;
};
static inline struct fd4_pipe_sampler_view *
diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h
index ca3d2ac3fca..0e0f0e65e9b 100644
--- a/src/gallium/drivers/freedreno/adreno_common.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_common.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 68291 bytes, from 2015-11-17 16:39:59)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 64038 bytes, from 2015-11-17 16:37:36)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2015 by the following authors:
@@ -119,6 +119,25 @@ enum adreno_rb_copy_control_mode {
RB_COPY_DEPTH_STENCIL = 5,
};
+enum a3xx_rop_code {
+ ROP_CLEAR = 0,
+ ROP_NOR = 1,
+ ROP_AND_INVERTED = 2,
+ ROP_COPY_INVERTED = 3,
+ ROP_AND_REVERSE = 4,
+ ROP_INVERT = 5,
+ ROP_XOR = 6,
+ ROP_NAND = 7,
+ ROP_AND = 8,
+ ROP_EQUIV = 9,
+ ROP_NOOP = 10,
+ ROP_OR_INVERTED = 11,
+ ROP_COPY = 12,
+ ROP_OR_REVERSE = 13,
+ ROP_OR = 14,
+ ROP_SET = 15,
+};
+
enum a3xx_render_mode {
RB_RENDERING_PASS = 0,
RB_TILING_PASS = 1,
diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
index f095e3061b2..4aabc086607 100644
--- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 68291 bytes, from 2015-11-17 16:39:59)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 64038 bytes, from 2015-11-17 16:37:36)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2015 by the following authors:
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index 61c4c6d6e24..571c8142bf7 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -359,6 +359,10 @@ struct fd_context {
struct fd_streamout_stateobj streamout;
struct pipe_clip_state ucp;
+ struct pipe_query *cond_query;
+ bool cond_cond; /* inverted rendering condition */
+ uint cond_mode;
+
/* GMEM/tile handling fxns: */
void (*emit_tile_init)(struct fd_context *ctx);
void (*emit_tile_prep)(struct fd_context *ctx, struct fd_tile *tile);
diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c
index 7bf3343f43a..bf803cc77bc 100644
--- a/src/gallium/drivers/freedreno/freedreno_draw.c
+++ b/src/gallium/drivers/freedreno/freedreno_draw.c
@@ -88,6 +88,10 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
return;
}
+ /* TODO: push down the region versions into the tiles */
+ if (!fd_render_condition_check(pctx))
+ return;
+
/* emulate unsupported primitives: */
if (!fd_supported_prim(ctx, info->mode)) {
if (ctx->streamout.num_targets > 0)
@@ -220,6 +224,10 @@ fd_clear(struct pipe_context *pctx, unsigned buffers,
unsigned cleared_buffers;
int i;
+ /* TODO: push down the region versions into the tiles */
+ if (!fd_render_condition_check(pctx))
+ return;
+
/* for bookkeeping about which buffers have been cleared (and thus
* can fully or partially skip mem2gmem) we need to ignore buffers
* that have already had a draw, in case apps do silly things like
diff --git a/src/gallium/drivers/freedreno/freedreno_query.c b/src/gallium/drivers/freedreno/freedreno_query.c
index db2683c9b6f..b87e8250719 100644
--- a/src/gallium/drivers/freedreno/freedreno_query.c
+++ b/src/gallium/drivers/freedreno/freedreno_query.c
@@ -81,6 +81,16 @@ fd_get_query_result(struct pipe_context *pctx, struct pipe_query *pq,
return q->funcs->get_query_result(fd_context(pctx), q, wait, result);
}
+static void
+fd_render_condition(struct pipe_context *pctx, struct pipe_query *pq,
+ boolean condition, uint mode)
+{
+ struct fd_context *ctx = fd_context(pctx);
+ ctx->cond_query = pq;
+ ctx->cond_cond = condition;
+ ctx->cond_mode = mode;
+}
+
static int
fd_get_driver_query_info(struct pipe_screen *pscreen,
unsigned index, struct pipe_driver_query_info *info)
@@ -118,4 +128,5 @@ fd_query_context_init(struct pipe_context *pctx)
pctx->begin_query = fd_begin_query;
pctx->end_query = fd_end_query;
pctx->get_query_result = fd_get_query_result;
+ pctx->render_condition = fd_render_condition;
}
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c
index 98de0969cab..63ca9e30620 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -27,6 +27,7 @@
*/
#include "util/u_format.h"
+#include "util/u_format_rgtc.h"
#include "util/u_format_zs.h"
#include "util/u_inlines.h"
#include "util/u_transfer.h"
@@ -111,11 +112,19 @@ realloc_bo(struct fd_resource *rsc, uint32_t size)
util_range_set_empty(&rsc->valid_buffer_range);
}
-/* Currently this is only used for flushing Z32_S8 texture transfers, but
- * eventually it should handle everything.
- */
+static unsigned
+fd_resource_layer_offset(struct fd_resource *rsc,
+ struct fd_resource_slice *slice,
+ unsigned layer)
+{
+ if (rsc->layer_first)
+ return layer * rsc->layer_size;
+ else
+ return layer * slice->size0;
+}
+
static void
-fd_resource_flush(struct fd_transfer *trans, const struct pipe_box *box)
+fd_resource_flush_z32s8(struct fd_transfer *trans, const struct pipe_box *box)
{
struct fd_resource *rsc = fd_resource(trans->base.resource);
struct fd_resource_slice *slice = fd_resource_slice(rsc, trans->base.level);
@@ -123,13 +132,12 @@ fd_resource_flush(struct fd_transfer *trans, const struct pipe_box *box)
enum pipe_format format = trans->base.resource->format;
float *depth = fd_bo_map(rsc->bo) + slice->offset +
+ fd_resource_layer_offset(rsc, slice, trans->base.box.z) +
(trans->base.box.y + box->y) * slice->pitch * 4 + (trans->base.box.x + box->x) * 4;
uint8_t *stencil = fd_bo_map(rsc->stencil->bo) + sslice->offset +
+ fd_resource_layer_offset(rsc->stencil, sslice, trans->base.box.z) +
(trans->base.box.y + box->y) * sslice->pitch + trans->base.box.x + box->x;
- assert(format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT ||
- format == PIPE_FORMAT_X32_S8X24_UINT);
-
if (format != PIPE_FORMAT_X32_S8X24_UINT)
util_format_z32_float_s8x24_uint_unpack_z_float(
depth, slice->pitch * 4,
@@ -142,6 +150,73 @@ fd_resource_flush(struct fd_transfer *trans, const struct pipe_box *box)
box->width, box->height);
}
+static void
+fd_resource_flush_rgtc(struct fd_transfer *trans, const struct pipe_box *box)
+{
+ struct fd_resource *rsc = fd_resource(trans->base.resource);
+ struct fd_resource_slice *slice = fd_resource_slice(rsc, trans->base.level);
+ enum pipe_format format = trans->base.resource->format;
+
+ uint8_t *data = fd_bo_map(rsc->bo) + slice->offset +
+ fd_resource_layer_offset(rsc, slice, trans->base.box.z) +
+ ((trans->base.box.y + box->y) * slice->pitch +
+ trans->base.box.x + box->x) * rsc->cpp;
+
+ uint8_t *source = trans->staging +
+ util_format_get_nblocksy(format, box->y) * trans->base.stride +
+ util_format_get_stride(format, box->x);
+
+ switch (format) {
+ case PIPE_FORMAT_RGTC1_UNORM:
+ case PIPE_FORMAT_RGTC1_SNORM:
+ case PIPE_FORMAT_LATC1_UNORM:
+ case PIPE_FORMAT_LATC1_SNORM:
+ util_format_rgtc1_unorm_unpack_rgba_8unorm(
+ data, slice->pitch * rsc->cpp,
+ source, trans->base.stride,
+ box->width, box->height);
+ break;
+ case PIPE_FORMAT_RGTC2_UNORM:
+ case PIPE_FORMAT_RGTC2_SNORM:
+ case PIPE_FORMAT_LATC2_UNORM:
+ case PIPE_FORMAT_LATC2_SNORM:
+ util_format_rgtc2_unorm_unpack_rgba_8unorm(
+ data, slice->pitch * rsc->cpp,
+ source, trans->base.stride,
+ box->width, box->height);
+ break;
+ default:
+ assert(!"Unexpected format\n");
+ break;
+ }
+}
+
+static void
+fd_resource_flush(struct fd_transfer *trans, const struct pipe_box *box)
+{
+ enum pipe_format format = trans->base.resource->format;
+
+ switch (format) {
+ case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+ case PIPE_FORMAT_X32_S8X24_UINT:
+ fd_resource_flush_z32s8(trans, box);
+ break;
+ case PIPE_FORMAT_RGTC1_UNORM:
+ case PIPE_FORMAT_RGTC1_SNORM:
+ case PIPE_FORMAT_RGTC2_UNORM:
+ case PIPE_FORMAT_RGTC2_SNORM:
+ case PIPE_FORMAT_LATC1_UNORM:
+ case PIPE_FORMAT_LATC1_SNORM:
+ case PIPE_FORMAT_LATC2_UNORM:
+ case PIPE_FORMAT_LATC2_SNORM:
+ fd_resource_flush_rgtc(trans, box);
+ break;
+ default:
+ assert(!"Unexpected staging transfer type");
+ break;
+ }
+}
+
static void fd_resource_transfer_flush_region(struct pipe_context *pctx,
struct pipe_transfer *ptrans,
const struct pipe_box *box)
@@ -267,20 +342,15 @@ fd_resource_transfer_map(struct pipe_context *pctx,
return NULL;
}
- if (rsc->layer_first) {
- offset = slice->offset +
- box->y / util_format_get_blockheight(format) * ptrans->stride +
- box->x / util_format_get_blockwidth(format) * rsc->cpp +
- box->z * rsc->layer_size;
- } else {
- offset = slice->offset +
- box->y / util_format_get_blockheight(format) * ptrans->stride +
- box->x / util_format_get_blockwidth(format) * rsc->cpp +
- box->z * slice->size0;
- }
+ offset = slice->offset +
+ box->y / util_format_get_blockheight(format) * ptrans->stride +
+ box->x / util_format_get_blockwidth(format) * rsc->cpp +
+ fd_resource_layer_offset(rsc, slice, box->z);
if (prsc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT ||
prsc->format == PIPE_FORMAT_X32_S8X24_UINT) {
+ assert(trans->base.box.depth == 1);
+
trans->base.stride = trans->base.box.width * rsc->cpp * 2;
trans->staging = malloc(trans->base.stride * trans->base.box.height);
if (!trans->staging)
@@ -298,8 +368,10 @@ fd_resource_transfer_map(struct pipe_context *pctx,
goto fail;
float *depth = (float *)(buf + slice->offset +
+ fd_resource_layer_offset(rsc, slice, box->z) +
box->y * slice->pitch * 4 + box->x * 4);
uint8_t *stencil = sbuf + sslice->offset +
+ fd_resource_layer_offset(rsc->stencil, sslice, box->z) +
box->y * sslice->pitch + box->x;
if (format != PIPE_FORMAT_X32_S8X24_UINT)
@@ -316,6 +388,54 @@ fd_resource_transfer_map(struct pipe_context *pctx,
buf = trans->staging;
offset = 0;
+ } else if (rsc->internal_format != format &&
+ util_format_description(format)->layout == UTIL_FORMAT_LAYOUT_RGTC) {
+ assert(trans->base.box.depth == 1);
+
+ trans->base.stride = util_format_get_stride(
+ format, trans->base.box.width);
+ trans->staging = malloc(
+ util_format_get_2d_size(format, trans->base.stride,
+ trans->base.box.height));
+ if (!trans->staging)
+ goto fail;
+
+ /* if we're not discarding the whole range (or resource), we must copy
+ * the real data in.
+ */
+ if (!(usage & (PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
+ PIPE_TRANSFER_DISCARD_RANGE))) {
+ uint8_t *rgba8 = (uint8_t *)buf + slice->offset +
+ fd_resource_layer_offset(rsc, slice, box->z) +
+ box->y * slice->pitch * rsc->cpp + box->x * rsc->cpp;
+
+ switch (format) {
+ case PIPE_FORMAT_RGTC1_UNORM:
+ case PIPE_FORMAT_RGTC1_SNORM:
+ case PIPE_FORMAT_LATC1_UNORM:
+ case PIPE_FORMAT_LATC1_SNORM:
+ util_format_rgtc1_unorm_pack_rgba_8unorm(
+ trans->staging, trans->base.stride,
+ rgba8, slice->pitch * rsc->cpp,
+ box->width, box->height);
+ break;
+ case PIPE_FORMAT_RGTC2_UNORM:
+ case PIPE_FORMAT_RGTC2_SNORM:
+ case PIPE_FORMAT_LATC2_UNORM:
+ case PIPE_FORMAT_LATC2_SNORM:
+ util_format_rgtc2_unorm_pack_rgba_8unorm(
+ trans->staging, trans->base.stride,
+ rgba8, slice->pitch * rsc->cpp,
+ box->width, box->height);
+ break;
+ default:
+ assert(!"Unexpected format");
+ break;
+ }
+ }
+
+ buf = trans->staging;
+ offset = 0;
}
*pptrans = ptrans;
@@ -361,9 +481,10 @@ static const struct u_resource_vtbl fd_resource_vtbl = {
};
static uint32_t
-setup_slices(struct fd_resource *rsc, uint32_t alignment)
+setup_slices(struct fd_resource *rsc, uint32_t alignment, enum pipe_format format)
{
struct pipe_resource *prsc = &rsc->base.b;
+ enum util_format_layout layout = util_format_description(format)->layout;
uint32_t level, size = 0;
uint32_t width = prsc->width0;
uint32_t height = prsc->height0;
@@ -377,9 +498,13 @@ setup_slices(struct fd_resource *rsc, uint32_t alignment)
struct fd_resource_slice *slice = fd_resource_slice(rsc, level);
uint32_t blocks;
- slice->pitch = width = align(width, 32);
+ if (layout == UTIL_FORMAT_LAYOUT_ASTC)
+ slice->pitch = width =
+ util_align_npot(width, 32 * util_format_get_blockwidth(format));
+ else
+ slice->pitch = width = align(width, 32);
slice->offset = size;
- blocks = util_format_get_nblocks(prsc->format, width, height);
+ blocks = util_format_get_nblocks(format, width, height);
/* 1d array and 2d array textures must all have the same layer size
* for each miplevel on a3xx. 3d textures can have different layer
* sizes for high levels, but the hw auto-sizer is buggy (or at least
@@ -430,11 +555,12 @@ fd_resource_create(struct pipe_screen *pscreen,
{
struct fd_resource *rsc = CALLOC_STRUCT(fd_resource);
struct pipe_resource *prsc = &rsc->base.b;
- uint32_t size;
+ enum pipe_format format = tmpl->format;
+ uint32_t size, alignment;
DBG("target=%d, format=%s, %ux%ux%u, array_size=%u, last_level=%u, "
"nr_samples=%u, usage=%u, bind=%x, flags=%x",
- tmpl->target, util_format_name(tmpl->format),
+ tmpl->target, util_format_name(format),
tmpl->width0, tmpl->height0, tmpl->depth0,
tmpl->array_size, tmpl->last_level, tmpl->nr_samples,
tmpl->usage, tmpl->bind, tmpl->flags);
@@ -451,13 +577,18 @@ fd_resource_create(struct pipe_screen *pscreen,
util_range_init(&rsc->valid_buffer_range);
rsc->base.vtbl = &fd_resource_vtbl;
- if (tmpl->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
- rsc->cpp = util_format_get_blocksize(PIPE_FORMAT_Z32_FLOAT);
- else
- rsc->cpp = util_format_get_blocksize(tmpl->format);
+
+ if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
+ format = PIPE_FORMAT_Z32_FLOAT;
+ else if (fd_screen(pscreen)->gpu_id < 400 &&
+ util_format_description(format)->layout == UTIL_FORMAT_LAYOUT_RGTC)
+ format = PIPE_FORMAT_R8G8B8A8_UNORM;
+ rsc->internal_format = format;
+ rsc->cpp = util_format_get_blocksize(format);
assert(rsc->cpp);
+ alignment = slice_alignment(pscreen, tmpl);
if (is_a4xx(fd_screen(pscreen))) {
switch (tmpl->target) {
case PIPE_TEXTURE_3D:
@@ -465,11 +596,12 @@ fd_resource_create(struct pipe_screen *pscreen,
break;
default:
rsc->layer_first = true;
+ alignment = 1;
break;
}
}
- size = setup_slices(rsc, slice_alignment(pscreen, tmpl));
+ size = setup_slices(rsc, alignment, format);
if (rsc->layer_first) {
rsc->layer_size = align(size, 4096);
@@ -548,7 +680,7 @@ fail:
return NULL;
}
-static void fd_blitter_pipe_begin(struct fd_context *ctx);
+static void fd_blitter_pipe_begin(struct fd_context *ctx, bool render_cond);
static void fd_blitter_pipe_end(struct fd_context *ctx);
/**
@@ -570,7 +702,7 @@ fd_blitter_pipe_copy_region(struct fd_context *ctx,
if (!util_blitter_is_copy_supported(ctx->blitter, dst, src))
return false;
- fd_blitter_pipe_begin(ctx);
+ fd_blitter_pipe_begin(ctx, false);
util_blitter_copy_texture(ctx->blitter,
dst, dst_level, dstx, dsty, dstz,
src, src_level, src_box);
@@ -612,6 +744,25 @@ fd_resource_copy_region(struct pipe_context *pctx,
src, src_level, src_box);
}
+bool
+fd_render_condition_check(struct pipe_context *pctx)
+{
+ struct fd_context *ctx = fd_context(pctx);
+
+ if (!ctx->cond_query)
+ return true;
+
+ union pipe_query_result res = { 0 };
+ bool wait =
+ ctx->cond_mode != PIPE_RENDER_COND_NO_WAIT &&
+ ctx->cond_mode != PIPE_RENDER_COND_BY_REGION_NO_WAIT;
+
+ if (pctx->get_query_result(pctx, ctx->cond_query, wait, &res))
+ return (bool)res.u64 != ctx->cond_cond;
+
+ return true;
+}
+
/**
* Optimal hardware path for blitting pixels.
* Scaling, format conversion, up- and downsampling (resolve) are allowed.
@@ -630,6 +781,9 @@ fd_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info)
return;
}
+ if (info.render_condition_enable && !fd_render_condition_check(pctx))
+ return;
+
if (util_try_blit_via_copy_region(pctx, &info)) {
return; /* done */
}
@@ -646,13 +800,13 @@ fd_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info)
return;
}
- fd_blitter_pipe_begin(ctx);
+ fd_blitter_pipe_begin(ctx, info.render_condition_enable);
util_blitter_blit(ctx->blitter, &info);
fd_blitter_pipe_end(ctx);
}
static void
-fd_blitter_pipe_begin(struct fd_context *ctx)
+fd_blitter_pipe_begin(struct fd_context *ctx, bool render_cond)
{
util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vtx.vertexbuf.vb);
util_blitter_save_vertex_elements(ctx->blitter, ctx->vtx.vtx);
@@ -673,6 +827,9 @@ fd_blitter_pipe_begin(struct fd_context *ctx)
(void **)ctx->fragtex.samplers);
util_blitter_save_fragment_sampler_views(ctx->blitter,
ctx->fragtex.num_textures, ctx->fragtex.textures);
+ if (!render_cond)
+ util_blitter_save_render_condition(ctx->blitter,
+ ctx->cond_query, ctx->cond_cond, ctx->cond_mode);
fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_BLIT);
}
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.h b/src/gallium/drivers/freedreno/freedreno_resource.h
index 7549becaa1f..9a9b0d08244 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.h
+++ b/src/gallium/drivers/freedreno/freedreno_resource.h
@@ -73,6 +73,7 @@ struct fd_resource {
struct u_resource base;
struct fd_bo *bo;
uint32_t cpp;
+ enum pipe_format internal_format;
bool layer_first; /* see above description */
uint32_t layer_size;
struct fd_resource_slice slices[MAX_MIP_LEVELS];
@@ -135,4 +136,6 @@ fd_resource_offset(struct fd_resource *rsc, unsigned level, unsigned layer)
void fd_resource_screen_init(struct pipe_screen *pscreen);
void fd_resource_context_init(struct pipe_context *pctx);
+bool fd_render_condition_check(struct pipe_context *pctx);
+
#endif /* FREEDRENO_RESOURCE_H_ */
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 56d1834ef9c..5bbe4016a2a 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -160,11 +160,9 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_SHADER_STENCIL_EXPORT:
case PIPE_CAP_TGSI_TEXCOORD:
case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
- case PIPE_CAP_CONDITIONAL_RENDER:
case PIPE_CAP_TEXTURE_MULTISAMPLE:
case PIPE_CAP_TEXTURE_BARRIER:
case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
- case PIPE_CAP_START_INSTANCE:
case PIPE_CAP_COMPUTE:
return 0;
@@ -176,27 +174,31 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_INDEP_BLEND_FUNC:
case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+ case PIPE_CAP_CONDITIONAL_RENDER:
+ case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
+ case PIPE_CAP_FAKE_SW_MSAA:
+ case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
+ case PIPE_CAP_DEPTH_CLIP_DISABLE:
+ case PIPE_CAP_CLIP_HALFZ:
return is_a3xx(screen) || is_a4xx(screen);
case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
- /* ignoring first/last_element.. but I guess that should be
- * easy to add..
- */
+ if (is_a3xx(screen)) return 16;
+ if (is_a4xx(screen)) return 32;
return 0;
case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
- /* I think 32k on a4xx.. and we could possibly emulate more
- * by pretending 2d/rect textures and splitting high bits
- * of index into 2nd dimension..
+ /* We could possibly emulate more by pretending 2d/rect textures and
+ * splitting high bits of index into 2nd dimension..
*/
- return 16383;
-
- case PIPE_CAP_DEPTH_CLIP_DISABLE:
- case PIPE_CAP_CLIP_HALFZ:
- case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
- return is_a3xx(screen);
+ if (is_a3xx(screen)) return 8192;
+ if (is_a4xx(screen)) return 16384;
+ return 0;
case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
case PIPE_CAP_CUBE_MAP_ARRAY:
+ case PIPE_CAP_START_INSTANCE:
+ case PIPE_CAP_SAMPLER_VIEW_TARGET:
+ case PIPE_CAP_TEXTURE_QUERY_LOD:
return is_a4xx(screen);
case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
@@ -205,7 +207,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_GLSL_FEATURE_LEVEL:
if (glsl120)
return 120;
- return is_ir3(screen) ? 130 : 120;
+ return is_ir3(screen) ? 140 : 120;
/* Unsupported features. */
case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
@@ -220,15 +222,11 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
case PIPE_CAP_TEXTURE_GATHER_SM5:
- case PIPE_CAP_FAKE_SW_MSAA:
- case PIPE_CAP_TEXTURE_QUERY_LOD:
case PIPE_CAP_SAMPLE_SHADING:
case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
case PIPE_CAP_DRAW_INDIRECT:
case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
- case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
- case PIPE_CAP_SAMPLER_VIEW_TARGET:
case PIPE_CAP_POLYGON_OFFSET_CLAMP:
case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
diff --git a/src/gallium/drivers/freedreno/freedreno_texture.c b/src/gallium/drivers/freedreno/freedreno_texture.c
index 04e4643b4c9..f5611abaec8 100644
--- a/src/gallium/drivers/freedreno/freedreno_texture.c
+++ b/src/gallium/drivers/freedreno/freedreno_texture.c
@@ -197,33 +197,15 @@ fd_setup_border_colors(struct fd_texture_stateobj *tex, void *ptr,
continue;
const struct util_format_channel_description *chan =
- &desc->channel[desc->swizzle[j]];
- int size = chan->size;
-
- /* The Z16 texture format we use seems to look in the
- * 32-bit border color slots
- */
- if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS)
- size = 32;
-
- /* Formats like R11G11B10 or RGB9_E5 don't specify
- * per-channel sizes properly.
- */
- if (desc->layout == UTIL_FORMAT_LAYOUT_OTHER)
- size = 16;
-
- if (chan->pure_integer && size > 16)
- bcolor32[desc->swizzle[j] + 4] =
- sampler->border_color.i[j];
- else if (size > 16)
- bcolor32[desc->swizzle[j]] =
- fui(sampler->border_color.f[j]);
- else if (chan->pure_integer)
- bcolor[desc->swizzle[j] + 8] =
- sampler->border_color.i[j];
- else
+ &desc->channel[desc->swizzle[j]];
+ if (chan->pure_integer) {
+ bcolor32[desc->swizzle[j] + 4] = sampler->border_color.i[j];
+ bcolor[desc->swizzle[j] + 8] = sampler->border_color.i[j];
+ } else {
+ bcolor32[desc->swizzle[j]] = fui(sampler->border_color.f[j]);
bcolor[desc->swizzle[j]] =
- util_float_to_half(sampler->border_color.f[j]);
+ util_float_to_half(sampler->border_color.f[j]);
+ }
}
}
}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 157dc73a3c6..156bb0be247 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1177,6 +1177,33 @@ emit_alu(struct ir3_compile *ctx, nir_alu_instr *alu)
dst[0] = ir3_SEL_B32(b, src[1], 0, ir3_b2n(b, src[0]), 0, src[2], 0);
break;
+ case nir_op_bit_count:
+ dst[0] = ir3_CBITS_B(b, src[0], 0);
+ break;
+ case nir_op_ifind_msb: {
+ struct ir3_instruction *cmp;
+ dst[0] = ir3_CLZ_S(b, src[0], 0);
+ cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
+ cmp->cat2.condition = IR3_COND_GE;
+ dst[0] = ir3_SEL_B32(b,
+ ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
+ cmp, 0, dst[0], 0);
+ break;
+ }
+ case nir_op_ufind_msb:
+ dst[0] = ir3_CLZ_B(b, src[0], 0);
+ dst[0] = ir3_SEL_B32(b,
+ ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
+ src[0], 0, dst[0], 0);
+ break;
+ case nir_op_find_lsb:
+ dst[0] = ir3_BFREV_B(b, src[0], 0);
+ dst[0] = ir3_CLZ_B(b, dst[0], 0);
+ break;
+ case nir_op_bitfield_reverse:
+ dst[0] = ir3_BFREV_B(b, src[0], 0);
+ break;
+
default:
compile_error(ctx, "Unhandled ALU op: %s\n",
nir_op_infos[alu->op].name);
@@ -1547,10 +1574,10 @@ tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
unreachable("bad sampler_dim");
}
- if (tex->is_shadow)
+ if (tex->is_shadow && tex->op != nir_texop_lod)
flags |= IR3_INSTR_S;
- if (tex->is_array)
+ if (tex->is_array && tex->op != nir_texop_lod)
flags |= IR3_INSTR_A;
*flagsp = flags;
@@ -1618,12 +1645,13 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
case nir_texop_txl: opc = OPC_SAML; break;
case nir_texop_txd: opc = OPC_SAMGQ; break;
case nir_texop_txf: opc = OPC_ISAML; break;
+ case nir_texop_lod: opc = OPC_GETLOD; break;
case nir_texop_txf_ms:
case nir_texop_txs:
- case nir_texop_lod:
case nir_texop_tg4:
case nir_texop_query_levels:
case nir_texop_texture_samples:
+ case nir_texop_samples_identical:
compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
return;
}
@@ -1665,10 +1693,10 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
src0[nsrc0++] = create_immed(b, fui(0.5));
}
- if (tex->is_shadow)
+ if (tex->is_shadow && tex->op != nir_texop_lod)
src0[nsrc0++] = compare;
- if (tex->is_array)
+ if (tex->is_array && tex->op != nir_texop_lod)
src0[nsrc0++] = coord[coords];
if (has_proj) {
@@ -1717,7 +1745,7 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
case nir_type_int:
type = TYPE_S32;
break;
- case nir_type_unsigned:
+ case nir_type_uint:
case nir_type_bool:
type = TYPE_U32;
break;
@@ -1725,12 +1753,26 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
unreachable("bad dest_type");
}
+ if (opc == OPC_GETLOD)
+ type = TYPE_U32;
+
sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_XYZW,
flags, tex->sampler_index, tex->sampler_index,
create_collect(b, src0, nsrc0),
create_collect(b, src1, nsrc1));
split_dest(b, dst, sam, 4);
+
+ /* GETLOD returns results in 4.8 fixed point */
+ if (opc == OPC_GETLOD) {
+ struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256));
+
+ compile_assert(ctx, tex->dest_type == nir_type_float);
+ for (i = 0; i < 2; i++) {
+ dst[i] = ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_U32, TYPE_F32), 0,
+ factor, 0);
+ }
+ }
}
static void
@@ -1889,6 +1931,8 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr)
case nir_texop_query_levels:
emit_tex_query_levels(ctx, tex);
break;
+ case nir_texop_samples_identical:
+ unreachable("nir_texop_samples_identical");
default:
emit_tex(ctx, tex);
break;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index 7e2c27d9765..5d1cccb0daa 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -166,7 +166,9 @@ struct ir3_shader_variant {
} outputs[16 + 2]; /* +POSITION +PSIZE */
bool writes_pos, writes_psize;
- /* vertices/inputs: */
+ /* attributes (VS) / varyings (FS):
+ * Note that sysval's should come *after* normal inputs.
+ */
unsigned inputs_count;
struct {
uint8_t slot;
@@ -229,7 +231,7 @@ struct ir3_shader {
struct ir3_compiler *compiler;
- struct pipe_context *pctx;
+ struct pipe_context *pctx; /* TODO replace w/ pipe_screen */
const struct tgsi_token *tokens;
struct pipe_stream_output_info stream_output;
diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources
index 83f81135590..31a93659647 100644
--- a/src/gallium/drivers/nouveau/Makefile.sources
+++ b/src/gallium/drivers/nouveau/Makefile.sources
@@ -64,6 +64,8 @@ NV50_C_SOURCES := \
nv50/nv50_3ddefs.xml.h \
nv50/nv50_3d.xml.h \
nv50/nv50_blit.h \
+ nv50/nv50_compute.c \
+ nv50/nv50_compute.xml.h \
nv50/nv50_context.c \
nv50/nv50_context.h \
nv50/nv50_defs.xml.h \
@@ -76,6 +78,10 @@ NV50_C_SOURCES := \
nv50/nv50_query.h \
nv50/nv50_query_hw.c \
nv50/nv50_query_hw.h \
+ nv50/nv50_query_hw_metric.c \
+ nv50/nv50_query_hw_metric.h \
+ nv50/nv50_query_hw_sm.c \
+ nv50/nv50_query_hw_sm.h \
nv50/nv50_resource.c \
nv50/nv50_resource.h \
nv50/nv50_screen.c \
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index 2a13e1086a0..9f84de03a4a 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -2357,6 +2357,9 @@ CodeEmitterNVC0::emitInstruction(Instruction *insn)
case OP_PFETCH:
emitPFETCH(insn);
break;
+ case OP_AFETCH:
+ emitAFETCH(insn);
+ break;
case OP_EMIT:
case OP_RESTART:
emitOUT(insn);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index 7859c8e79bd..41d2cc9167c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -1573,10 +1573,28 @@ SpillCodeInserter::spill(Instruction *defi, Value *slot, LValue *lval)
Instruction *st;
if (slot->reg.file == FILE_MEMORY_LOCAL) {
- st = new_Instruction(func, OP_STORE, ty);
- st->setSrc(0, slot);
- st->setSrc(1, lval);
lval->noSpill = 1;
+ if (ty != TYPE_B96) {
+ st = new_Instruction(func, OP_STORE, ty);
+ st->setSrc(0, slot);
+ st->setSrc(1, lval);
+ } else {
+ st = new_Instruction(func, OP_SPLIT, ty);
+ st->setSrc(0, lval);
+ for (int d = 0; d < lval->reg.size / 4; ++d)
+ st->setDef(d, new_LValue(func, FILE_GPR));
+
+ for (int d = lval->reg.size / 4 - 1; d >= 0; --d) {
+ Value *tmp = cloneShallow(func, slot);
+ tmp->reg.size = 4;
+ tmp->reg.data.offset += 4 * d;
+
+ Instruction *s = new_Instruction(func, OP_STORE, TYPE_U32);
+ s->setSrc(0, tmp);
+ s->setSrc(1, st->getDef(d));
+ defi->bb->insertAfter(defi, s);
+ }
+ }
} else {
st = new_Instruction(func, OP_CVT, ty);
st->setDef(0, slot);
@@ -1596,7 +1614,27 @@ SpillCodeInserter::unspill(Instruction *usei, LValue *lval, Value *slot)
Instruction *ld;
if (slot->reg.file == FILE_MEMORY_LOCAL) {
lval->noSpill = 1;
- ld = new_Instruction(func, OP_LOAD, ty);
+ if (ty != TYPE_B96) {
+ ld = new_Instruction(func, OP_LOAD, ty);
+ } else {
+ ld = new_Instruction(func, OP_MERGE, ty);
+ for (int d = 0; d < lval->reg.size / 4; ++d) {
+ Value *tmp = cloneShallow(func, slot);
+ LValue *val;
+ tmp->reg.size = 4;
+ tmp->reg.data.offset += 4 * d;
+
+ Instruction *l = new_Instruction(func, OP_LOAD, TYPE_U32);
+ l->setDef(0, (val = new_LValue(func, FILE_GPR)));
+ l->setSrc(0, tmp);
+ usei->bb->insertBefore(usei, l);
+ ld->setSrc(d, val);
+ val->noSpill = 1;
+ }
+ ld->setDef(0, lval);
+ usei->bb->insertBefore(usei, ld);
+ return lval;
+ }
} else {
ld = new_Instruction(func, OP_CVT, ty);
}
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c
index 68e69beb08f..1695553d793 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -657,8 +657,8 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
if (buffer->base.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
PIPE_RESOURCE_FLAG_MAP_COHERENT)) {
buffer->domain = NOUVEAU_BO_GART;
- } else if (buffer->base.bind &
- (screen->vidmem_bindings & screen->sysmem_bindings)) {
+ } else if (buffer->base.bind == 0 || (buffer->base.bind &
+ (screen->vidmem_bindings & screen->sysmem_bindings))) {
switch (buffer->base.usage) {
case PIPE_USAGE_DEFAULT:
case PIPE_USAGE_IMMUTABLE:
@@ -685,6 +685,10 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
if (buffer->base.bind & screen->sysmem_bindings)
buffer->domain = NOUVEAU_BO_GART;
}
+ /* There can be very special situations where we want non-gpu-mapped
+ * buffers, but never through this interface.
+ */
+ assert(buffer->domain);
ret = nouveau_buffer_allocate(screen, buffer, buffer->domain);
if (ret == false)
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_compute.c b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
new file mode 100644
index 00000000000..6d23fd66945
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright 2012 Francisco Jerez
+ * Copyright 2015 Samuel Pitoiset
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "nv50/nv50_context.h"
+#include "nv50/nv50_compute.xml.h"
+
+#include "codegen/nv50_ir_driver.h"
+
+int
+nv50_screen_compute_setup(struct nv50_screen *screen,
+ struct nouveau_pushbuf *push)
+{
+ struct nouveau_device *dev = screen->base.device;
+ struct nouveau_object *chan = screen->base.channel;
+ struct nv04_fifo *fifo = (struct nv04_fifo *)chan->data;
+ unsigned obj_class;
+ int i, ret;
+
+ switch (dev->chipset & 0xf0) {
+ case 0x50:
+ case 0x80:
+ case 0x90:
+ obj_class = NV50_COMPUTE_CLASS;
+ break;
+ case 0xa0:
+ switch (dev->chipset) {
+ case 0xa3:
+ case 0xa5:
+ case 0xa8:
+ obj_class = NVA3_COMPUTE_CLASS;
+ break;
+ default:
+ obj_class = NV50_COMPUTE_CLASS;
+ break;
+ }
+ break;
+ default:
+ NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);
+ return -1;
+ }
+
+ ret = nouveau_object_new(chan, 0xbeef50c0, obj_class, NULL, 0,
+ &screen->compute);
+ if (ret)
+ return ret;
+
+ BEGIN_NV04(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1);
+ PUSH_DATA (push, screen->compute->handle);
+
+ BEGIN_NV04(push, NV50_COMPUTE(UNK02A0), 1);
+ PUSH_DATA (push, 1);
+ BEGIN_NV04(push, NV50_COMPUTE(DMA_STACK), 1);
+ PUSH_DATA (push, fifo->vram);
+ BEGIN_NV04(push, NV50_COMPUTE(STACK_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, screen->stack_bo->offset);
+ PUSH_DATA (push, screen->stack_bo->offset);
+ BEGIN_NV04(push, NV50_COMPUTE(STACK_SIZE_LOG), 1);
+ PUSH_DATA (push, 4);
+
+ BEGIN_NV04(push, NV50_COMPUTE(UNK0290), 1);
+ PUSH_DATA (push, 1);
+ BEGIN_NV04(push, NV50_COMPUTE(LANES32_ENABLE), 1);
+ PUSH_DATA (push, 1);
+ BEGIN_NV04(push, NV50_COMPUTE(REG_MODE), 1);
+ PUSH_DATA (push, NV50_COMPUTE_REG_MODE_STRIPED);
+ BEGIN_NV04(push, NV50_COMPUTE(UNK0384), 1);
+ PUSH_DATA (push, 0x100);
+ BEGIN_NV04(push, NV50_COMPUTE(DMA_GLOBAL), 1);
+ PUSH_DATA (push, fifo->vram);
+
+ for (i = 0; i < 15; i++) {
+ BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(i)), 2);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(i)), 1);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(i)), 1);
+ PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
+ }
+
+ BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(15)), 2);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(15)), 1);
+ PUSH_DATA (push, ~0);
+ BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(15)), 1);
+ PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
+
+ BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_LOG_ALLOC), 1);
+ PUSH_DATA (push, 7);
+ BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_NO_CLAMP), 1);
+ PUSH_DATA (push, 1);
+ BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_LOG_ALLOC), 1);
+ PUSH_DATA (push, 7);
+ BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_NO_CLAMP), 1);
+ PUSH_DATA (push, 1);
+ BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1);
+ PUSH_DATA (push, 0);
+
+ BEGIN_NV04(push, NV50_COMPUTE(DMA_TEXTURE), 1);
+ PUSH_DATA (push, fifo->vram);
+ BEGIN_NV04(push, NV50_COMPUTE(TEX_LIMITS), 1);
+ PUSH_DATA (push, 0x54);
+ BEGIN_NV04(push, NV50_COMPUTE(LINKED_TSC), 1);
+ PUSH_DATA (push, 0);
+
+ BEGIN_NV04(push, NV50_COMPUTE(DMA_TIC), 1);
+ PUSH_DATA (push, fifo->vram);
+ BEGIN_NV04(push, NV50_COMPUTE(TIC_ADDRESS_HIGH), 3);
+ PUSH_DATAh(push, screen->txc->offset);
+ PUSH_DATA (push, screen->txc->offset);
+ PUSH_DATA (push, NV50_TIC_MAX_ENTRIES - 1);
+
+ BEGIN_NV04(push, NV50_COMPUTE(DMA_TSC), 1);
+ PUSH_DATA (push, fifo->vram);
+ BEGIN_NV04(push, NV50_COMPUTE(TSC_ADDRESS_HIGH), 3);
+ PUSH_DATAh(push, screen->txc->offset + 65536);
+ PUSH_DATA (push, screen->txc->offset + 65536);
+ PUSH_DATA (push, NV50_TSC_MAX_ENTRIES - 1);
+
+ BEGIN_NV04(push, NV50_COMPUTE(DMA_CODE_CB), 1);
+ PUSH_DATA (push, fifo->vram);
+
+ BEGIN_NV04(push, NV50_COMPUTE(DMA_LOCAL), 1);
+ PUSH_DATA (push, fifo->vram);
+ BEGIN_NV04(push, NV50_COMPUTE(LOCAL_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, screen->tls_bo->offset + 65536);
+ PUSH_DATA (push, screen->tls_bo->offset + 65536);
+ BEGIN_NV04(push, NV50_COMPUTE(LOCAL_SIZE_LOG), 1);
+ PUSH_DATA (push, util_logbase2((screen->max_tls_space / ONE_TEMP_SIZE) * 2));
+
+ return 0;
+}
+
+static bool
+nv50_compute_validate_program(struct nv50_context *nv50)
+{
+ struct nv50_program *prog = nv50->compprog;
+
+ if (prog->mem)
+ return true;
+
+ if (!prog->translated) {
+ prog->translated = nv50_program_translate(
+ prog, nv50->screen->base.device->chipset, &nv50->base.debug);
+ if (!prog->translated)
+ return false;
+ }
+ if (unlikely(!prog->code_size))
+ return false;
+
+ if (likely(prog->code_size)) {
+ if (nv50_program_upload_code(nv50, prog)) {
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ BEGIN_NV04(push, NV50_COMPUTE(CODE_CB_FLUSH), 1);
+ PUSH_DATA (push, 0);
+ return true;
+ }
+ }
+ return false;
+}
+
+static void
+nv50_compute_validate_globals(struct nv50_context *nv50)
+{
+ unsigned i;
+
+ for (i = 0; i < nv50->global_residents.size / sizeof(struct pipe_resource *);
+ ++i) {
+ struct pipe_resource *res = *util_dynarray_element(
+ &nv50->global_residents, struct pipe_resource *, i);
+ if (res)
+ nv50_add_bufctx_resident(nv50->bufctx_cp, NV50_BIND_CP_GLOBAL,
+ nv04_resource(res), NOUVEAU_BO_RDWR);
+ }
+}
+
+static bool
+nv50_compute_state_validate(struct nv50_context *nv50)
+{
+ if (!nv50_compute_validate_program(nv50))
+ return false;
+
+ if (nv50->dirty_cp & NV50_NEW_CP_GLOBALS)
+ nv50_compute_validate_globals(nv50);
+
+ /* TODO: validate textures, samplers, surfaces */
+
+ nv50_bufctx_fence(nv50->bufctx_cp, false);
+
+ nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx_cp);
+ if (unlikely(nouveau_pushbuf_validate(nv50->base.pushbuf)))
+ return false;
+ if (unlikely(nv50->state.flushed))
+ nv50_bufctx_fence(nv50->bufctx_cp, true);
+
+ return true;
+}
+
+static void
+nv50_compute_upload_input(struct nv50_context *nv50, const uint32_t *input)
+{
+ struct nv50_screen *screen = nv50->screen;
+ struct nouveau_pushbuf *push = screen->base.pushbuf;
+ unsigned size = align(nv50->compprog->parm_size, 0x4);
+
+ BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1);
+ PUSH_DATA (push, (size / 4) << 8);
+
+ if (size) {
+ struct nouveau_mm_allocation *mm;
+ struct nouveau_bo *bo = NULL;
+ unsigned offset;
+
+ mm = nouveau_mm_allocate(screen->base.mm_GART, size, &bo, &offset);
+ assert(mm);
+
+ nouveau_bo_map(bo, 0, screen->base.client);
+ memcpy(bo->map + offset, input, size);
+
+ nouveau_bufctx_refn(nv50->bufctx, 0, bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+ nouveau_pushbuf_bufctx(push, nv50->bufctx);
+ nouveau_pushbuf_validate(push);
+
+ BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM(0)), size / 4);
+ nouveau_pushbuf_data(push, bo, offset, size);
+
+ nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work, mm);
+ nouveau_bo_ref(NULL, &bo);
+ nouveau_bufctx_reset(nv50->bufctx, 0);
+ }
+}
+
+static uint32_t
+nv50_compute_find_symbol(struct nv50_context *nv50, uint32_t label)
+{
+ struct nv50_program *prog = nv50->compprog;
+ const struct nv50_ir_prog_symbol *syms =
+ (const struct nv50_ir_prog_symbol *)prog->cp.syms;
+ unsigned i;
+
+ for (i = 0; i < prog->cp.num_syms; ++i) {
+ if (syms[i].label == label)
+ return prog->code_base + syms[i].offset;
+ }
+ return prog->code_base; /* no symbols or symbol not found */
+}
+
+void
+nv50_launch_grid(struct pipe_context *pipe,
+ const uint *block_layout, const uint *grid_layout,
+ uint32_t label, const void *input)
+{
+ struct nv50_context *nv50 = nv50_context(pipe);
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ unsigned block_size = block_layout[0] * block_layout[1] * block_layout[2];
+ struct nv50_program *cp = nv50->compprog;
+ bool ret;
+
+ ret = !nv50_compute_state_validate(nv50);
+ if (ret) {
+ NOUVEAU_ERR("Failed to launch grid !\n");
+ return;
+ }
+
+ nv50_compute_upload_input(nv50, input);
+
+ BEGIN_NV04(push, NV50_COMPUTE(CP_START_ID), 1);
+ PUSH_DATA (push, nv50_compute_find_symbol(nv50, label));
+
+ BEGIN_NV04(push, NV50_COMPUTE(SHARED_SIZE), 1);
+ PUSH_DATA (push, align(cp->cp.smem_size + cp->parm_size + 0x10, 0x40));
+ BEGIN_NV04(push, NV50_COMPUTE(CP_REG_ALLOC_TEMP), 1);
+ PUSH_DATA (push, cp->max_gpr);
+
+ /* grid/block setup */
+ BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_XY), 2);
+ PUSH_DATA (push, block_layout[1] << 16 | block_layout[0]);
+ PUSH_DATA (push, block_layout[2]);
+ BEGIN_NV04(push, NV50_COMPUTE(BLOCK_ALLOC), 1);
+ PUSH_DATA (push, 1 << 16 | block_size);
+ BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_LATCH), 1);
+ PUSH_DATA (push, 1);
+ BEGIN_NV04(push, NV50_COMPUTE(GRIDDIM), 1);
+ PUSH_DATA (push, grid_layout[1] << 16 | grid_layout[0]);
+ BEGIN_NV04(push, NV50_COMPUTE(GRIDID), 1);
+ PUSH_DATA (push, 1);
+
+ /* kernel launching */
+ BEGIN_NV04(push, NV50_COMPUTE(LAUNCH), 1);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
+ PUSH_DATA (push, 0);
+
+ /* binding a compute shader clobbers fragment shader state */
+ nv50->dirty |= NV50_NEW_FRAGPROG;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_compute.xml.h b/src/gallium/drivers/nouveau/nv50/nv50_compute.xml.h
new file mode 100644
index 00000000000..268d11253b6
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_compute.xml.h
@@ -0,0 +1,444 @@
+#ifndef NV50_COMPUTE_XML
+#define NV50_COMPUTE_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://github.com/envytools/envytools/
+git clone https://github.com/envytools/envytools.git
+
+The rules-ng-ng source files this header was generated from are:
+- rnndb/graph/g80_compute.xml ( 14027 bytes, from 2015-02-14 02:01:36)
+- rnndb/copyright.xml ( 6456 bytes, from 2015-02-14 02:01:36)
+- rnndb/nvchipsets.xml ( 2833 bytes, from 2015-04-28 16:28:33)
+- rnndb/fifo/nv_object.xml ( 15390 bytes, from 2015-04-22 20:36:09)
+- rnndb/g80_defs.xml ( 18210 bytes, from 2015-10-19 20:49:59)
+
+Copyright (C) 2006-2015 by the following authors:
+- Artur Huillet <[email protected]> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <[email protected]> (koala_br)
+- Carlos Martin <[email protected]> (carlosmn)
+- Christoph Bumiller <[email protected]> (calim, chrisbmr)
+- Dawid Gajownik <[email protected]> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <[email protected]> (lumag)
+- EdB <[email protected]> (edb_)
+- Erik Waling <[email protected]> (erikwaling)
+- Francisco Jerez <[email protected]> (curro)
+- Ilia Mirkin <[email protected]> (imirkin)
+- jb17bsome <[email protected]> (jb17bsome)
+- Jeremy Kolb <[email protected]> (kjeremy)
+- Laurent Carlier <[email protected]> (lordheavy)
+- Luca Barbieri <[email protected]> (lb, lb1)
+- Maarten Maathuis <[email protected]> (stillunknown)
+- Marcin Kościelnicki <[email protected]> (mwk, koriakin)
+- Mark Carey <[email protected]> (careym)
+- Matthieu Castet <[email protected]> (mat-c)
+- nvidiaman <[email protected]> (nvidiaman)
+- Patrice Mandin <[email protected]> (pmandin, pmdata)
+- Pekka Paalanen <[email protected]> (pq, ppaalanen)
+- Peter Popov <[email protected]> (ironpeter)
+- Richard Hughes <[email protected]> (hughsient)
+- Rudi Cilibrasi <[email protected]> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <[email protected]> (leroutier)
+- Stephane Marchesin <[email protected]> (marcheu)
+- sturmflut <[email protected]> (sturmflut)
+- Sylvain Munaut <[email protected]>
+- Victor Stinner <[email protected]> (haypo)
+- Wladmir van der Laan <[email protected]> (miathan6)
+- Younes Manton <[email protected]> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+
+#define NV50_COMPUTE_DMA_NOTIFY 0x00000180
+
+#define NV50_COMPUTE_DMA_GLOBAL 0x000001a0
+
+#define NV50_COMPUTE_DMA_QUERY 0x000001a4
+
+#define NV50_COMPUTE_DMA_LOCAL 0x000001b8
+
+#define NV50_COMPUTE_DMA_STACK 0x000001bc
+
+#define NV50_COMPUTE_DMA_CODE_CB 0x000001c0
+
+#define NV50_COMPUTE_DMA_TSC 0x000001c4
+
+#define NV50_COMPUTE_DMA_TIC 0x000001c8
+
+#define NV50_COMPUTE_DMA_TEXTURE 0x000001cc
+
+#define NV50_COMPUTE_UNK0200 0x00000200
+#define NV50_COMPUTE_UNK0200_UNK1__MASK 0x0000ffff
+#define NV50_COMPUTE_UNK0200_UNK1__SHIFT 0
+#define NV50_COMPUTE_UNK0200_UNK2__MASK 0x00ff0000
+#define NV50_COMPUTE_UNK0200_UNK2__SHIFT 16
+
+#define NV50_COMPUTE_UNK0204 0x00000204
+
+#define NV50_COMPUTE_UNK0208 0x00000208
+
+#define NV50_COMPUTE_UNK020C 0x0000020c
+
+#define NV50_COMPUTE_CP_ADDRESS_HIGH 0x00000210
+
+#define NV50_COMPUTE_CP_ADDRESS_LOW 0x00000214
+
+#define NV50_COMPUTE_STACK_ADDRESS_HIGH 0x00000218
+
+#define NV50_COMPUTE_STACK_ADDRESS_LOW 0x0000021c
+
+#define NV50_COMPUTE_STACK_SIZE_LOG 0x00000220
+
+#define NV50_COMPUTE_CALL_LIMIT_LOG 0x00000224
+
+#define NV50_COMPUTE_UNK0228 0x00000228
+#define NV50_COMPUTE_UNK0228_UNK0 0x00000001
+#define NV50_COMPUTE_UNK0228_UNK4__MASK 0x00000ff0
+#define NV50_COMPUTE_UNK0228_UNK4__SHIFT 4
+#define NV50_COMPUTE_UNK0228_UNK12__MASK 0x000ff000
+#define NV50_COMPUTE_UNK0228_UNK12__SHIFT 12
+
+#define NV50_COMPUTE_TSC_ADDRESS_HIGH 0x0000022c
+
+#define NV50_COMPUTE_TSC_ADDRESS_LOW 0x00000230
+#define NV50_COMPUTE_TSC_ADDRESS_LOW__ALIGN 0x00000020
+
+#define NV50_COMPUTE_TSC_LIMIT 0x00000234
+#define NV50_COMPUTE_TSC_LIMIT__MAX 0x00001fff
+
+#define NV50_COMPUTE_CB_ADDR 0x00000238
+#define NV50_COMPUTE_CB_ADDR_ID__MASK 0x003fff00
+#define NV50_COMPUTE_CB_ADDR_ID__SHIFT 8
+#define NV50_COMPUTE_CB_ADDR_BUFFER__MASK 0x0000007f
+#define NV50_COMPUTE_CB_ADDR_BUFFER__SHIFT 0
+
+#define NV50_COMPUTE_CB_DATA(i0) (0x0000023c + 0x4*(i0))
+#define NV50_COMPUTE_CB_DATA__ESIZE 0x00000004
+#define NV50_COMPUTE_CB_DATA__LEN 0x00000010
+
+#define NV50_COMPUTE_TSC_FLUSH 0x0000027c
+#define NV50_COMPUTE_TSC_FLUSH_SPECIFIC 0x00000001
+#define NV50_COMPUTE_TSC_FLUSH_ENTRY__MASK 0x03fffff0
+#define NV50_COMPUTE_TSC_FLUSH_ENTRY__SHIFT 4
+
+#define NV50_COMPUTE_TIC_FLUSH 0x00000280
+#define NV50_COMPUTE_TIC_FLUSH_SPECIFIC 0x00000001
+#define NV50_COMPUTE_TIC_FLUSH_ENTRY__MASK 0x03fffff0
+#define NV50_COMPUTE_TIC_FLUSH_ENTRY__SHIFT 4
+
+#define NV50_COMPUTE_DELAY1 0x00000284
+
+#define NV50_COMPUTE_WATCHDOG_TIMER 0x00000288
+
+#define NV50_COMPUTE_DELAY2 0x0000028c
+
+#define NV50_COMPUTE_UNK0290 0x00000290
+
+#define NV50_COMPUTE_LOCAL_ADDRESS_HIGH 0x00000294
+
+#define NV50_COMPUTE_LOCAL_ADDRESS_LOW 0x00000298
+#define NV50_COMPUTE_LOCAL_ADDRESS_LOW__ALIGN 0x00000100
+
+#define NV50_COMPUTE_LOCAL_SIZE_LOG 0x0000029c
+
+#define NV50_COMPUTE_UNK02A0 0x000002a0
+
+#define NV50_COMPUTE_CB_DEF_ADDRESS_HIGH 0x000002a4
+
+#define NV50_COMPUTE_CB_DEF_ADDRESS_LOW 0x000002a8
+
+#define NV50_COMPUTE_CB_DEF_SET 0x000002ac
+#define NV50_COMPUTE_CB_DEF_SET_SIZE__MASK 0x0000ffff
+#define NV50_COMPUTE_CB_DEF_SET_SIZE__SHIFT 0
+#define NV50_COMPUTE_CB_DEF_SET_BUFFER__MASK 0x007f0000
+#define NV50_COMPUTE_CB_DEF_SET_BUFFER__SHIFT 16
+
+#define NV50_COMPUTE_UNK02B0 0x000002b0
+
+#define NV50_COMPUTE_BLOCK_ALLOC 0x000002b4
+#define NV50_COMPUTE_BLOCK_ALLOC_THREADS__MASK 0x0000ffff
+#define NV50_COMPUTE_BLOCK_ALLOC_THREADS__SHIFT 0
+#define NV50_COMPUTE_BLOCK_ALLOC_BARRIERS__MASK 0x00ff0000
+#define NV50_COMPUTE_BLOCK_ALLOC_BARRIERS__SHIFT 16
+
+#define NV50_COMPUTE_LANES32_ENABLE 0x000002b8
+
+#define NV50_COMPUTE_UNK02BC 0x000002bc
+#define NV50_COMPUTE_UNK02BC_UNK1__MASK 0x00000007
+#define NV50_COMPUTE_UNK02BC_UNK1__SHIFT 0
+#define NV50_COMPUTE_UNK02BC_UNK2__MASK 0x00000070
+#define NV50_COMPUTE_UNK02BC_UNK2__SHIFT 4
+
+#define NV50_COMPUTE_CP_REG_ALLOC_TEMP 0x000002c0
+
+#define NV50_COMPUTE_TIC_ADDRESS_HIGH 0x000002c4
+
+#define NV50_COMPUTE_TIC_ADDRESS_LOW 0x000002c8
+
+#define NV50_COMPUTE_TIC_LIMIT 0x000002cc
+
+#define NV50_COMPUTE_MP_PM_SET(i0) (0x000002d0 + 0x4*(i0))
+#define NV50_COMPUTE_MP_PM_SET__ESIZE 0x00000004
+#define NV50_COMPUTE_MP_PM_SET__LEN 0x00000004
+
+#define NV50_COMPUTE_MP_PM_CONTROL(i0) (0x000002e0 + 0x4*(i0))
+#define NV50_COMPUTE_MP_PM_CONTROL__ESIZE 0x00000004
+#define NV50_COMPUTE_MP_PM_CONTROL__LEN 0x00000004
+#define NV50_COMPUTE_MP_PM_CONTROL_MODE__MASK 0x00000001
+#define NV50_COMPUTE_MP_PM_CONTROL_MODE__SHIFT 0
+#define NV50_COMPUTE_MP_PM_CONTROL_MODE_LOGOP 0x00000000
+#define NV50_COMPUTE_MP_PM_CONTROL_MODE_LOGOP_PULSE 0x00000001
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT__MASK 0x00000070
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT__SHIFT 4
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK0 0x00000000
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK1 0x00000010
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK2 0x00000020
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK3 0x00000030
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK4 0x00000040
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK5 0x00000050
+#define NV50_COMPUTE_MP_PM_CONTROL_FUNC__MASK 0x00ffff00
+#define NV50_COMPUTE_MP_PM_CONTROL_FUNC__SHIFT 8
+#define NV50_COMPUTE_MP_PM_CONTROL_SIG__MASK 0xff000000
+#define NV50_COMPUTE_MP_PM_CONTROL_SIG__SHIFT 24
+
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE 0x000002f0
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE_0 0x00000001
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE_1 0x00000002
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE_2 0x00000004
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE_3 0x00000008
+
+#define NV50_COMPUTE_UNK02F4 0x000002f4
+
+#define NV50_COMPUTE_BLOCKDIM_LATCH 0x000002f8
+
+#define NV50_COMPUTE_LOCAL_WARPS_LOG_ALLOC 0x000002fc
+
+#define NV50_COMPUTE_LOCAL_WARPS_NO_CLAMP 0x00000300
+
+#define NV50_COMPUTE_STACK_WARPS_LOG_ALLOC 0x00000304
+
+#define NV50_COMPUTE_STACK_WARPS_NO_CLAMP 0x00000308
+
+#define NV50_COMPUTE_UNK030C 0x0000030c
+
+#define NV50_COMPUTE_QUERY_ADDRESS_HIGH 0x00000310
+
+#define NV50_COMPUTE_QUERY_ADDRESS_LOW 0x00000314
+
+#define NV50_COMPUTE_QUERY_SEQUENCE 0x00000318
+
+#define NV50_COMPUTE_QUERY_GET 0x0000031c
+#define NV50_COMPUTE_QUERY_GET_INTR 0x00000200
+#define NV50_COMPUTE_QUERY_GET_SHORT 0x00008000
+
+#define NV50_COMPUTE_COND_ADDRESS_HIGH 0x00000320
+
+#define NV50_COMPUTE_COND_ADDRESS_LOW 0x00000324
+
+#define NV50_COMPUTE_COND_MODE 0x00000328
+#define NV50_COMPUTE_COND_MODE_NEVER 0x00000000
+#define NV50_COMPUTE_COND_MODE_ALWAYS 0x00000001
+#define NV50_COMPUTE_COND_MODE_RES_NON_ZERO 0x00000002
+#define NV50_COMPUTE_COND_MODE_EQUAL 0x00000003
+#define NV50_COMPUTE_COND_MODE_NOT_EQUAL 0x00000004
+
+#define NV50_COMPUTE_UNK032C 0x0000032c
+
+#define NV50_COMPUTE_UNK0330 0x00000330
+
+#define NV50_COMPUTE_UNK0334(i0) (0x00000334 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0334__ESIZE 0x00000004
+#define NV50_COMPUTE_UNK0334__LEN 0x00000003
+
+#define NV50_COMPUTE_UNK0340(i0) (0x00000340 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0340__ESIZE 0x00000004
+#define NV50_COMPUTE_UNK0340__LEN 0x00000002
+
+#define NV50_COMPUTE_UNK0348(i0) (0x00000348 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0348__ESIZE 0x00000004
+#define NV50_COMPUTE_UNK0348__LEN 0x00000002
+
+#define NV50_COMPUTE_UNK0350(i0) (0x00000350 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0350__ESIZE 0x00000004
+#define NV50_COMPUTE_UNK0350__LEN 0x00000002
+
+#define NV50_COMPUTE_UNK0358 0x00000358
+
+#define NV50_COMPUTE_UNK035C 0x0000035c
+
+#define NV50_COMPUTE_UNK0360 0x00000360
+#define NV50_COMPUTE_UNK0360_UNK0__MASK 0x000000f0
+#define NV50_COMPUTE_UNK0360_UNK0__SHIFT 4
+#define NV50_COMPUTE_UNK0360_UNK1__MASK 0x00000f00
+#define NV50_COMPUTE_UNK0360_UNK1__SHIFT 8
+
+#define NV50_COMPUTE_UNK0364 0x00000364
+
+#define NV50_COMPUTE_LAUNCH 0x00000368
+
+#define NV50_COMPUTE_UNK036C 0x0000036c
+
+#define NV50_COMPUTE_UNK0370 0x00000370
+
+#define NV50_COMPUTE_USER_PARAM_COUNT 0x00000374
+#define NV50_COMPUTE_USER_PARAM_COUNT_UNK0__MASK 0x000000ff
+#define NV50_COMPUTE_USER_PARAM_COUNT_UNK0__SHIFT 0
+#define NV50_COMPUTE_USER_PARAM_COUNT_COUNT__MASK 0x0000ff00
+#define NV50_COMPUTE_USER_PARAM_COUNT_COUNT__SHIFT 8
+#define NV50_COMPUTE_USER_PARAM_COUNT_COUNT__MAX 0x00000040
+
+#define NV50_COMPUTE_LINKED_TSC 0x00000378
+
+#define NV50_COMPUTE_UNK037C 0x0000037c
+#define NV50_COMPUTE_UNK037C_ALWAYS_DERIV 0x00000001
+#define NV50_COMPUTE_UNK037C_UNK16 0x00010000
+
+#define NV50_COMPUTE_CODE_CB_FLUSH 0x00000380
+
+#define NV50_COMPUTE_UNK0384 0x00000384
+
+#define NV50_COMPUTE_GRIDID 0x00000388
+
+#define NV50_COMPUTE_UNK038C(i0) (0x0000038c + 0x4*(i0))
+#define NV50_COMPUTE_UNK038C__ESIZE 0x00000004
+#define NV50_COMPUTE_UNK038C__LEN 0x00000003
+
+#define NV50_COMPUTE_WRCACHE_FLUSH 0x00000398
+
+#define NV50_COMPUTE_UNK039C(i0) (0x0000039c + 0x4*(i0))
+#define NV50_COMPUTE_UNK039C__ESIZE 0x00000004
+#define NV50_COMPUTE_UNK039C__LEN 0x00000002
+
+#define NV50_COMPUTE_GRIDDIM 0x000003a4
+#define NV50_COMPUTE_GRIDDIM_X__MASK 0x0000ffff
+#define NV50_COMPUTE_GRIDDIM_X__SHIFT 0
+#define NV50_COMPUTE_GRIDDIM_Y__MASK 0xffff0000
+#define NV50_COMPUTE_GRIDDIM_Y__SHIFT 16
+
+#define NV50_COMPUTE_SHARED_SIZE 0x000003a8
+#define NV50_COMPUTE_SHARED_SIZE__MAX 0x00004000
+#define NV50_COMPUTE_SHARED_SIZE__ALIGN 0x00000040
+
+#define NV50_COMPUTE_BLOCKDIM_XY 0x000003ac
+#define NV50_COMPUTE_BLOCKDIM_XY_X__MASK 0x0000ffff
+#define NV50_COMPUTE_BLOCKDIM_XY_X__SHIFT 0
+#define NV50_COMPUTE_BLOCKDIM_XY_Y__MASK 0xffff0000
+#define NV50_COMPUTE_BLOCKDIM_XY_Y__SHIFT 16
+
+#define NV50_COMPUTE_BLOCKDIM_Z 0x000003b0
+#define NV50_COMPUTE_BLOCKDIM_Z__MIN 0x00000001
+#define NV50_COMPUTE_BLOCKDIM_Z__MAX 0x00000040
+
+#define NV50_COMPUTE_CP_START_ID 0x000003b4
+
+#define NV50_COMPUTE_REG_MODE 0x000003b8
+#define NV50_COMPUTE_REG_MODE_PACKED 0x00000001
+#define NV50_COMPUTE_REG_MODE_STRIPED 0x00000002
+
+#define NV50_COMPUTE_TEX_LIMITS 0x000003bc
+#define NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__MASK 0x0000000f
+#define NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__SHIFT 0
+#define NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__MIN 0x00000000
+#define NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__MAX 0x00000004
+#define NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__MASK 0x000000f0
+#define NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__SHIFT 4
+#define NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__MIN 0x00000000
+#define NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__MAX 0x00000007
+
+#define NV50_COMPUTE_BIND_TSC 0x000003c0
+#define NV50_COMPUTE_BIND_TSC_VALID 0x00000001
+#define NV50_COMPUTE_BIND_TSC_SAMPLER__MASK 0x000000f0
+#define NV50_COMPUTE_BIND_TSC_SAMPLER__SHIFT 4
+#define NV50_COMPUTE_BIND_TSC_TSC__MASK 0x001ff000
+#define NV50_COMPUTE_BIND_TSC_TSC__SHIFT 12
+
+#define NV50_COMPUTE_BIND_TIC 0x000003c4
+#define NV50_COMPUTE_BIND_TIC_VALID 0x00000001
+#define NV50_COMPUTE_BIND_TIC_TEXTURE__MASK 0x000001fe
+#define NV50_COMPUTE_BIND_TIC_TEXTURE__SHIFT 1
+#define NV50_COMPUTE_BIND_TIC_TIC__MASK 0x7ffffe00
+#define NV50_COMPUTE_BIND_TIC_TIC__SHIFT 9
+
+#define NV50_COMPUTE_SET_PROGRAM_CB 0x000003c8
+#define NV50_COMPUTE_SET_PROGRAM_CB_INDEX__MASK 0x00000f00
+#define NV50_COMPUTE_SET_PROGRAM_CB_INDEX__SHIFT 8
+#define NV50_COMPUTE_SET_PROGRAM_CB_BUFFER__MASK 0x0007f000
+#define NV50_COMPUTE_SET_PROGRAM_CB_BUFFER__SHIFT 12
+#define NV50_COMPUTE_SET_PROGRAM_CB_VALID 0x000000ff
+
+#define NV50_COMPUTE_UNK03CC 0x000003cc
+
+#define NV50_COMPUTE_TEX_CACHE_CTL 0x000003d0
+#define NV50_COMPUTE_TEX_CACHE_CTL_UNK1__MASK 0x00000030
+#define NV50_COMPUTE_TEX_CACHE_CTL_UNK1__SHIFT 4
+
+#define NV50_COMPUTE_UNK03D4 0x000003d4
+
+#define NV50_COMPUTE_UNK03D8 0x000003d8
+
+#define NV50_COMPUTE_UNK03DC 0x000003dc
+
+#define NV50_COMPUTE_UNK03E0 0x000003e0
+
+#define NV50_COMPUTE_UNK03E4 0x000003e4
+
+#define NVA3_COMPUTE_TEX_MISC 0x000003e8
+#define NVA3_COMPUTE_TEX_MISC_UNK1 0x00000001
+#define NVA3_COMPUTE_TEX_MISC_SEAMLESS_CUBE_MAP 0x00000002
+
+#define NV50_COMPUTE_GLOBAL(i0) (0x00000400 + 0x20*(i0))
+#define NV50_COMPUTE_GLOBAL__ESIZE 0x00000020
+#define NV50_COMPUTE_GLOBAL__LEN 0x00000010
+
+#define NV50_COMPUTE_GLOBAL_ADDRESS_HIGH(i0) (0x00000400 + 0x20*(i0))
+
+#define NV50_COMPUTE_GLOBAL_ADDRESS_LOW(i0) (0x00000404 + 0x20*(i0))
+
+#define NV50_COMPUTE_GLOBAL_PITCH(i0) (0x00000408 + 0x20*(i0))
+#define NV50_COMPUTE_GLOBAL_PITCH__MAX 0x00800000
+#define NV50_COMPUTE_GLOBAL_PITCH__ALIGN 0x00000100
+
+#define NV50_COMPUTE_GLOBAL_LIMIT(i0) (0x0000040c + 0x20*(i0))
+
+#define NV50_COMPUTE_GLOBAL_MODE(i0) (0x00000410 + 0x20*(i0))
+#define NV50_COMPUTE_GLOBAL_MODE_LINEAR 0x00000001
+#define NV50_COMPUTE_GLOBAL_MODE_UNK1__MASK 0x000000f0
+#define NV50_COMPUTE_GLOBAL_MODE_UNK1__SHIFT 4
+#define NV50_COMPUTE_GLOBAL_MODE_TILE_MODE__MASK 0x00000f00
+#define NV50_COMPUTE_GLOBAL_MODE_TILE_MODE__SHIFT 8
+
+#define NV50_COMPUTE_USER_PARAM(i0) (0x00000600 + 0x4*(i0))
+#define NV50_COMPUTE_USER_PARAM__ESIZE 0x00000004
+#define NV50_COMPUTE_USER_PARAM__LEN 0x00000040
+
+#define NV50_COMPUTE_UNK0700(i0) (0x00000700 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0700__ESIZE 0x00000004
+#define NV50_COMPUTE_UNK0700__LEN 0x00000010
+
+
+#endif /* NV50_COMPUTE_XML */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
index 7867c2df7f3..4874b77b1e1 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -113,6 +113,7 @@ nv50_context_unreference_resources(struct nv50_context *nv50)
nouveau_bufctx_del(&nv50->bufctx_3d);
nouveau_bufctx_del(&nv50->bufctx);
+ nouveau_bufctx_del(&nv50->bufctx_cp);
util_unreference_framebuffer_state(&nv50->framebuffer);
@@ -131,6 +132,14 @@ nv50_context_unreference_resources(struct nv50_context *nv50)
if (!nv50->constbuf[s][i].user)
pipe_resource_reference(&nv50->constbuf[s][i].u.buf, NULL);
}
+
+ for (i = 0; i < nv50->global_residents.size / sizeof(struct pipe_resource *);
+ ++i) {
+ struct pipe_resource **res = util_dynarray_element(
+ &nv50->global_residents, struct pipe_resource *, i);
+ pipe_resource_reference(res, NULL);
+ }
+ util_dynarray_fini(&nv50->global_residents);
}
static void
@@ -159,9 +168,10 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
int ref)
{
struct nv50_context *nv50 = nv50_context(&ctx->pipe);
+ unsigned bind = res->bind ? res->bind : PIPE_BIND_VERTEX_BUFFER;
unsigned s, i;
- if (res->bind & PIPE_BIND_RENDER_TARGET) {
+ if (bind & PIPE_BIND_RENDER_TARGET) {
assert(nv50->framebuffer.nr_cbufs <= PIPE_MAX_COLOR_BUFS);
for (i = 0; i < nv50->framebuffer.nr_cbufs; ++i) {
if (nv50->framebuffer.cbufs[i] &&
@@ -173,7 +183,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
}
}
}
- if (res->bind & PIPE_BIND_DEPTH_STENCIL) {
+ if (bind & PIPE_BIND_DEPTH_STENCIL) {
if (nv50->framebuffer.zsbuf &&
nv50->framebuffer.zsbuf->texture == res) {
nv50->dirty |= NV50_NEW_FRAMEBUFFER;
@@ -183,11 +193,11 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
}
}
- if (res->bind & (PIPE_BIND_VERTEX_BUFFER |
- PIPE_BIND_INDEX_BUFFER |
- PIPE_BIND_CONSTANT_BUFFER |
- PIPE_BIND_STREAM_OUTPUT |
- PIPE_BIND_SAMPLER_VIEW)) {
+ if (bind & (PIPE_BIND_VERTEX_BUFFER |
+ PIPE_BIND_INDEX_BUFFER |
+ PIPE_BIND_CONSTANT_BUFFER |
+ PIPE_BIND_STREAM_OUTPUT |
+ PIPE_BIND_SAMPLER_VIEW)) {
assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS);
for (i = 0; i < nv50->num_vtxbufs; ++i) {
@@ -263,10 +273,13 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
nv50->base.pushbuf = screen->base.pushbuf;
nv50->base.client = screen->base.client;
- ret = nouveau_bufctx_new(screen->base.client, NV50_BIND_COUNT,
- &nv50->bufctx_3d);
+ ret = nouveau_bufctx_new(screen->base.client, 2, &nv50->bufctx);
+ if (!ret)
+ ret = nouveau_bufctx_new(screen->base.client, NV50_BIND_3D_COUNT,
+ &nv50->bufctx_3d);
if (!ret)
- ret = nouveau_bufctx_new(screen->base.client, 2, &nv50->bufctx);
+ ret = nouveau_bufctx_new(screen->base.client, NV50_BIND_CP_COUNT,
+ &nv50->bufctx_cp);
if (ret)
goto out_err;
@@ -290,6 +303,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
pipe->draw_vbo = nv50_draw_vbo;
pipe->clear = nv50_clear;
+ pipe->launch_grid = nv50_launch_grid;
pipe->flush = nv50_flush;
pipe->texture_barrier = nv50_texture_barrier;
@@ -335,19 +349,30 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->uniforms);
BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->txc);
BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->stack_bo);
+ if (screen->compute) {
+ BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->code);
+ BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->txc);
+ BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->stack_bo);
+ }
flags = NOUVEAU_BO_GART | NOUVEAU_BO_WR;
BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->fence.bo);
BCTX_REFN_bo(nv50->bufctx, FENCE, flags, screen->fence.bo);
+ if (screen->compute)
+ BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->fence.bo);
nv50->base.scratch.bo_size = 2 << 20;
+ util_dynarray_init(&nv50->global_residents);
+
return pipe;
out_err:
if (nv50->bufctx_3d)
nouveau_bufctx_del(&nv50->bufctx_3d);
+ if (nv50->bufctx_cp)
+ nouveau_bufctx_del(&nv50->bufctx_cp);
if (nv50->bufctx)
nouveau_bufctx_del(&nv50->bufctx);
FREE(nv50->blit);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index fb74a9748a3..2cebcd99423 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -49,6 +49,10 @@
#define NV50_NEW_MIN_SAMPLES (1 << 22)
#define NV50_NEW_CONTEXT (1 << 31)
+#define NV50_NEW_CP_PROGRAM (1 << 0)
+#define NV50_NEW_CP_GLOBALS (1 << 1)
+
+/* 3d bufctx (during draw_vbo, blit_3d) */
#define NV50_BIND_FB 0
#define NV50_BIND_VERTEX 1
#define NV50_BIND_VERTEX_TMP 2
@@ -58,7 +62,15 @@
#define NV50_BIND_SO 53
#define NV50_BIND_SCREEN 54
#define NV50_BIND_TLS 55
-#define NV50_BIND_COUNT 56
+#define NV50_BIND_3D_COUNT 56
+
+/* compute bufctx (during launch_grid) */
+#define NV50_BIND_CP_GLOBAL 0
+#define NV50_BIND_CP_SCREEN 1
+#define NV50_BIND_CP_QUERY 2
+#define NV50_BIND_CP_COUNT 3
+
+/* bufctx for other operations */
#define NV50_BIND_2D 0
#define NV50_BIND_M2MF 0
#define NV50_BIND_FENCE 1
@@ -101,8 +113,10 @@ struct nv50_context {
struct nouveau_bufctx *bufctx_3d;
struct nouveau_bufctx *bufctx;
+ struct nouveau_bufctx *bufctx_cp;
uint32_t dirty;
+ uint32_t dirty_cp; /* dirty flags for compute state */
bool cb_dirty;
struct nv50_graph_state state;
@@ -115,6 +129,7 @@ struct nv50_context {
struct nv50_program *vertprog;
struct nv50_program *gmtyprog;
struct nv50_program *fragprog;
+ struct nv50_program *compprog;
struct nv50_constbuf constbuf[3][NV50_MAX_PIPE_CONSTBUFS];
uint16_t constbuf_dirty[3];
@@ -163,6 +178,8 @@ struct nv50_context {
uint32_t cond_condmode; /* the calculated condition */
struct nv50_blitctx *blit;
+
+ struct util_dynarray global_residents;
};
static inline struct nv50_context *
@@ -302,4 +319,9 @@ struct pipe_video_buffer *
nv98_video_buffer_create(struct pipe_context *pipe,
const struct pipe_video_buffer *template);
+/* nv50_compute.c */
+void
+nv50_launch_grid(struct pipe_context *, const uint *, const uint *,
+ uint32_t, const void *);
+
#endif
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index 89e7a338283..a4b8ddfda95 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -66,7 +66,6 @@ nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info)
case TGSI_SEMANTIC_VERTEXID:
prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID;
prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID_DRAW_ARRAYS_ADD_START;
- prog->vp.vertexid = 1;
continue;
default:
break;
@@ -259,6 +258,8 @@ nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
return nv50_vertprog_assign_slots(info);
case PIPE_SHADER_FRAGMENT:
return nv50_fragprog_assign_slots(info);
+ case PIPE_SHADER_COMPUTE:
+ return 0;
default:
return -1;
}
@@ -355,6 +356,9 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
prog->gp.has_layer = 0;
prog->gp.has_viewport = 0;
+ if (prog->type == PIPE_SHADER_COMPUTE)
+ info->prop.cp.inputOffset = 0x10;
+
info->driverPriv = prog;
#ifdef DEBUG
@@ -378,6 +382,8 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
prog->tls_space = info->bin.tlsSpace;
+ prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
+
if (prog->type == PIPE_SHADER_FRAGMENT) {
if (info->prop.fp.writesDepth) {
prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z;
@@ -401,6 +407,10 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
break;
}
prog->gp.vert_count = info->prop.gp.maxVertices;
+ } else
+ if (prog->type == PIPE_SHADER_COMPUTE) {
+ prog->cp.syms = info->bin.syms;
+ prog->cp.num_syms = info->bin.numSyms;
}
if (prog->pipe.stream_output.num_outputs)
@@ -423,11 +433,13 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
struct nouveau_heap *heap;
int ret;
uint32_t size = align(prog->code_size, 0x40);
+ uint8_t prog_type;
switch (prog->type) {
case PIPE_SHADER_VERTEX: heap = nv50->screen->vp_code_heap; break;
case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break;
case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break;
+ case PIPE_SHADER_COMPUTE: heap = nv50->screen->fp_code_heap; break;
default:
assert(!"invalid program type");
return false;
@@ -450,7 +462,14 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
return false;
}
}
- prog->code_base = prog->mem->start;
+
+ if (prog->type == PIPE_SHADER_COMPUTE) {
+ /* CP code must be uploaded in FP code segment. */
+ prog_type = 1;
+ } else {
+ prog->code_base = prog->mem->start;
+ prog_type = prog->type;
+ }
ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
if (ret < 0) {
@@ -468,7 +487,7 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
false /* flatshade */);
nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
- (prog->type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
+ (prog_type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
NOUVEAU_BO_VRAM, prog->code_size, prog->code);
BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
@@ -489,7 +508,7 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
FREE(p->code);
FREE(p->fixups);
-
+ FREE(p->interps);
FREE(p->so);
memset(p, 0, sizeof(*p));
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.h b/src/gallium/drivers/nouveau/nv50/nv50_program.h
index 7a33eb11d6d..1de5122a56e 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.h
@@ -76,9 +76,9 @@ struct nv50_program {
ubyte psiz; /* output slot of point size */
ubyte bfc[2]; /* indices into varying for FFC (FP) or BFC (VP) */
ubyte edgeflag;
- ubyte vertexid;
ubyte clpd[2]; /* output slot of clip distance[i]'s 1st component */
ubyte clpd_nr;
+ bool need_vertex_id;
} vp;
struct {
@@ -98,6 +98,13 @@ struct nv50_program {
ubyte viewportid; /* hw value of viewport index output */
} gp;
+ struct {
+ uint32_t lmem_size; /* local memory (TGSI PRIVATE resource) size */
+ uint32_t smem_size; /* shared memory (TGSI LOCAL resource) size */
+ void *syms;
+ unsigned num_syms;
+ } cp;
+
void *fixups; /* relocation records */
void *interps; /* interpolation records */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_push.c b/src/gallium/drivers/nouveau/nv50/nv50_push.c
index f31eaa0e314..cbef95d07f6 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_push.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_push.c
@@ -24,6 +24,10 @@ struct push_context {
struct translate *translate;
bool primitive_restart;
+
+ bool need_vertex_id;
+ int32_t index_bias;
+
uint32_t prim;
uint32_t restart_index;
uint32_t instance_id;
@@ -74,6 +78,11 @@ emit_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)
size = ctx->vertex_words * nr;
+ if (unlikely(ctx->need_vertex_id)) {
+ BEGIN_NV04(ctx->push, NV84_3D(VERTEX_ID_BASE), 1);
+ PUSH_DATA (ctx->push, *elts + ctx->index_bias);
+ }
+
BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
ctx->translate->run_elts8(ctx->translate, elts, nr, 0, ctx->instance_id,
@@ -107,6 +116,11 @@ emit_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)
size = ctx->vertex_words * nr;
+ if (unlikely(ctx->need_vertex_id)) {
+ BEGIN_NV04(ctx->push, NV84_3D(VERTEX_ID_BASE), 1);
+ PUSH_DATA (ctx->push, *elts + ctx->index_bias);
+ }
+
BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
ctx->translate->run_elts16(ctx->translate, elts, nr, 0, ctx->instance_id,
@@ -140,6 +154,11 @@ emit_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
size = ctx->vertex_words * nr;
+ if (unlikely(ctx->need_vertex_id)) {
+ BEGIN_NV04(ctx->push, NV84_3D(VERTEX_ID_BASE), 1);
+ PUSH_DATA (ctx->push, *elts + ctx->index_bias);
+ }
+
BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
ctx->translate->run_elts(ctx->translate, elts, nr, 0, ctx->instance_id,
@@ -161,10 +180,18 @@ emit_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
static void
emit_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
{
+ uint32_t elts = 0;
+
while (count) {
unsigned push = MIN2(count, ctx->packet_vertex_limit);
unsigned size = ctx->vertex_words * push;
+ if (unlikely(ctx->need_vertex_id)) {
+ /* For non-indexed draws, gl_VertexID goes up after each vertex. */
+ BEGIN_NV04(ctx->push, NV84_3D(VERTEX_ID_BASE), 1);
+ PUSH_DATA (ctx->push, elts++);
+ }
+
BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
ctx->translate->run(ctx->translate, start, push, 0, ctx->instance_id,
@@ -216,7 +243,14 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
ctx.push = nv50->base.pushbuf;
ctx.translate = nv50->vertex->translate;
- ctx.packet_vertex_limit = nv50->vertex->packet_vertex_limit;
+
+ ctx.need_vertex_id = nv50->screen->base.class_3d >= NV84_3D_CLASS &&
+ nv50->vertprog->vp.need_vertex_id && (nv50->vertex->num_elements < 32);
+ ctx.index_bias = info->index_bias;
+
+ /* For indexed draws, gl_VertexID must be emitted for every vertex. */
+ ctx.packet_vertex_limit =
+ ctx.need_vertex_id ? 1 : nv50->vertex->packet_vertex_limit;
ctx.vertex_words = nv50->vertex->vertex_size;
assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS);
@@ -307,4 +341,10 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
ctx.instance_id++;
ctx.prim |= NV50_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT;
}
+
+ if (unlikely(ctx.need_vertex_id)) {
+ /* Reset gl_VertexID to prevent future indexed draws to be confused. */
+ BEGIN_NV04(ctx.push, NV84_3D(VERTEX_ID_BASE), 1);
+ PUSH_DATA (ctx.push, nv50->state.index_bias);
+ }
}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c
index dd9b85b7208..4cd3b615606 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c
@@ -27,6 +27,8 @@
#include "nv50/nv50_context.h"
#include "nv50/nv50_query.h"
#include "nv50/nv50_query_hw.h"
+#include "nv50/nv50_query_hw_metric.h"
+#include "nv50/nv50_query_hw_sm.h"
static struct pipe_query *
nv50_create_query(struct pipe_context *pipe, unsigned type, unsigned index)
@@ -152,4 +154,79 @@ nv50_init_query_functions(struct nv50_context *nv50)
pipe->end_query = nv50_end_query;
pipe->get_query_result = nv50_get_query_result;
pipe->render_condition = nv50_render_condition;
+ nv50->cond_condmode = NV50_3D_COND_MODE_ALWAYS;
+}
+
+int
+nv50_screen_get_driver_query_info(struct pipe_screen *pscreen,
+ unsigned id,
+ struct pipe_driver_query_info *info)
+{
+ struct nv50_screen *screen = nv50_screen(pscreen);
+ int num_hw_queries = 0;
+
+ num_hw_queries = nv50_hw_get_driver_query_info(screen, 0, NULL);
+
+ if (!info)
+ return num_hw_queries;
+
+ /* Init default values. */
+ info->name = "this_is_not_the_query_you_are_looking_for";
+ info->query_type = 0xdeadd01d;
+ info->max_value.u64 = 0;
+ info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
+ info->group_id = -1;
+ info->flags = 0;
+
+ return nv50_hw_get_driver_query_info(screen, id, info);
+}
+
+int
+nv50_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
+ unsigned id,
+ struct pipe_driver_query_group_info *info)
+{
+ struct nv50_screen *screen = nv50_screen(pscreen);
+ int count = 0;
+
+ if (screen->compute)
+ if (screen->base.class_3d >= NV84_3D_CLASS)
+ count += 2;
+
+ if (!info)
+ return count;
+
+ if (id == NV50_HW_SM_QUERY_GROUP) {
+ if (screen->compute) {
+ if (screen->base.class_3d >= NV84_3D_CLASS) {
+ info->name = "MP counters";
+
+ /* Because we can't expose the number of hardware counters needed
+ * for each different query, we don't want to allow more than one
+ * active query simultaneously to avoid failure when the maximum
+ * number of counters is reached. Note that these groups of GPU
+ * counters are currently only used by AMD_performance_monitor.
+ */
+ info->max_active_queries = 1;
+ info->num_queries = NV50_HW_SM_QUERY_COUNT;
+ return 1;
+ }
+ }
+ } else
+ if (id == NV50_HW_METRIC_QUERY_GROUP) {
+ if (screen->compute) {
+ if (screen->base.class_3d >= NV84_3D_CLASS) {
+ info->name = "Performance metrics";
+ info->max_active_queries = 1;
+ info->num_queries = NV50_HW_METRIC_QUERY_COUNT;
+ return 1;
+ }
+ }
+ }
+
+ /* user asked for info about non-existing query group */
+ info->name = "this_is_not_the_query_group_you_are_looking_for";
+ info->max_active_queries = 0;
+ info->num_queries = 0;
+ return 0;
}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.h b/src/gallium/drivers/nouveau/nv50/nv50_query.h
index d990285c857..bd4c0a386f6 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query.h
@@ -28,6 +28,12 @@ nv50_query(struct pipe_query *pipe)
return (struct nv50_query *)pipe;
}
+/*
+ * Driver queries groups:
+ */
+#define NV50_HW_SM_QUERY_GROUP 0
+#define NV50_HW_METRIC_QUERY_GROUP 1
+
void nv50_init_query_functions(struct nv50_context *);
#endif
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
index 945ce7abe50..b6ebbbf1010 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
@@ -25,6 +25,8 @@
#include "nv50/nv50_context.h"
#include "nv50/nv50_query_hw.h"
+#include "nv50/nv50_query_hw_metric.h"
+#include "nv50/nv50_query_hw_sm.h"
#include "nv_object.xml.h"
#define NV50_HW_QUERY_STATE_READY 0
@@ -41,7 +43,7 @@
#define NV50_HW_QUERY_ALLOC_SPACE 256
-static bool
+bool
nv50_hw_query_allocate(struct nv50_context *nv50, struct nv50_query *q,
int size)
{
@@ -122,6 +124,9 @@ nv50_hw_begin_query(struct nv50_context *nv50, struct nv50_query *q)
struct nouveau_pushbuf *push = nv50->base.pushbuf;
struct nv50_hw_query *hq = nv50_hw_query(q);
+ if (hq->funcs && hq->funcs->begin_query)
+ return hq->funcs->begin_query(nv50, hq);
+
/* For occlusion queries we have to change the storage, because a previous
* query might set the initial render condition to false even *after* we re-
* initialized it to true.
@@ -193,6 +198,11 @@ nv50_hw_end_query(struct nv50_context *nv50, struct nv50_query *q)
struct nouveau_pushbuf *push = nv50->base.pushbuf;
struct nv50_hw_query *hq = nv50_hw_query(q);
+ if (hq->funcs && hq->funcs->end_query) {
+ hq->funcs->end_query(nv50, hq);
+ return;
+ }
+
hq->state = NV50_HW_QUERY_STATE_ENDED;
switch (q->type) {
@@ -261,6 +271,9 @@ nv50_hw_get_query_result(struct nv50_context *nv50, struct nv50_query *q,
uint64_t *data64 = (uint64_t *)hq->data;
int i;
+ if (hq->funcs && hq->funcs->get_query_result)
+ return hq->funcs->get_query_result(nv50, hq, wait, result);
+
if (hq->state != NV50_HW_QUERY_STATE_READY)
nv50_hw_query_update(q);
@@ -331,6 +344,18 @@ nv50_hw_create_query(struct nv50_context *nv50, unsigned type, unsigned index)
struct nv50_hw_query *hq;
struct nv50_query *q;
+ hq = nv50_hw_sm_create_query(nv50, type);
+ if (hq) {
+ hq->base.funcs = &hw_query_funcs;
+ return (struct nv50_query *)hq;
+ }
+
+ hq = nv50_hw_metric_create_query(nv50, type);
+ if (hq) {
+ hq->base.funcs = &hw_query_funcs;
+ return (struct nv50_query *)hq;
+ }
+
hq = CALLOC_STRUCT(nv50_hw_query);
if (!hq)
return NULL;
@@ -375,6 +400,26 @@ nv50_hw_create_query(struct nv50_context *nv50, unsigned type, unsigned index)
return q;
}
+int
+nv50_hw_get_driver_query_info(struct nv50_screen *screen, unsigned id,
+ struct pipe_driver_query_info *info)
+{
+ int num_hw_sm_queries = 0, num_hw_metric_queries = 0;
+
+ num_hw_sm_queries = nv50_hw_sm_get_driver_query_info(screen, 0, NULL);
+ num_hw_metric_queries =
+ nv50_hw_metric_get_driver_query_info(screen, 0, NULL);
+
+ if (!info)
+ return num_hw_sm_queries + num_hw_metric_queries;
+
+ if (id < num_hw_sm_queries)
+ return nv50_hw_sm_get_driver_query_info(screen, id, info);
+
+ return nv50_hw_metric_get_driver_query_info(screen,
+ id - num_hw_sm_queries, info);
+}
+
void
nv50_hw_query_pushbuf_submit(struct nouveau_pushbuf *push, uint16_t method,
struct nv50_query *q, unsigned result_offset)
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h
index 294c67de9a4..82ec6bd2d96 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h
@@ -8,8 +8,19 @@
#define NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
+struct nv50_hw_query;
+
+struct nv50_hw_query_funcs {
+ void (*destroy_query)(struct nv50_context *, struct nv50_hw_query *);
+ boolean (*begin_query)(struct nv50_context *, struct nv50_hw_query *);
+ void (*end_query)(struct nv50_context *, struct nv50_hw_query *);
+ boolean (*get_query_result)(struct nv50_context *, struct nv50_hw_query *,
+ boolean, union pipe_query_result *);
+};
+
struct nv50_hw_query {
struct nv50_query base;
+ const struct nv50_hw_query_funcs *funcs;
uint32_t *data;
uint32_t sequence;
struct nouveau_bo *bo;
@@ -31,6 +42,11 @@ nv50_hw_query(struct nv50_query *q)
struct nv50_query *
nv50_hw_create_query(struct nv50_context *, unsigned, unsigned);
+int
+nv50_hw_get_driver_query_info(struct nv50_screen *, unsigned,
+ struct pipe_driver_query_info *);
+bool
+nv50_hw_query_allocate(struct nv50_context *, struct nv50_query *, int);
void
nv50_hw_query_pushbuf_submit(struct nouveau_pushbuf *, uint16_t,
struct nv50_query *, unsigned);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c
new file mode 100644
index 00000000000..d1bccb94193
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c
@@ -0,0 +1,207 @@
+/*
+ * Copyright 2015 Samuel Pitoiset
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nv50/nv50_context.h"
+#include "nv50/nv50_query_hw_metric.h"
+#include "nv50/nv50_query_hw_sm.h"
+
+/* === PERFORMANCE MONITORING METRICS for NV84+ === */
+static const char *nv50_hw_metric_names[] =
+{
+ "metric-branch_efficiency",
+};
+
+struct nv50_hw_metric_query_cfg {
+ uint32_t queries[4];
+ uint32_t num_queries;
+};
+
+#define _SM(n) NV50_HW_SM_QUERY(NV50_HW_SM_QUERY_ ##n)
+#define _M(n, c) [NV50_HW_METRIC_QUERY_##n] = c
+
+/* ==== Compute capability 1.1 (G84+) ==== */
+static const struct nv50_hw_metric_query_cfg
+sm11_branch_efficiency =
+{
+ .queries[0] = _SM(BRANCH),
+ .queries[1] = _SM(DIVERGENT_BRANCH),
+ .num_queries = 2,
+};
+
+static const struct nv50_hw_metric_query_cfg *sm11_hw_metric_queries[] =
+{
+ _M(BRANCH_EFFICIENCY, &sm11_branch_efficiency),
+};
+
+#undef _SM
+#undef _M
+
+static const struct nv50_hw_metric_query_cfg *
+nv50_hw_metric_query_get_cfg(struct nv50_context *nv50,
+ struct nv50_hw_query *hq)
+{
+ struct nv50_query *q = &hq->base;
+ return sm11_hw_metric_queries[q->type - NV50_HW_METRIC_QUERY(0)];
+}
+
+static void
+nv50_hw_metric_destroy_query(struct nv50_context *nv50,
+ struct nv50_hw_query *hq)
+{
+ struct nv50_hw_metric_query *hmq = nv50_hw_metric_query(hq);
+ unsigned i;
+
+ for (i = 0; i < hmq->num_queries; i++)
+ hmq->queries[i]->funcs->destroy_query(nv50, hmq->queries[i]);
+ FREE(hmq);
+}
+
+static boolean
+nv50_hw_metric_begin_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
+{
+ struct nv50_hw_metric_query *hmq = nv50_hw_metric_query(hq);
+ boolean ret = false;
+ unsigned i;
+
+ for (i = 0; i < hmq->num_queries; i++) {
+ ret = hmq->queries[i]->funcs->begin_query(nv50, hmq->queries[i]);
+ if (!ret)
+ return ret;
+ }
+ return ret;
+}
+
+static void
+nv50_hw_metric_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
+{
+ struct nv50_hw_metric_query *hmq = nv50_hw_metric_query(hq);
+ unsigned i;
+
+ for (i = 0; i < hmq->num_queries; i++)
+ hmq->queries[i]->funcs->end_query(nv50, hmq->queries[i]);
+}
+
+static uint64_t
+sm11_hw_metric_calc_result(struct nv50_hw_query *hq, uint64_t res64[8])
+{
+ switch (hq->base.type - NV50_HW_METRIC_QUERY(0)) {
+ case NV50_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
+ /* (branch / (branch + divergent_branch)) * 100 */
+ if (res64[0] + res64[1])
+ return (res64[0] / (double)(res64[0] + res64[1])) * 100;
+ break;
+ default:
+ debug_printf("invalid metric type: %d\n",
+ hq->base.type - NV50_HW_METRIC_QUERY(0));
+ break;
+ }
+ return 0;
+}
+
+static boolean
+nv50_hw_metric_get_query_result(struct nv50_context *nv50,
+ struct nv50_hw_query *hq, boolean wait,
+ union pipe_query_result *result)
+{
+ struct nv50_hw_metric_query *hmq = nv50_hw_metric_query(hq);
+ union pipe_query_result results[4] = {};
+ uint64_t res64[4] = {};
+ boolean ret = false;
+ unsigned i;
+
+ for (i = 0; i < hmq->num_queries; i++) {
+ ret = hmq->queries[i]->funcs->get_query_result(nv50, hmq->queries[i],
+ wait, &results[i]);
+ if (!ret)
+ return ret;
+ res64[i] = *(uint64_t *)&results[i];
+ }
+
+ *(uint64_t *)result = sm11_hw_metric_calc_result(hq, res64);
+ return ret;
+}
+
/* Vtable plugging metric queries into the generic HW query machinery. */
static const struct nv50_hw_query_funcs hw_metric_query_funcs = {
   .destroy_query = nv50_hw_metric_destroy_query,
   .begin_query = nv50_hw_metric_begin_query,
   .end_query = nv50_hw_metric_end_query,
   .get_query_result = nv50_hw_metric_get_query_result,
};
+
+struct nv50_hw_query *
+nv50_hw_metric_create_query(struct nv50_context *nv50, unsigned type)
+{
+ const struct nv50_hw_metric_query_cfg *cfg;
+ struct nv50_hw_metric_query *hmq;
+ struct nv50_hw_query *hq;
+ unsigned i;
+
+ if (type < NV50_HW_METRIC_QUERY(0) || type > NV50_HW_METRIC_QUERY_LAST)
+ return NULL;
+
+ hmq = CALLOC_STRUCT(nv50_hw_metric_query);
+ if (!hmq)
+ return NULL;
+
+ hq = &hmq->base;
+ hq->funcs = &hw_metric_query_funcs;
+ hq->base.type = type;
+
+ cfg = nv50_hw_metric_query_get_cfg(nv50, hq);
+
+ for (i = 0; i < cfg->num_queries; i++) {
+ hmq->queries[i] = nv50_hw_sm_create_query(nv50, cfg->queries[i]);
+ if (!hmq->queries[i]) {
+ nv50_hw_metric_destroy_query(nv50, hq);
+ return NULL;
+ }
+ hmq->num_queries++;
+ }
+
+ return hq;
+}
+
+int
+nv50_hw_metric_get_driver_query_info(struct nv50_screen *screen, unsigned id,
+ struct pipe_driver_query_info *info)
+{
+ int count = 0;
+
+ if (screen->compute)
+ if (screen->base.class_3d >= NV84_3D_CLASS)
+ count += NV50_HW_METRIC_QUERY_COUNT;
+
+ if (!info)
+ return count;
+
+ if (id < count) {
+ if (screen->compute) {
+ if (screen->base.class_3d >= NV84_3D_CLASS) {
+ info->name = nv50_hw_metric_names[id];
+ info->query_type = NV50_HW_METRIC_QUERY(id);
+ info->group_id = NV50_HW_METRIC_QUERY_GROUP;
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.h b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.h
new file mode 100644
index 00000000000..f8cfc04084f
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.h
@@ -0,0 +1,34 @@
+#ifndef __NV50_QUERY_HW_METRIC_H__
+#define __NV50_QUERY_HW_METRIC_H__
+
+#include "nv50_query_hw.h"
+
/* A metric query aggregates the results of several HW SM queries. */
struct nv50_hw_metric_query {
   struct nv50_hw_query base;
   struct nv50_hw_query *queries[4]; /* underlying SM counter queries */
   unsigned num_queries;             /* valid entries in queries[] */
};

/* Downcast helper (valid because base is the first member). */
static inline struct nv50_hw_metric_query *
nv50_hw_metric_query(struct nv50_hw_query *hq)
{
   return (struct nv50_hw_metric_query *)hq;
}

/*
 * Driver metrics queries:
 */
/* Metric types live in a driver-specific range offset by 1024 so they
 * cannot collide with the HW SM query types. */
#define NV50_HW_METRIC_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 1024 + (i))
#define NV50_HW_METRIC_QUERY_LAST NV50_HW_METRIC_QUERY(NV50_HW_METRIC_QUERY_COUNT - 1)
enum nv50_hw_metric_queries
{
   NV50_HW_METRIC_QUERY_BRANCH_EFFICIENCY = 0,
   NV50_HW_METRIC_QUERY_COUNT
};
+
+struct nv50_hw_query *
+nv50_hw_metric_create_query(struct nv50_context *, unsigned);
+int
+nv50_hw_metric_get_driver_query_info(struct nv50_screen *, unsigned,
+ struct pipe_driver_query_info *);
+#endif
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
new file mode 100644
index 00000000000..8453ce76095
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
@@ -0,0 +1,417 @@
+/*
+ * Copyright 2015 Samuel Pitoiset
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#define NV50_PUSH_EXPLICIT_SPACE_CHECKING
+
+#include "nv50/nv50_context.h"
+#include "nv50/nv50_query_hw_sm.h"
+
+#include "nv_object.xml.h"
+#include "nv50/nv50_compute.xml.h"
+
/* === PERFORMANCE MONITORING COUNTERS for NV84+ === */

/* NOTE: intentionally using the same names as NV */
/* Indices match enum nv50_hw_sm_queries. */
static const char *nv50_hw_sm_query_names[] =
{
   "branch",
   "divergent_branch",
   "instructions",
   "prof_trigger_00",
   "prof_trigger_01",
   "prof_trigger_02",
   "prof_trigger_03",
   "prof_trigger_04",
   "prof_trigger_05",
   "prof_trigger_06",
   "prof_trigger_07",
   "sm_cta_launched",
   "warp_serialize",
};

/* Pre-assembled compute kernel that copies $pm0..$pm3 plus the sequence
 * number (input word 1) into the query buffer; one 0x14-byte record per
 * MP, selected via $physid.  Launched by nv50_hw_sm_end_query(). */
static const uint64_t nv50_read_hw_sm_counters_code[] =
{
   /* and b32 $r0 $r0 0x0000ffff
    * add b32 $c0 $r0 $r0 $r0
    * (lg $c0) ret
    * mov $r0 $pm0
    * mov $r1 $pm1
    * mov $r2 $pm2
    * mov $r3 $pm3
    * mov $r4 $physid
    * ld $r5 b32 s[0x10]
    * ld $r6 b32 s[0x14]
    * and b32 $r4 $r4 0x000f0000
    * shr u32 $r4 $r4 0x10
    * mul $r4 u24 $r4 0x14
    * add b32 $r5 $r5 $r4
    * st b32 g15[$r5] $r0
    * add b32 $r5 $r5 0x04
    * st b32 g15[$r5] $r1
    * add b32 $r5 $r5 0x04
    * st b32 g15[$r5] $r2
    * add b32 $r5 $r5 0x04
    * st b32 g15[$r5] $r3
    * add b32 $r5 $r5 0x04
    * exit st b32 g15[$r5] $r6 */
   0x00000fffd03f0001ULL,
   0x040007c020000001ULL,
   0x0000028030000003ULL,
   0x6001078000000001ULL,
   0x6001478000000005ULL,
   0x6001878000000009ULL,
   0x6001c7800000000dULL,
   0x6000078000000011ULL,
   0x4400c78010000815ULL,
   0x4400c78010000a19ULL,
   0x0000f003d0000811ULL,
   0xe410078030100811ULL,
   0x0000000340540811ULL,
   0x0401078020000a15ULL,
   0xa0c00780d00f0a01ULL,
   0x0000000320048a15ULL,
   0xa0c00780d00f0a05ULL,
   0x0000000320048a15ULL,
   0xa0c00780d00f0a09ULL,
   0x0000000320048a15ULL,
   0xa0c00780d00f0a0dULL,
   0x0000000320048a15ULL,
   0xa0c00781d00f0a19ULL,
};

/* Hardware configuration of one MP_PM_CONTROL counter slot. */
struct nv50_hw_sm_counter_cfg
{
   uint32_t mode : 4; /* LOGOP, LOGOP_PULSE */
   uint32_t unit : 8; /* UNK[0-5] */
   uint32_t sig : 8; /* signal selection */
};

/* Full configuration for one SM query (up to 4 counters). */
struct nv50_hw_sm_query_cfg
{
   struct nv50_hw_sm_counter_cfg ctr[4];
   uint8_t num_counters;
};

#define _Q(n, m, u, s) [NV50_HW_SM_QUERY_##n] = { { { NV50_COMPUTE_MP_PM_CONTROL_MODE_##m, NV50_COMPUTE_MP_PM_CONTROL_UNIT_##u, s, }, {}, {}, {} }, 1 }

/* ==== Compute capability 1.1 (G84+) ==== */
static const struct nv50_hw_sm_query_cfg sm11_hw_sm_queries[] =
{
   _Q(BRANCH,           LOGOP, UNK4, 0x02),
   _Q(DIVERGENT_BRANCH, LOGOP, UNK4, 0x09),
   _Q(INSTRUCTIONS,     LOGOP, UNK4, 0x04),
   _Q(PROF_TRIGGER_0,   LOGOP, UNK1, 0x26),
   _Q(PROF_TRIGGER_1,   LOGOP, UNK1, 0x27),
   _Q(PROF_TRIGGER_2,   LOGOP, UNK1, 0x28),
   _Q(PROF_TRIGGER_3,   LOGOP, UNK1, 0x29),
   _Q(PROF_TRIGGER_4,   LOGOP, UNK1, 0x2a),
   _Q(PROF_TRIGGER_5,   LOGOP, UNK1, 0x2b),
   _Q(PROF_TRIGGER_6,   LOGOP, UNK1, 0x2c),
   _Q(PROF_TRIGGER_7,   LOGOP, UNK1, 0x2d),
   _Q(SM_CTA_LAUNCHED,  LOGOP, UNK1, 0x33),
   _Q(WARP_SERIALIZE,   LOGOP, UNK0, 0x0b),
};
+
/* Return the LOGOP function mask that routes counter slot 'slot' into
 * the aggregated result (one bit-select pattern per slot); 0 for an
 * invalid slot. */
static inline uint16_t nv50_hw_sm_get_func(uint8_t slot)
{
   static const uint16_t func[] = { 0xaaaa, 0xcccc, 0xf0f0, 0xff00 };
   return slot < 4 ? func[slot] : 0;
}
+
+static const struct nv50_hw_sm_query_cfg *
+nv50_hw_sm_query_get_cfg(struct nv50_context *nv50, struct nv50_hw_query *hq)
+{
+ struct nv50_query *q = &hq->base;
+ return &sm11_hw_sm_queries[q->type - NV50_HW_SM_QUERY(0)];
+}
+
+static void
+nv50_hw_sm_destroy_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
+{
+ struct nv50_query *q = &hq->base;
+ q->funcs->destroy_query(nv50, q);
+}
+
+static boolean
+nv50_hw_sm_begin_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
+{
+ struct nv50_screen *screen = nv50->screen;
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ struct nv50_hw_sm_query *hsq = nv50_hw_sm_query(hq);
+ const struct nv50_hw_sm_query_cfg *cfg;
+ uint16_t func;
+ int i, c;
+
+ cfg = nv50_hw_sm_query_get_cfg(nv50, hq);
+
+ /* check if we have enough free counter slots */
+ if (screen->pm.num_hw_sm_active + cfg->num_counters > 4) {
+ NOUVEAU_ERR("Not enough free MP counter slots !\n");
+ return false;
+ }
+
+ assert(cfg->num_counters <= 4);
+ PUSH_SPACE(push, 4 * 4);
+
+ /* set sequence field to 0 (used to check if result is available) */
+ for (i = 0; i < screen->MPsInTP; ++i) {
+ const unsigned b = (0x14 / 4) * i;
+ hq->data[b + 16] = 0;
+ }
+ hq->sequence++;
+
+ for (i = 0; i < cfg->num_counters; i++) {
+ screen->pm.num_hw_sm_active++;
+
+ /* find free counter slots */
+ for (c = 0; c < 4; ++c) {
+ if (!screen->pm.mp_counter[c]) {
+ hsq->ctr[i] = c;
+ screen->pm.mp_counter[c] = hsq;
+ break;
+ }
+ }
+
+ /* select func to aggregate counters */
+ func = nv50_hw_sm_get_func(c);
+
+ /* configure and reset the counter(s) */
+ BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(c)), 1);
+ PUSH_DATA (push, (cfg->ctr[i].sig << 24) | (func << 8)
+ | cfg->ctr[i].unit | cfg->ctr[i].mode);
+ BEGIN_NV04(push, NV50_COMPUTE(MP_PM_SET(c)), 1);
+ PUSH_DATA (push, 0);
+ }
+ return true;
+}
+
/* Stop counting and trigger readback of the SM counters.
 *
 * Disables all active counter slots, releases the slots owned by this
 * query, launches the small readback compute kernel (one 32-thread block
 * per MP) to copy $pm0..$pm3 plus the sequence number into the query
 * buffer, and finally re-programs the slots still owned by other active
 * queries.
 */
static void
nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
{
   struct nv50_screen *screen = nv50->screen;
   struct pipe_context *pipe = &nv50->base.pipe;
   struct nouveau_pushbuf *push = nv50->base.pushbuf;
   struct nv50_hw_sm_query *hsq = nv50_hw_sm_query(hq);
   uint32_t mask;
   uint32_t input[3];
   const uint block[3] = { 32, 1, 1 };
   const uint grid[3] = { screen->MPsInTP, screen->TPs, 1 };
   int c;

   /* lazily build the readback program on first use; its code points at
    * the static nv50_read_hw_sm_counters_code[] array */
   if (unlikely(!screen->pm.prog)) {
      struct nv50_program *prog = CALLOC_STRUCT(nv50_program);
      prog->type = PIPE_SHADER_COMPUTE;
      prog->translated = true;
      prog->max_gpr = 7;
      prog->parm_size = 8;
      prog->code = (uint32_t *)nv50_read_hw_sm_counters_code;
      prog->code_size = sizeof(nv50_read_hw_sm_counters_code);
      screen->pm.prog = prog;
   }

   /* disable all counting */
   PUSH_SPACE(push, 8);
   for (c = 0; c < 4; c++) {
      if (screen->pm.mp_counter[c]) {
         BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(c)), 1);
         PUSH_DATA (push, 0);
      }
   }

   /* release counters for this query */
   for (c = 0; c < 4; c++) {
      if (screen->pm.mp_counter[c] == hsq) {
         screen->pm.num_hw_sm_active--;
         screen->pm.mp_counter[c] = NULL;
      }
   }

   BCTX_REFN_bo(nv50->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
                hq->bo);

   PUSH_SPACE(push, 2);
   BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
   PUSH_DATA (push, 0);

   /* kernel inputs: destination address (low 32 bits of the query BO)
    * and the sequence number the readers will wait for */
   pipe->bind_compute_state(pipe, screen->pm.prog);
   input[0] = hq->bo->offset + hq->base_offset;
   input[1] = hq->sequence;
   pipe->launch_grid(pipe, block, grid, 0, input);

   nouveau_bufctx_reset(nv50->bufctx_cp, NV50_BIND_CP_QUERY);

   /* re-activate the counters of the other still-active queries */
   PUSH_SPACE(push, 8);
   mask = 0;
   for (c = 0; c < 4; c++) {
      const struct nv50_hw_sm_query_cfg *cfg;
      unsigned i;

      hsq = screen->pm.mp_counter[c];
      if (!hsq)
         continue;

      cfg = nv50_hw_sm_query_get_cfg(nv50, &hsq->base);
      for (i = 0; i < cfg->num_counters; i++) {
         uint16_t func;

         /* each slot only needs to be re-programmed once */
         if (mask & (1 << hsq->ctr[i]))
            break;

         mask |= 1 << hsq->ctr[i];
         func = nv50_hw_sm_get_func(hsq->ctr[i]);

         BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(hsq->ctr[i])), 1);
         PUSH_DATA (push, (cfg->ctr[i].sig << 24) | (func << 8)
                    | cfg->ctr[i].unit | cfg->ctr[i].mode);
      }
   }
}
+
/* Read back the raw counter values for every MP into count[][].
 *
 * Each MP owns a 5-word record in hq->data: words 0-3 are the counters,
 * word 4 is the sequence number written by the readback kernel.  The
 * sequence is polled to detect result availability; with wait set, the
 * BO is waited on once before giving up.  Returns false if the results
 * are not available (or the wait failed).
 */
static inline bool
nv50_hw_sm_query_read_data(uint32_t count[32][4],
                           struct nv50_context *nv50, bool wait,
                           struct nv50_hw_query *hq,
                           const struct nv50_hw_sm_query_cfg *cfg,
                           unsigned mp_count)
{
   struct nv50_hw_sm_query *hsq = nv50_hw_sm_query(hq);
   unsigned p, c;

   for (p = 0; p < mp_count; ++p) {
      const unsigned b = (0x14 / 4) * p; /* 5 words per MP record */

      for (c = 0; c < cfg->num_counters; ++c) {
         /* word b+4 is the per-MP sequence written by the kernel */
         if (hq->data[b + 4] != hq->sequence) {
            if (!wait)
               return false;
            if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nv50->base.client))
               return false;
         }
         /* hsq->ctr[c] selects which of the 4 counter words to read */
         count[p][c] = hq->data[b + hsq->ctr[c]];
      }
   }
   return true;
}
+
+static boolean
+nv50_hw_sm_get_query_result(struct nv50_context *nv50, struct nv50_hw_query *hq,
+ boolean wait, union pipe_query_result *result)
+{
+ uint32_t count[32][4];
+ uint64_t value = 0;
+ unsigned mp_count = MIN2(nv50->screen->MPsInTP, 32);
+ unsigned p, c;
+ const struct nv50_hw_sm_query_cfg *cfg;
+ bool ret;
+
+ cfg = nv50_hw_sm_query_get_cfg(nv50, hq);
+
+ ret = nv50_hw_sm_query_read_data(count, nv50, wait, hq, cfg, mp_count);
+ if (!ret)
+ return false;
+
+ for (c = 0; c < cfg->num_counters; ++c)
+ for (p = 0; p < mp_count; ++p)
+ value += count[p][c];
+
+ /* We only count a single TP, and simply multiply by the total number of
+ * TPs to compute result over all TPs. This is inaccurate, but enough! */
+ value *= nv50->screen->TPs;
+
+ *(uint64_t *)result = value;
+ return true;
+}
+
/* Vtable plugging SM counter queries into the generic HW query machinery. */
static const struct nv50_hw_query_funcs hw_sm_query_funcs = {
   .destroy_query = nv50_hw_sm_destroy_query,
   .begin_query = nv50_hw_sm_begin_query,
   .end_query = nv50_hw_sm_end_query,
   .get_query_result = nv50_hw_sm_get_query_result,
};
+
+struct nv50_hw_query *
+nv50_hw_sm_create_query(struct nv50_context *nv50, unsigned type)
+{
+ struct nv50_hw_sm_query *hsq;
+ struct nv50_hw_query *hq;
+ unsigned space;
+
+ if (type < NV50_HW_SM_QUERY(0) || type > NV50_HW_SM_QUERY_LAST)
+ return NULL;
+
+ hsq = CALLOC_STRUCT(nv50_hw_sm_query);
+ if (!hsq)
+ return NULL;
+
+ hq = &hsq->base;
+ hq->funcs = &hw_sm_query_funcs;
+ hq->base.type = type;
+
+ /*
+ * for each MP:
+ * [00] = MP.C0
+ * [04] = MP.C1
+ * [08] = MP.C2
+ * [0c] = MP.C3
+ * [10] = MP.sequence
+ */
+ space = (4 + 1) * nv50->screen->MPsInTP * sizeof(uint32_t);
+
+ if (!nv50_hw_query_allocate(nv50, &hq->base, space)) {
+ FREE(hq);
+ return NULL;
+ }
+
+ return hq;
+}
+
+int
+nv50_hw_sm_get_driver_query_info(struct nv50_screen *screen, unsigned id,
+ struct pipe_driver_query_info *info)
+{
+ int count = 0;
+
+ if (screen->compute)
+ if (screen->base.class_3d >= NV84_3D_CLASS)
+ count += NV50_HW_SM_QUERY_COUNT;
+
+ if (!info)
+ return count;
+
+ if (id < count) {
+ if (screen->compute) {
+ if (screen->base.class_3d >= NV84_3D_CLASS) {
+ info->name = nv50_hw_sm_query_names[id];
+ info->query_type = NV50_HW_SM_QUERY(id);
+ info->group_id = NV50_HW_SM_QUERY_GROUP;
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.h b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.h
new file mode 100644
index 00000000000..c1a1cd175e3
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.h
@@ -0,0 +1,45 @@
+#ifndef __NV50_QUERY_HW_SM_H__
+#define __NV50_QUERY_HW_SM_H__
+
+#include "nv50_query_hw.h"
+
/* An SM (multiprocessor) performance counter query; ctr[] records which
 * of the 4 global MP counter slots each configured counter occupies. */
struct nv50_hw_sm_query {
   struct nv50_hw_query base;
   uint8_t ctr[4]; /* counter slot index per configured counter */
};

/* Downcast helper (valid because base is the first member). */
static inline struct nv50_hw_sm_query *
nv50_hw_sm_query(struct nv50_hw_query *hq)
{
   return (struct nv50_hw_sm_query *)hq;
}

/*
 * Performance counter queries:
 */
#define NV50_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i))
#define NV50_HW_SM_QUERY_LAST NV50_HW_SM_QUERY(NV50_HW_SM_QUERY_COUNT - 1)
enum nv50_hw_sm_queries
{
   NV50_HW_SM_QUERY_BRANCH = 0,
   NV50_HW_SM_QUERY_DIVERGENT_BRANCH,
   NV50_HW_SM_QUERY_INSTRUCTIONS,
   NV50_HW_SM_QUERY_PROF_TRIGGER_0,
   NV50_HW_SM_QUERY_PROF_TRIGGER_1,
   NV50_HW_SM_QUERY_PROF_TRIGGER_2,
   NV50_HW_SM_QUERY_PROF_TRIGGER_3,
   NV50_HW_SM_QUERY_PROF_TRIGGER_4,
   NV50_HW_SM_QUERY_PROF_TRIGGER_5,
   NV50_HW_SM_QUERY_PROF_TRIGGER_6,
   NV50_HW_SM_QUERY_PROF_TRIGGER_7,
   NV50_HW_SM_QUERY_SM_CTA_LAUNCHED,
   NV50_HW_SM_QUERY_WARP_SERIALIZE,
   NV50_HW_SM_QUERY_COUNT,
};
+
+struct nv50_hw_query *
+nv50_hw_sm_create_query(struct nv50_context *, unsigned);
+int
+nv50_hw_sm_get_driver_query_info(struct nv50_screen *, unsigned,
+ struct pipe_driver_query_info *);
+#endif
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index f47e998ab1e..1e4b75f18e0 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -41,8 +41,6 @@
#define THREADS_IN_WARP 32
-#define ONE_TEMP_SIZE (4/*vector*/ * sizeof(float))
-
static boolean
nv50_screen_is_format_supported(struct pipe_screen *pscreen,
enum pipe_format format,
@@ -183,6 +181,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
case PIPE_CAP_SHAREABLE_SHADERS:
case PIPE_CAP_CLEAR_TEXTURE:
+ case PIPE_CAP_COMPUTE:
return 1;
case PIPE_CAP_SEAMLESS_CUBE_MAP:
return 1; /* class_3d >= NVA0_3D_CLASS; */
@@ -212,7 +211,6 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_FAKE_SW_MSAA:
case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
- case PIPE_CAP_COMPUTE:
case PIPE_CAP_DRAW_INDIRECT:
case PIPE_CAP_VERTEXID_NOBASE:
case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: /* potentially supported on some hw */
@@ -251,6 +249,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
case PIPE_SHADER_VERTEX:
case PIPE_SHADER_GEOMETRY:
case PIPE_SHADER_FRAGMENT:
+ case PIPE_SHADER_COMPUTE:
break;
default:
return 0;
@@ -336,6 +335,52 @@ nv50_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
return 0.0f;
}
/* Report a compute capability.
 * Copies the value into *data (when non-NULL) and returns its size in
 * bytes; returns 0 for unknown caps. */
static int
nv50_screen_get_compute_param(struct pipe_screen *pscreen,
                              enum pipe_compute_cap param, void *data)
{
   struct nv50_screen *screen = nv50_screen(pscreen);

/* copy the cap value out (if requested) and return its size */
#define RET(x) do { \
   if (data) \
      memcpy(data, x, sizeof(x)); \
   return sizeof(x); \
} while (0)

   switch (param) {
   case PIPE_COMPUTE_CAP_GRID_DIMENSION:
      RET((uint64_t []) { 2 });
   case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
      RET(((uint64_t []) { 65535, 65535 }));
   case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
      RET(((uint64_t []) { 512, 512, 64 }));
   case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
      RET((uint64_t []) { 512 });
   case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: /* g0-15[] */
      RET((uint64_t []) { 1ULL << 32 });
   case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: /* s[] */
      RET((uint64_t []) { 16 << 10 });
   case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: /* l[] */
      RET((uint64_t []) { 16 << 10 });
   case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: /* c[], arbitrary limit */
      RET((uint64_t []) { 4096 });
   case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
      RET((uint32_t []) { 32 });
   case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
      RET((uint64_t []) { 1ULL << 40 });
   case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
      RET((uint32_t []) { 0 });
   case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
      RET((uint32_t []) { screen->mp_count });
   case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
      RET((uint32_t []) { 512 }); /* FIXME: arbitrary limit */
   default:
      return 0;
   }

#undef RET
}
+
static void
nv50_screen_destroy(struct pipe_screen *pscreen)
{
@@ -377,6 +422,7 @@ nv50_screen_destroy(struct pipe_screen *pscreen)
nouveau_object_del(&screen->tesla);
nouveau_object_del(&screen->eng2d);
nouveau_object_del(&screen->m2mf);
+ nouveau_object_del(&screen->compute);
nouveau_object_del(&screen->sync);
nouveau_screen_fini(&screen->base);
@@ -640,7 +686,7 @@ nv50_screen_init_hwctx(struct nv50_screen *screen)
BEGIN_NV04(push, NV50_3D(VB_ELEMENT_BASE), 1);
PUSH_DATA (push, 0);
if (screen->base.class_3d >= NV84_3D_CLASS) {
- BEGIN_NV04(push, SUBC_3D(NV84_3D_VERTEX_ID_BASE), 1);
+ BEGIN_NV04(push, NV84_3D(VERTEX_ID_BASE), 1);
PUSH_DATA (push, 0);
}
@@ -742,6 +788,9 @@ nv50_screen_create(struct nouveau_device *dev)
pscreen->get_param = nv50_screen_get_param;
pscreen->get_shader_param = nv50_screen_get_shader_param;
pscreen->get_paramf = nv50_screen_get_paramf;
+ pscreen->get_compute_param = nv50_screen_get_compute_param;
+ pscreen->get_driver_query_info = nv50_screen_get_driver_query_info;
+ pscreen->get_driver_query_group_info = nv50_screen_get_driver_query_group_info;
nv50_screen_init_resource_functions(pscreen);
@@ -851,6 +900,8 @@ nv50_screen_create(struct nouveau_device *dev)
screen->TPs = util_bitcount(value & 0xffff);
screen->MPsInTP = util_bitcount((value >> 24) & 0xf);
+ screen->mp_count = screen->TPs * screen->MPsInTP;
+
stack_size = util_next_power_of_two(screen->TPs) * screen->MPsInTP *
STACK_WARPS_ALLOC * 64 * 8;
@@ -902,6 +953,12 @@ nv50_screen_create(struct nouveau_device *dev)
nv50_screen_init_hwctx(screen);
+ ret = nv50_screen_compute_setup(screen, screen->base.pushbuf);
+ if (ret) {
+ NOUVEAU_ERR("Failed to init compute context: %d\n", ret);
+ goto fail;
+ }
+
nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
return pscreen;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
index ce51f0fc254..2a4983d1020 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
@@ -23,6 +23,10 @@ struct nv50_context;
#define NV50_MAX_VIEWPORTS 16
+#define NV50_MAX_GLOBALS 16
+
+#define ONE_TEMP_SIZE (4/*vector*/ * sizeof(float))
+
struct nv50_blitter;
struct nv50_graph_state {
@@ -66,6 +70,7 @@ struct nv50_screen {
unsigned MPsInTP;
unsigned max_tls_space;
unsigned cur_tls_space;
+ unsigned mp_count;
struct nouveau_heap *vp_code_heap;
struct nouveau_heap *gp_code_heap;
@@ -90,9 +95,16 @@ struct nv50_screen {
struct nouveau_bo *bo;
} fence;
+ struct {
+ struct nv50_program *prog; /* compute state object to read MP counters */
+ struct nv50_hw_sm_query *mp_counter[4]; /* counter to query allocation */
+ uint8_t num_hw_sm_active;
+ } pm;
+
struct nouveau_object *sync;
struct nouveau_object *tesla;
+ struct nouveau_object *compute;
struct nouveau_object *eng2d;
struct nouveau_object *m2mf;
};
@@ -103,12 +115,19 @@ nv50_screen(struct pipe_screen *screen)
return (struct nv50_screen *)screen;
}
+int nv50_screen_get_driver_query_info(struct pipe_screen *, unsigned,
+ struct pipe_driver_query_info *);
+int nv50_screen_get_driver_query_group_info(struct pipe_screen *, unsigned,
+ struct pipe_driver_query_group_info *);
+
bool nv50_blitter_create(struct nv50_screen *);
void nv50_blitter_destroy(struct nv50_screen *);
int nv50_screen_tic_alloc(struct nv50_screen *, void *);
int nv50_screen_tsc_alloc(struct nv50_screen *, void *);
+int nv50_screen_compute_setup(struct nv50_screen *, struct nouveau_pushbuf *);
+
static inline void
nv50_resource_fence(struct nv04_resource *res, uint32_t flags)
{
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index d27f12ca94b..b4ea08d4d13 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -792,6 +792,35 @@ nv50_gp_state_bind(struct pipe_context *pipe, void *hwcso)
nv50->dirty |= NV50_NEW_GMTYPROG;
}
+static void *
+nv50_cp_state_create(struct pipe_context *pipe,
+ const struct pipe_compute_state *cso)
+{
+ struct nv50_program *prog;
+
+ prog = CALLOC_STRUCT(nv50_program);
+ if (!prog)
+ return NULL;
+ prog->type = PIPE_SHADER_COMPUTE;
+
+ prog->cp.smem_size = cso->req_local_mem;
+ prog->cp.lmem_size = cso->req_private_mem;
+ prog->parm_size = cso->req_input_mem;
+
+ prog->pipe.tokens = tgsi_dup_tokens((const struct tgsi_token *)cso->prog);
+
+ return (void *)prog;
+}
+
/* Bind a compute state object created by nv50_cp_state_create. */
static void
nv50_cp_state_bind(struct pipe_context *pipe, void *hwcso)
{
   struct nv50_context *nv50 = nv50_context(pipe);

   nv50->compprog = hwcso;
   nv50->dirty_cp |= NV50_NEW_CP_PROGRAM;
}
+
static void
nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
struct pipe_constant_buffer *cb)
@@ -1134,6 +1163,70 @@ nv50_set_stream_output_targets(struct pipe_context *pipe,
nv50->dirty |= NV50_NEW_STRMOUT;
}
/* Set compute resources (surfaces); currently an unimplemented stub. */
static void
nv50_set_compute_resources(struct pipe_context *pipe,
                           unsigned start, unsigned nr,
                           struct pipe_surface **resources)
{
   /* TODO: bind surfaces */
}
+
+static inline void
+nv50_set_global_handle(uint32_t *phandle, struct pipe_resource *res)
+{
+ struct nv04_resource *buf = nv04_resource(res);
+ if (buf) {
+ uint64_t limit = (buf->address + buf->base.width0) - 1;
+ if (limit < (1ULL << 32)) {
+ *phandle = (uint32_t)buf->address;
+ } else {
+ NOUVEAU_ERR("Cannot map into TGSI_RESOURCE_GLOBAL: "
+ "resource not contained within 32-bit address space !\n");
+ *phandle = 0;
+ }
+ } else {
+ *phandle = 0;
+ }
+}
+
+static void
+nv50_set_global_bindings(struct pipe_context *pipe,
+ unsigned start, unsigned nr,
+ struct pipe_resource **resources,
+ uint32_t **handles)
+{
+ struct nv50_context *nv50 = nv50_context(pipe);
+ struct pipe_resource **ptr;
+ unsigned i;
+ const unsigned end = start + nr;
+
+ if (nv50->global_residents.size <= (end * sizeof(struct pipe_resource *))) {
+ const unsigned old_size = nv50->global_residents.size;
+ const unsigned req_size = end * sizeof(struct pipe_resource *);
+ util_dynarray_resize(&nv50->global_residents, req_size);
+ memset((uint8_t *)nv50->global_residents.data + old_size, 0,
+ req_size - old_size);
+ }
+
+ if (resources) {
+ ptr = util_dynarray_element(
+ &nv50->global_residents, struct pipe_resource *, start);
+ for (i = 0; i < nr; ++i) {
+ pipe_resource_reference(&ptr[i], resources[i]);
+ nv50_set_global_handle(handles[i], resources[i]);
+ }
+ } else {
+ ptr = util_dynarray_element(
+ &nv50->global_residents, struct pipe_resource *, start);
+ for (i = 0; i < nr; ++i)
+ pipe_resource_reference(&ptr[i], NULL);
+ }
+
+ nouveau_bufctx_reset(nv50->bufctx_cp, NV50_BIND_CP_GLOBAL);
+
+ nv50->dirty_cp = NV50_NEW_CP_GLOBALS;
+}
+
void
nv50_init_state_functions(struct nv50_context *nv50)
{
@@ -1162,12 +1255,15 @@ nv50_init_state_functions(struct nv50_context *nv50)
pipe->create_vs_state = nv50_vp_state_create;
pipe->create_fs_state = nv50_fp_state_create;
pipe->create_gs_state = nv50_gp_state_create;
+ pipe->create_compute_state = nv50_cp_state_create;
pipe->bind_vs_state = nv50_vp_state_bind;
pipe->bind_fs_state = nv50_fp_state_bind;
pipe->bind_gs_state = nv50_gp_state_bind;
+ pipe->bind_compute_state = nv50_cp_state_bind;
pipe->delete_vs_state = nv50_sp_state_delete;
pipe->delete_fs_state = nv50_sp_state_delete;
pipe->delete_gs_state = nv50_sp_state_delete;
+ pipe->delete_compute_state = nv50_sp_state_delete;
pipe->set_blend_color = nv50_set_blend_color;
pipe->set_stencil_ref = nv50_set_stencil_ref;
@@ -1191,6 +1287,9 @@ nv50_init_state_functions(struct nv50_context *nv50)
pipe->stream_output_target_destroy = nv50_so_target_destroy;
pipe->set_stream_output_targets = nv50_set_stream_output_targets;
+ pipe->set_global_binding = nv50_set_global_bindings;
+ pipe->set_compute_resources = nv50_set_compute_resources;
+
nv50->sample_mask = ~0;
nv50->min_samples = 1;
}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index b6181edf24f..02a759c23ad 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -503,8 +503,7 @@ static struct state_validate {
{ nv50_validate_samplers, NV50_NEW_SAMPLERS },
{ nv50_stream_output_validate, NV50_NEW_STRMOUT |
NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
- { nv50_vertex_arrays_validate, NV50_NEW_VERTEX | NV50_NEW_ARRAYS |
- NV50_NEW_VERTPROG },
+ { nv50_vertex_arrays_validate, NV50_NEW_VERTEX | NV50_NEW_ARRAYS },
{ nv50_validate_min_samples, NV50_NEW_MIN_SAMPLES },
};
#define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0]))
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index 916a7d44a31..8ba19d2cc90 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -339,12 +339,18 @@ nv50_clear_render_target(struct pipe_context *pipe,
PUSH_DATA (push, (width << 16) | dstx);
PUSH_DATA (push, (height << 16) | dsty);
+ BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
+ PUSH_DATA (push, NV50_3D_COND_MODE_ALWAYS);
+
BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), sf->depth);
for (z = 0; z < sf->depth; ++z) {
PUSH_DATA (push, 0x3c |
(z << NV50_3D_CLEAR_BUFFERS_LAYER__SHIFT));
}
+ BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
+ PUSH_DATA (push, nv50->cond_condmode);
+
nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
}
@@ -415,12 +421,18 @@ nv50_clear_depth_stencil(struct pipe_context *pipe,
PUSH_DATA (push, (width << 16) | dstx);
PUSH_DATA (push, (height << 16) | dsty);
+ BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
+ PUSH_DATA (push, NV50_3D_COND_MODE_ALWAYS);
+
BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), sf->depth);
for (z = 0; z < sf->depth; ++z) {
PUSH_DATA (push, mode |
(z << NV50_3D_CLEAR_BUFFERS_LAYER__SHIFT));
}
+ BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
+ PUSH_DATA (push, nv50->cond_condmode);
+
nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
}
@@ -673,6 +685,9 @@ nv50_clear_buffer(struct pipe_context *pipe,
PUSH_DATA (push, (width << 16));
PUSH_DATA (push, (height << 16));
+ BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
+ PUSH_DATA (push, NV50_3D_COND_MODE_ALWAYS);
+
BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), 1);
PUSH_DATA (push, 0x3c);
@@ -690,6 +705,9 @@ nv50_clear_buffer(struct pipe_context *pipe,
PUSH_DATA (push, 0x3c);
}
+ BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
+ PUSH_DATA (push, nv50->cond_condmode);
+
nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence);
nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence_wr);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 9aa593f919e..85878d5fcc7 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -294,8 +294,7 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50)
uint64_t addrs[PIPE_MAX_ATTRIBS];
uint32_t limits[PIPE_MAX_ATTRIBS];
struct nouveau_pushbuf *push = nv50->base.pushbuf;
- struct nv50_vertex_stateobj dummy = {};
- struct nv50_vertex_stateobj *vertex = nv50->vertex ? nv50->vertex : &dummy;
+ struct nv50_vertex_stateobj *vertex = nv50->vertex;
struct pipe_vertex_buffer *vb;
struct nv50_vertex_element *ve;
uint32_t mask;
@@ -303,14 +302,6 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50)
unsigned i;
const unsigned n = MAX2(vertex->num_elements, nv50->state.num_vtxelts);
- /* A vertexid is not generated for inline data uploads. Have to use a
- * VBO. This check must come after the vertprog has been validated,
- * otherwise vertexid may be unset.
- */
- assert(nv50->vertprog->translated);
- if (nv50->vertprog->vp.vertexid)
- nv50->vbo_push_hint = 0;
-
if (unlikely(vertex->need_conversion))
nv50->vbo_fifo = ~0;
else
@@ -487,7 +478,7 @@ nv50_draw_arrays(struct nv50_context *nv50,
BEGIN_NV04(push, NV50_3D(VB_ELEMENT_BASE), 1);
PUSH_DATA (push, 0);
if (nv50->screen->base.class_3d >= NV84_3D_CLASS) {
- BEGIN_NV04(push, SUBC_3D(NV84_3D_VERTEX_ID_BASE), 1);
+ BEGIN_NV04(push, NV84_3D(VERTEX_ID_BASE), 1);
PUSH_DATA (push, 0);
}
nv50->state.index_bias = 0;
@@ -613,7 +604,7 @@ nv50_draw_elements(struct nv50_context *nv50, bool shorten,
BEGIN_NV04(push, NV50_3D(VB_ELEMENT_BASE), 1);
PUSH_DATA (push, index_bias);
if (nv50->screen->base.class_3d >= NV84_3D_CLASS) {
- BEGIN_NV04(push, SUBC_3D(NV84_3D_VERTEX_ID_BASE), 1);
+ BEGIN_NV04(push, NV84_3D(VERTEX_ID_BASE), 1);
PUSH_DATA (push, index_bias);
}
nv50->state.index_bias = index_bias;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
index 76f1b41ea70..68002305d72 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
@@ -49,6 +49,7 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
#define SUBC_3D(m) 3, (m)
#define NV50_3D(n) SUBC_3D(NV50_3D_##n)
+#define NV84_3D(n) SUBC_3D(NV84_3D_##n)
#define NVA0_3D(n) SUBC_3D(NVA0_3D_##n)
#define SUBC_2D(m) 4, (m)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index 82ed5a1864e..162661ff2a7 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -180,9 +180,10 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
int ref)
{
struct nvc0_context *nvc0 = nvc0_context(&ctx->pipe);
+ unsigned bind = res->bind ? res->bind : PIPE_BIND_VERTEX_BUFFER;
unsigned s, i;
- if (res->bind & PIPE_BIND_RENDER_TARGET) {
+ if (bind & PIPE_BIND_RENDER_TARGET) {
for (i = 0; i < nvc0->framebuffer.nr_cbufs; ++i) {
if (nvc0->framebuffer.cbufs[i] &&
nvc0->framebuffer.cbufs[i]->texture == res) {
@@ -193,7 +194,7 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
}
}
}
- if (res->bind & PIPE_BIND_DEPTH_STENCIL) {
+ if (bind & PIPE_BIND_DEPTH_STENCIL) {
if (nvc0->framebuffer.zsbuf &&
nvc0->framebuffer.zsbuf->texture == res) {
nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
@@ -203,12 +204,12 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
}
}
- if (res->bind & (PIPE_BIND_VERTEX_BUFFER |
- PIPE_BIND_INDEX_BUFFER |
- PIPE_BIND_CONSTANT_BUFFER |
- PIPE_BIND_STREAM_OUTPUT |
- PIPE_BIND_COMMAND_ARGS_BUFFER |
- PIPE_BIND_SAMPLER_VIEW)) {
+ if (bind & (PIPE_BIND_VERTEX_BUFFER |
+ PIPE_BIND_INDEX_BUFFER |
+ PIPE_BIND_CONSTANT_BUFFER |
+ PIPE_BIND_STREAM_OUTPUT |
+ PIPE_BIND_COMMAND_ARGS_BUFFER |
+ PIPE_BIND_SAMPLER_VIEW)) {
for (i = 0; i < nvc0->num_vtxbufs; ++i) {
if (nvc0->vtxbuf[i].buffer == res) {
nvc0->dirty |= NVC0_NEW_ARRAYS;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
index f53921092a5..d992b10a23c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -162,6 +162,7 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen,
info->max_value.u64 = 0;
info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
info->group_id = -1;
+ info->flags = 0;
#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
if (id < num_sw_queries)
@@ -200,7 +201,6 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
if (id == NVC0_HW_SM_QUERY_GROUP) {
if (screen->compute) {
info->name = "MP counters";
- info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU;
/* Because we can't expose the number of hardware counters needed for
* each different query, we don't want to allow more than one active
@@ -224,7 +224,6 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
if (screen->compute) {
if (screen->base.class_3d < NVE4_3D_CLASS) {
info->name = "Performance metrics";
- info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU;
info->max_active_queries = 1;
info->num_queries = NVC0_HW_METRIC_QUERY_COUNT;
return 1;
@@ -234,7 +233,6 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
else if (id == NVC0_SW_QUERY_DRV_STAT_GROUP) {
info->name = "Driver statistics";
- info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_CPU;
info->max_active_queries = NVC0_SW_QUERY_DRV_STAT_COUNT;
info->num_queries = NVC0_SW_QUERY_DRV_STAT_COUNT;
return 1;
@@ -245,7 +243,6 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
info->name = "this_is_not_the_query_group_you_are_looking_for";
info->max_active_queries = 0;
info->num_queries = 0;
- info->type = 0;
return 0;
}
@@ -260,4 +257,5 @@ nvc0_init_query_functions(struct nvc0_context *nvc0)
pipe->end_query = nvc0_end_query;
pipe->get_query_result = nvc0_get_query_result;
pipe->render_condition = nvc0_render_condition;
+ nvc0->cond_condmode = NVC0_3D_COND_MODE_ALWAYS;
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
index 44b222e5134..7962143d45a 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
@@ -1014,14 +1014,15 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
prog->type = PIPE_SHADER_COMPUTE;
prog->translated = true;
- prog->num_gprs = 14;
prog->parm_size = 12;
if (is_nve4) {
prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
+ prog->num_gprs = 14;
} else {
prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
+ prog->num_gprs = 12;
}
screen->pm.prog = prog;
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index cdb1fc1145f..6a4ae5be2ab 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -341,12 +341,16 @@ nvc0_clear_render_target(struct pipe_context *pipe,
nvc0_resource_fence(res, NOUVEAU_BO_WR);
}
+ IMMED_NVC0(push, NVC0_3D(COND_MODE), NVC0_3D_COND_MODE_ALWAYS);
+
BEGIN_NIC0(push, NVC0_3D(CLEAR_BUFFERS), sf->depth);
for (z = 0; z < sf->depth; ++z) {
PUSH_DATA (push, 0x3c |
(z << NVC0_3D_CLEAR_BUFFERS_LAYER__SHIFT));
}
+ IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode);
+
nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
}
@@ -470,6 +474,8 @@ nvc0_clear_buffer(struct pipe_context *pipe,
IMMED_NVC0(push, NVC0_3D(ZETA_ENABLE), 0);
IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), 0);
+ IMMED_NVC0(push, NVC0_3D(COND_MODE), NVC0_3D_COND_MODE_ALWAYS);
+
IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c);
if (width * height != elements) {
@@ -486,6 +492,8 @@ nvc0_clear_buffer(struct pipe_context *pipe,
IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c);
}
+ IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode);
+
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
@@ -545,12 +553,16 @@ nvc0_clear_depth_stencil(struct pipe_context *pipe,
PUSH_DATA (push, dst->u.tex.first_layer);
IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), mt->ms_mode);
+ IMMED_NVC0(push, NVC0_3D(COND_MODE), NVC0_3D_COND_MODE_ALWAYS);
+
BEGIN_NIC0(push, NVC0_3D(CLEAR_BUFFERS), sf->depth);
for (z = 0; z < sf->depth; ++z) {
PUSH_DATA (push, mode |
(z << NVC0_3D_CLEAR_BUFFERS_LAYER__SHIFT));
}
+ IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode);
+
nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
}
diff --git a/src/gallium/drivers/radeon/Makefile.sources b/src/gallium/drivers/radeon/Makefile.sources
index f63790c329e..1dbad2f39e3 100644
--- a/src/gallium/drivers/radeon/Makefile.sources
+++ b/src/gallium/drivers/radeon/Makefile.sources
@@ -7,12 +7,14 @@ C_SOURCES := \
r600_pipe_common.c \
r600_pipe_common.h \
r600_query.c \
+ r600_query.h \
r600_streamout.c \
r600_texture.c \
radeon_uvd.c \
radeon_uvd.h \
radeon_vce_40_2_2.c \
radeon_vce_50.c \
+ radeon_vce_52.c \
radeon_vce.c \
radeon_vce.h \
radeon_video.c \
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 3599692a857..7464f677398 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -27,6 +27,7 @@
#include "r600_pipe_common.h"
#include "r600_cs.h"
#include "tgsi/tgsi_parse.h"
+#include "util/list.h"
#include "util/u_draw_quad.h"
#include "util/u_memory.h"
#include "util/u_format_s3tc.h"
@@ -135,12 +136,10 @@ static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags)
void r600_preflush_suspend_features(struct r600_common_context *ctx)
{
/* suspend queries */
- ctx->queries_suspended_for_flush = false;
- if (ctx->num_cs_dw_nontimer_queries_suspend) {
+ if (!LIST_IS_EMPTY(&ctx->active_nontimer_queries))
r600_suspend_nontimer_queries(ctx);
+ if (!LIST_IS_EMPTY(&ctx->active_timer_queries))
r600_suspend_timer_queries(ctx);
- ctx->queries_suspended_for_flush = true;
- }
ctx->streamout.suspended = false;
if (ctx->streamout.begin_emitted) {
@@ -157,10 +156,10 @@ void r600_postflush_resume_features(struct r600_common_context *ctx)
}
/* resume queries */
- if (ctx->queries_suspended_for_flush) {
- r600_resume_nontimer_queries(ctx);
+ if (!LIST_IS_EMPTY(&ctx->active_timer_queries))
r600_resume_timer_queries(ctx);
- }
+ if (!LIST_IS_EMPTY(&ctx->active_nontimer_queries))
+ r600_resume_nontimer_queries(ctx);
}
static void r600_flush_from_st(struct pipe_context *ctx,
@@ -718,50 +717,6 @@ static uint64_t r600_get_timestamp(struct pipe_screen *screen)
rscreen->info.r600_clock_crystal_freq;
}
-static int r600_get_driver_query_info(struct pipe_screen *screen,
- unsigned index,
- struct pipe_driver_query_info *info)
-{
- struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
- struct pipe_driver_query_info list[] = {
- {"num-compilations", R600_QUERY_NUM_COMPILATIONS, {0}, PIPE_DRIVER_QUERY_TYPE_UINT64,
- PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
- {"num-shaders-created", R600_QUERY_NUM_SHADERS_CREATED, {0}, PIPE_DRIVER_QUERY_TYPE_UINT64,
- PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
- {"draw-calls", R600_QUERY_DRAW_CALLS, {0}},
- {"requested-VRAM", R600_QUERY_REQUESTED_VRAM, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
- {"requested-GTT", R600_QUERY_REQUESTED_GTT, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
- {"buffer-wait-time", R600_QUERY_BUFFER_WAIT_TIME, {0}, PIPE_DRIVER_QUERY_TYPE_MICROSECONDS,
- PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
- {"num-cs-flushes", R600_QUERY_NUM_CS_FLUSHES, {0}},
- {"num-bytes-moved", R600_QUERY_NUM_BYTES_MOVED, {0}, PIPE_DRIVER_QUERY_TYPE_BYTES,
- PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
- {"VRAM-usage", R600_QUERY_VRAM_USAGE, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
- {"GTT-usage", R600_QUERY_GTT_USAGE, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
- {"GPU-load", R600_QUERY_GPU_LOAD, {100}},
- {"temperature", R600_QUERY_GPU_TEMPERATURE, {125}},
- {"shader-clock", R600_QUERY_CURRENT_GPU_SCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ},
- {"memory-clock", R600_QUERY_CURRENT_GPU_MCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ},
- };
- unsigned num_queries;
-
- if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42)
- num_queries = Elements(list);
- else if (rscreen->info.drm_major == 3)
- num_queries = Elements(list) - 3;
- else
- num_queries = Elements(list) - 4;
-
- if (!info)
- return num_queries;
-
- if (index >= num_queries)
- return 0;
-
- *info = list[index];
- return 1;
-}
-
static void r600_fence_reference(struct pipe_screen *screen,
struct pipe_fence_handle **dst,
struct pipe_fence_handle *src)
@@ -949,7 +904,6 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
rscreen->b.get_device_vendor = r600_get_device_vendor;
rscreen->b.get_compute_param = r600_get_compute_param;
rscreen->b.get_paramf = r600_get_paramf;
- rscreen->b.get_driver_query_info = r600_get_driver_query_info;
rscreen->b.get_timestamp = r600_get_timestamp;
rscreen->b.fence_finish = r600_fence_finish;
rscreen->b.fence_reference = r600_fence_reference;
@@ -965,6 +919,7 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
}
r600_init_screen_texture_functions(rscreen);
+ r600_init_screen_query_functions(rscreen);
rscreen->ws = ws;
rscreen->family = rscreen->info.family;
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index ebe633b9125..fbdc5c410ae 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -47,21 +47,6 @@
#define R600_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
#define R600_RESOURCE_FLAG_FORCE_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2)
-#define R600_QUERY_DRAW_CALLS (PIPE_QUERY_DRIVER_SPECIFIC + 0)
-#define R600_QUERY_REQUESTED_VRAM (PIPE_QUERY_DRIVER_SPECIFIC + 1)
-#define R600_QUERY_REQUESTED_GTT (PIPE_QUERY_DRIVER_SPECIFIC + 2)
-#define R600_QUERY_BUFFER_WAIT_TIME (PIPE_QUERY_DRIVER_SPECIFIC + 3)
-#define R600_QUERY_NUM_CS_FLUSHES (PIPE_QUERY_DRIVER_SPECIFIC + 4)
-#define R600_QUERY_NUM_BYTES_MOVED (PIPE_QUERY_DRIVER_SPECIFIC + 5)
-#define R600_QUERY_VRAM_USAGE (PIPE_QUERY_DRIVER_SPECIFIC + 6)
-#define R600_QUERY_GTT_USAGE (PIPE_QUERY_DRIVER_SPECIFIC + 7)
-#define R600_QUERY_GPU_TEMPERATURE (PIPE_QUERY_DRIVER_SPECIFIC + 8)
-#define R600_QUERY_CURRENT_GPU_SCLK (PIPE_QUERY_DRIVER_SPECIFIC + 9)
-#define R600_QUERY_CURRENT_GPU_MCLK (PIPE_QUERY_DRIVER_SPECIFIC + 10)
-#define R600_QUERY_GPU_LOAD (PIPE_QUERY_DRIVER_SPECIFIC + 11)
-#define R600_QUERY_NUM_COMPILATIONS (PIPE_QUERY_DRIVER_SPECIFIC + 12)
-#define R600_QUERY_NUM_SHADERS_CREATED (PIPE_QUERY_DRIVER_SPECIFIC + 13)
-
#define R600_CONTEXT_STREAMOUT_FLUSH (1u << 0)
#define R600_CONTEXT_PRIVATE_FLAG (1u << 1)
@@ -408,8 +393,6 @@ struct r600_common_context {
struct list_head active_timer_queries;
unsigned num_cs_dw_nontimer_queries_suspend;
unsigned num_cs_dw_timer_queries_suspend;
- /* If queries have been suspended. */
- bool queries_suspended_for_flush;
/* Additional hardware info. */
unsigned backend_mask;
unsigned max_db; /* for OQ */
@@ -526,6 +509,7 @@ uint64_t r600_gpu_load_begin(struct r600_common_screen *rscreen);
unsigned r600_gpu_load_end(struct r600_common_screen *rscreen, uint64_t begin);
/* r600_query.c */
+void r600_init_screen_query_functions(struct r600_common_screen *rscreen);
void r600_query_init(struct r600_common_context *rctx);
void r600_suspend_nontimer_queries(struct r600_common_context *ctx);
void r600_resume_nontimer_queries(struct r600_common_context *ctx);
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 8c2b601a96c..b1cfb6e462b 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -22,81 +22,218 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
+#include "r600_query.h"
#include "r600_cs.h"
#include "util/u_memory.h"
+/* Queries without buffer handling or suspend/resume. */
+struct r600_query_sw {
+ struct r600_query b;
-struct r600_query_buffer {
- /* The buffer where query results are stored. */
- struct r600_resource *buf;
- /* Offset of the next free result after current query data */
- unsigned results_end;
- /* If a query buffer is full, a new buffer is created and the old one
- * is put in here. When we calculate the result, we sum up the samples
- * from all buffers. */
- struct r600_query_buffer *previous;
-};
-
-struct r600_query {
- /* The query buffer and how many results are in it. */
- struct r600_query_buffer buffer;
- /* The type of query */
- unsigned type;
- /* Size of the result in memory for both begin_query and end_query,
- * this can be one or two numbers, or it could even be a size of a structure. */
- unsigned result_size;
- /* The number of dwords for begin_query or end_query. */
- unsigned num_cs_dw;
- /* linked list of queries */
- struct list_head list;
- /* for custom non-GPU queries */
uint64_t begin_result;
uint64_t end_result;
/* Fence for GPU_FINISHED. */
struct pipe_fence_handle *fence;
- /* For transform feedback: which stream the query is for */
- unsigned stream;
};
-
-static bool r600_is_timer_query(unsigned type)
+static void r600_query_sw_destroy(struct r600_common_context *rctx,
+ struct r600_query *rquery)
{
- return type == PIPE_QUERY_TIME_ELAPSED ||
- type == PIPE_QUERY_TIMESTAMP;
+ struct pipe_screen *screen = rctx->b.screen;
+ struct r600_query_sw *query = (struct r600_query_sw *)rquery;
+
+ screen->fence_reference(screen, &query->fence, NULL);
+ FREE(query);
}
-static bool r600_query_needs_begin(unsigned type)
+static enum radeon_value_id winsys_id_from_type(unsigned type)
{
- return type != PIPE_QUERY_GPU_FINISHED &&
- type != PIPE_QUERY_TIMESTAMP;
+ switch (type) {
+ case R600_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY;
+ case R600_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY;
+ case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS;
+ case R600_QUERY_NUM_CS_FLUSHES: return RADEON_NUM_CS_FLUSHES;
+ case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
+ case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
+ case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
+ case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE;
+ case R600_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK;
+ case R600_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK;
+ default: unreachable("query type does not correspond to winsys id");
+ }
}
-static struct r600_resource *r600_new_query_buffer(struct r600_common_context *ctx, unsigned type)
+static boolean r600_query_sw_begin(struct r600_common_context *rctx,
+ struct r600_query *rquery)
{
- unsigned j, i, num_results, buf_size = 4096;
- uint32_t *results;
+ struct r600_query_sw *query = (struct r600_query_sw *)rquery;
- /* Non-GPU queries. */
- switch (type) {
+ switch(query->b.type) {
case PIPE_QUERY_TIMESTAMP_DISJOINT:
case PIPE_QUERY_GPU_FINISHED:
+ break;
case R600_QUERY_DRAW_CALLS:
+ query->begin_result = rctx->num_draw_calls;
+ break;
case R600_QUERY_REQUESTED_VRAM:
case R600_QUERY_REQUESTED_GTT:
+ case R600_QUERY_VRAM_USAGE:
+ case R600_QUERY_GTT_USAGE:
+ case R600_QUERY_GPU_TEMPERATURE:
+ case R600_QUERY_CURRENT_GPU_SCLK:
+ case R600_QUERY_CURRENT_GPU_MCLK:
+ query->begin_result = 0;
+ break;
case R600_QUERY_BUFFER_WAIT_TIME:
case R600_QUERY_NUM_CS_FLUSHES:
- case R600_QUERY_NUM_BYTES_MOVED:
+ case R600_QUERY_NUM_BYTES_MOVED: {
+ enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
+ query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
+ break;
+ }
+ case R600_QUERY_GPU_LOAD:
+ query->begin_result = r600_gpu_load_begin(rctx->screen);
+ break;
+ case R600_QUERY_NUM_COMPILATIONS:
+ query->begin_result = p_atomic_read(&rctx->screen->num_compilations);
+ break;
+ case R600_QUERY_NUM_SHADERS_CREATED:
+ query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
+ break;
+ default:
+ unreachable("r600_query_sw_begin: bad query type");
+ }
+
+ return TRUE;
+}
+
+static void r600_query_sw_end(struct r600_common_context *rctx,
+ struct r600_query *rquery)
+{
+ struct r600_query_sw *query = (struct r600_query_sw *)rquery;
+
+ switch(query->b.type) {
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ break;
+ case PIPE_QUERY_GPU_FINISHED:
+ rctx->b.flush(&rctx->b, &query->fence, 0);
+ break;
+ case R600_QUERY_DRAW_CALLS:
+ query->begin_result = rctx->num_draw_calls;
+ break;
+ case R600_QUERY_REQUESTED_VRAM:
+ case R600_QUERY_REQUESTED_GTT:
case R600_QUERY_VRAM_USAGE:
case R600_QUERY_GTT_USAGE:
case R600_QUERY_GPU_TEMPERATURE:
case R600_QUERY_CURRENT_GPU_SCLK:
case R600_QUERY_CURRENT_GPU_MCLK:
+ case R600_QUERY_BUFFER_WAIT_TIME:
+ case R600_QUERY_NUM_CS_FLUSHES:
+ case R600_QUERY_NUM_BYTES_MOVED: {
+ enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
+ query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
+ break;
+ }
case R600_QUERY_GPU_LOAD:
+ query->end_result = r600_gpu_load_end(rctx->screen,
+ query->begin_result);
+ query->begin_result = 0;
+ break;
case R600_QUERY_NUM_COMPILATIONS:
+ query->begin_result = p_atomic_read(&rctx->screen->num_compilations);
+ break;
case R600_QUERY_NUM_SHADERS_CREATED:
+ query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
+ break;
+ default:
+ unreachable("r600_query_sw_end: bad query type");
+ }
+}
+
+static boolean r600_query_sw_get_result(struct r600_common_context *rctx,
+ struct r600_query *rquery,
+ boolean wait,
+ union pipe_query_result *result)
+{
+ struct r600_query_sw *query = (struct r600_query_sw *)rquery;
+
+ switch (query->b.type) {
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ /* Convert from cycles per millisecond to cycles per second (Hz). */
+ result->timestamp_disjoint.frequency =
+ (uint64_t)rctx->screen->info.r600_clock_crystal_freq * 1000;
+ result->timestamp_disjoint.disjoint = FALSE;
+ return TRUE;
+ case PIPE_QUERY_GPU_FINISHED: {
+ struct pipe_screen *screen = rctx->b.screen;
+ result->b = screen->fence_finish(screen, query->fence,
+ wait ? PIPE_TIMEOUT_INFINITE : 0);
+ return result->b;
+ }
+ }
+
+ result->u64 = query->end_result - query->begin_result;
+
+ switch (query->b.type) {
+ case R600_QUERY_BUFFER_WAIT_TIME:
+ case R600_QUERY_GPU_TEMPERATURE:
+ result->u64 /= 1000;
+ break;
+ case R600_QUERY_CURRENT_GPU_SCLK:
+ case R600_QUERY_CURRENT_GPU_MCLK:
+ result->u64 *= 1000000;
+ break;
+ }
+
+ return TRUE;
+}
+
+static struct r600_query_ops sw_query_ops = {
+ .destroy = r600_query_sw_destroy,
+ .begin = r600_query_sw_begin,
+ .end = r600_query_sw_end,
+ .get_result = r600_query_sw_get_result
+};
+
+static struct pipe_query *r600_query_sw_create(struct pipe_context *ctx,
+ unsigned query_type)
+{
+ struct r600_query_sw *query;
+
+ query = CALLOC_STRUCT(r600_query_sw);
+ if (query == NULL)
return NULL;
+
+ query->b.type = query_type;
+ query->b.ops = &sw_query_ops;
+
+ return (struct pipe_query *)query;
+}
+
+void r600_query_hw_destroy(struct r600_common_context *rctx,
+ struct r600_query *rquery)
+{
+ struct r600_query_hw *query = (struct r600_query_hw *)rquery;
+ struct r600_query_buffer *prev = query->buffer.previous;
+
+ /* Release all query buffers. */
+ while (prev) {
+ struct r600_query_buffer *qbuf = prev;
+ prev = prev->previous;
+ pipe_resource_reference((struct pipe_resource**)&qbuf->buf, NULL);
+ FREE(qbuf);
}
+ pipe_resource_reference((struct pipe_resource**)&query->buffer.buf, NULL);
+ FREE(rquery);
+}
+
+static struct r600_resource *r600_new_query_buffer(struct r600_common_context *ctx,
+ struct r600_query_hw *query)
+{
+ unsigned buf_size = 4096;
+
/* Queries are normally read by the CPU after
* being written by the gpu, hence staging is probably a good
* usage pattern.
@@ -105,14 +242,30 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c
pipe_buffer_create(ctx->b.screen, PIPE_BIND_CUSTOM,
PIPE_USAGE_STAGING, buf_size);
- switch (type) {
- case PIPE_QUERY_OCCLUSION_COUNTER:
- case PIPE_QUERY_OCCLUSION_PREDICATE:
- results = r600_buffer_map_sync_with_rings(ctx, buf, PIPE_TRANSFER_WRITE);
- memset(results, 0, buf_size);
+ if (query->flags & R600_QUERY_HW_FLAG_PREDICATE)
+ query->ops->prepare_buffer(ctx, query, buf);
+
+ return buf;
+}
+
+static void r600_query_hw_prepare_buffer(struct r600_common_context *ctx,
+ struct r600_query_hw *query,
+ struct r600_resource *buffer)
+{
+ /* Callers ensure that the buffer is currently unused by the GPU. */
+ uint32_t *results = ctx->ws->buffer_map(buffer->cs_buf, NULL,
+ PIPE_TRANSFER_WRITE |
+ PIPE_TRANSFER_UNSYNCHRONIZED);
+
+ memset(results, 0, buffer->b.b.width0);
+
+ if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
+ query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) {
+ unsigned num_results;
+ unsigned i, j;
/* Set top bits for unused backends. */
- num_results = buf_size / (16 * ctx->max_db);
+ num_results = buffer->b.b.width0 / (16 * ctx->max_db);
for (j = 0; j < num_results; j++) {
for (i = 0; i < ctx->max_db; i++) {
if (!(ctx->backend_mask & (1<<i))) {
@@ -122,22 +275,109 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c
}
results += 4 * ctx->max_db;
}
+ }
+}
+
+static struct r600_query_ops query_hw_ops = {
+ .destroy = r600_query_hw_destroy,
+ .begin = r600_query_hw_begin,
+ .end = r600_query_hw_end,
+ .get_result = r600_query_hw_get_result,
+};
+
+static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
+ struct r600_query_hw *query,
+ struct r600_resource *buffer,
+ uint64_t va);
+static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
+ struct r600_query_hw *query,
+ struct r600_resource *buffer,
+ uint64_t va);
+static void r600_query_hw_add_result(struct r600_common_context *ctx,
+ struct r600_query_hw *, void *buffer,
+ union pipe_query_result *result);
+static void r600_query_hw_clear_result(struct r600_query_hw *,
+ union pipe_query_result *);
+
+static struct r600_query_hw_ops query_hw_default_hw_ops = {
+ .prepare_buffer = r600_query_hw_prepare_buffer,
+ .emit_start = r600_query_hw_do_emit_start,
+ .emit_stop = r600_query_hw_do_emit_stop,
+ .clear_result = r600_query_hw_clear_result,
+ .add_result = r600_query_hw_add_result,
+};
+
+boolean r600_query_hw_init(struct r600_common_context *rctx,
+ struct r600_query_hw *query)
+{
+ query->buffer.buf = r600_new_query_buffer(rctx, query);
+ if (!query->buffer.buf)
+ return FALSE;
+
+ return TRUE;
+}
+
+static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx,
+ unsigned query_type,
+ unsigned index)
+{
+ struct r600_query_hw *query = CALLOC_STRUCT(r600_query_hw);
+ if (!query)
+ return NULL;
+
+ query->b.type = query_type;
+ query->b.ops = &query_hw_ops;
+ query->ops = &query_hw_default_hw_ops;
+
+ switch (query_type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ query->result_size = 16 * rctx->max_db;
+ query->num_cs_dw_begin = 6;
+ query->num_cs_dw_end = 6;
+ query->flags |= R600_QUERY_HW_FLAG_PREDICATE;
break;
case PIPE_QUERY_TIME_ELAPSED:
+ query->result_size = 16;
+ query->num_cs_dw_begin = 8;
+ query->num_cs_dw_end = 8;
+ query->flags = R600_QUERY_HW_FLAG_TIMER;
+ break;
case PIPE_QUERY_TIMESTAMP:
+ query->result_size = 8;
+ query->num_cs_dw_end = 8;
+ query->flags = R600_QUERY_HW_FLAG_TIMER |
+ R600_QUERY_HW_FLAG_NO_START;
break;
case PIPE_QUERY_PRIMITIVES_EMITTED:
case PIPE_QUERY_PRIMITIVES_GENERATED:
case PIPE_QUERY_SO_STATISTICS:
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
+ query->result_size = 32;
+ query->num_cs_dw_begin = 6;
+ query->num_cs_dw_end = 6;
+ query->stream = index;
+ query->flags |= R600_QUERY_HW_FLAG_PREDICATE;
+ break;
case PIPE_QUERY_PIPELINE_STATISTICS:
- results = r600_buffer_map_sync_with_rings(ctx, buf, PIPE_TRANSFER_WRITE);
- memset(results, 0, buf_size);
+ /* 11 values on EG, 8 on R600. */
+ query->result_size = (rctx->chip_class >= EVERGREEN ? 11 : 8) * 16;
+ query->num_cs_dw_begin = 6;
+ query->num_cs_dw_end = 6;
break;
default:
assert(0);
+ FREE(query);
+ return NULL;
}
- return buf;
+
+ if (!r600_query_hw_init(rctx, query)) {
+ FREE(query);
+ return NULL;
+ }
+
+ return (struct pipe_query *)query;
}
static void r600_update_occlusion_query_state(struct r600_common_context *rctx,
@@ -159,7 +399,7 @@ static void r600_update_occlusion_query_state(struct r600_common_context *rctx,
}
}
-static unsigned event_type_for_stream(struct r600_query *query)
+static unsigned event_type_for_stream(struct r600_query_hw *query)
{
switch (query->stream) {
default:
@@ -170,28 +410,14 @@ static unsigned event_type_for_stream(struct r600_query *query)
}
}
-static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_query *query)
+static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
+ struct r600_query_hw *query,
+ struct r600_resource *buffer,
+ uint64_t va)
{
struct radeon_winsys_cs *cs = ctx->gfx.cs;
- uint64_t va;
-
- r600_update_occlusion_query_state(ctx, query->type, 1);
- r600_update_prims_generated_query_state(ctx, query->type, 1);
- ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw * 2, TRUE);
-
- /* Get a new query buffer if needed. */
- if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {
- struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer);
- *qbuf = query->buffer;
- query->buffer.buf = r600_new_query_buffer(ctx, query->type);
- query->buffer.results_end = 0;
- query->buffer.previous = qbuf;
- }
-
- /* emit begin query */
- va = query->buffer.buf->gpu_address + query->buffer.results_end;
- switch (query->type) {
+ switch (query->b.type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
@@ -227,30 +453,50 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q
}
r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
RADEON_PRIO_QUERY);
-
- if (r600_is_timer_query(query->type))
- ctx->num_cs_dw_timer_queries_suspend += query->num_cs_dw;
- else
- ctx->num_cs_dw_nontimer_queries_suspend += query->num_cs_dw;
}
-static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_query *query)
+static void r600_query_hw_emit_start(struct r600_common_context *ctx,
+ struct r600_query_hw *query)
{
- struct radeon_winsys_cs *cs = ctx->gfx.cs;
uint64_t va;
- /* The queries which need begin already called this in begin_query. */
- if (!r600_query_needs_begin(query->type)) {
- ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw, FALSE);
+ r600_update_occlusion_query_state(ctx, query->b.type, 1);
+ r600_update_prims_generated_query_state(ctx, query->b.type, 1);
+
+ ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_begin + query->num_cs_dw_end,
+ TRUE);
+
+ /* Get a new query buffer if needed. */
+ if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {
+ struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer);
+ *qbuf = query->buffer;
+ query->buffer.buf = r600_new_query_buffer(ctx, query);
+ query->buffer.results_end = 0;
+ query->buffer.previous = qbuf;
}
- va = query->buffer.buf->gpu_address;
+ /* emit begin query */
+ va = query->buffer.buf->gpu_address + query->buffer.results_end;
+
+ query->ops->emit_start(ctx, query, query->buffer.buf, va);
- /* emit end query */
- switch (query->type) {
+ if (query->flags & R600_QUERY_HW_FLAG_TIMER)
+ ctx->num_cs_dw_timer_queries_suspend += query->num_cs_dw_end;
+ else
+ ctx->num_cs_dw_nontimer_queries_suspend += query->num_cs_dw_end;
+}
+
+static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
+ struct r600_query_hw *query,
+ struct r600_resource *buffer,
+ uint64_t va)
+{
+ struct radeon_winsys_cs *cs = ctx->gfx.cs;
+
+ switch (query->b.type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
- va += query->buffer.results_end + 8;
+ va += 8;
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
radeon_emit(cs, va);
@@ -260,14 +506,14 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que
case PIPE_QUERY_PRIMITIVES_GENERATED:
case PIPE_QUERY_SO_STATISTICS:
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- va += query->buffer.results_end + query->result_size/2;
+ va += query->result_size/2;
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
radeon_emit(cs, va);
radeon_emit(cs, (va >> 32) & 0xFFFF);
break;
case PIPE_QUERY_TIME_ELAPSED:
- va += query->buffer.results_end + query->result_size/2;
+ va += query->result_size/2;
/* fall through */
case PIPE_QUERY_TIMESTAMP:
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
@@ -278,7 +524,7 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que
radeon_emit(cs, 0);
break;
case PIPE_QUERY_PIPELINE_STATISTICS:
- va += query->buffer.results_end + query->result_size/2;
+ va += query->result_size/2;
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
radeon_emit(cs, va);
@@ -289,25 +535,41 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que
}
r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
RADEON_PRIO_QUERY);
+}
+
+static void r600_query_hw_emit_stop(struct r600_common_context *ctx,
+ struct r600_query_hw *query)
+{
+ uint64_t va;
+
+ /* The queries which need begin already called this in begin_query. */
+ if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
+ ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_end, FALSE);
+ }
+
+ /* emit end query */
+ va = query->buffer.buf->gpu_address + query->buffer.results_end;
+
+ query->ops->emit_stop(ctx, query, query->buffer.buf, va);
query->buffer.results_end += query->result_size;
- if (r600_query_needs_begin(query->type)) {
- if (r600_is_timer_query(query->type))
- ctx->num_cs_dw_timer_queries_suspend -= query->num_cs_dw;
+ if (!(query->flags & R600_QUERY_HW_FLAG_NO_START)) {
+ if (query->flags & R600_QUERY_HW_FLAG_TIMER)
+ ctx->num_cs_dw_timer_queries_suspend -= query->num_cs_dw_end;
else
- ctx->num_cs_dw_nontimer_queries_suspend -= query->num_cs_dw;
+ ctx->num_cs_dw_nontimer_queries_suspend -= query->num_cs_dw_end;
}
- r600_update_occlusion_query_state(ctx, query->type, -1);
- r600_update_prims_generated_query_state(ctx, query->type, -1);
+ r600_update_occlusion_query_state(ctx, query->b.type, -1);
+ r600_update_prims_generated_query_state(ctx, query->b.type, -1);
}
static void r600_emit_query_predication(struct r600_common_context *ctx,
struct r600_atom *atom)
{
struct radeon_winsys_cs *cs = ctx->gfx.cs;
- struct r600_query *query = (struct r600_query*)ctx->render_cond;
+ struct r600_query_hw *query = (struct r600_query_hw *)ctx->render_cond;
struct r600_query_buffer *qbuf;
uint32_t op;
bool flag_wait;
@@ -318,7 +580,7 @@ static void r600_emit_query_predication(struct r600_common_context *ctx,
flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
- switch (query->type) {
+ switch (query->b.type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
op = PRED_OP(PREDICATION_OP_ZPASS);
@@ -364,94 +626,21 @@ static void r600_emit_query_predication(struct r600_common_context *ctx,
static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
{
struct r600_common_context *rctx = (struct r600_common_context *)ctx;
- struct r600_query *query;
- bool skip_allocation = false;
- query = CALLOC_STRUCT(r600_query);
- if (query == NULL)
- return NULL;
-
- query->type = query_type;
-
- switch (query_type) {
- case PIPE_QUERY_OCCLUSION_COUNTER:
- case PIPE_QUERY_OCCLUSION_PREDICATE:
- query->result_size = 16 * rctx->max_db;
- query->num_cs_dw = 6;
- break;
- break;
- case PIPE_QUERY_TIME_ELAPSED:
- query->result_size = 16;
- query->num_cs_dw = 8;
- break;
- case PIPE_QUERY_TIMESTAMP:
- query->result_size = 8;
- query->num_cs_dw = 8;
- break;
- case PIPE_QUERY_PRIMITIVES_EMITTED:
- case PIPE_QUERY_PRIMITIVES_GENERATED:
- case PIPE_QUERY_SO_STATISTICS:
- case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
- query->result_size = 32;
- query->num_cs_dw = 6;
- query->stream = index;
- break;
- case PIPE_QUERY_PIPELINE_STATISTICS:
- /* 11 values on EG, 8 on R600. */
- query->result_size = (rctx->chip_class >= EVERGREEN ? 11 : 8) * 16;
- query->num_cs_dw = 6;
- break;
- /* Non-GPU queries and queries not requiring a buffer. */
- case PIPE_QUERY_TIMESTAMP_DISJOINT:
- case PIPE_QUERY_GPU_FINISHED:
- case R600_QUERY_DRAW_CALLS:
- case R600_QUERY_REQUESTED_VRAM:
- case R600_QUERY_REQUESTED_GTT:
- case R600_QUERY_BUFFER_WAIT_TIME:
- case R600_QUERY_NUM_CS_FLUSHES:
- case R600_QUERY_NUM_BYTES_MOVED:
- case R600_QUERY_VRAM_USAGE:
- case R600_QUERY_GTT_USAGE:
- case R600_QUERY_GPU_TEMPERATURE:
- case R600_QUERY_CURRENT_GPU_SCLK:
- case R600_QUERY_CURRENT_GPU_MCLK:
- case R600_QUERY_GPU_LOAD:
- case R600_QUERY_NUM_COMPILATIONS:
- case R600_QUERY_NUM_SHADERS_CREATED:
- skip_allocation = true;
- break;
- default:
- assert(0);
- FREE(query);
- return NULL;
- }
+ if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
+ query_type == PIPE_QUERY_GPU_FINISHED ||
+ query_type >= PIPE_QUERY_DRIVER_SPECIFIC)
+ return r600_query_sw_create(ctx, query_type);
- if (!skip_allocation) {
- query->buffer.buf = r600_new_query_buffer(rctx, query_type);
- if (!query->buffer.buf) {
- FREE(query);
- return NULL;
- }
- }
- return (struct pipe_query*)query;
+ return r600_query_hw_create(rctx, query_type, index);
}
static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
{
- struct r600_query *rquery = (struct r600_query*)query;
- struct r600_query_buffer *prev = rquery->buffer.previous;
-
- /* Release all query buffers. */
- while (prev) {
- struct r600_query_buffer *qbuf = prev;
- prev = prev->previous;
- pipe_resource_reference((struct pipe_resource**)&qbuf->buf, NULL);
- FREE(qbuf);
- }
+ struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+ struct r600_query *rquery = (struct r600_query *)query;
- pipe_resource_reference((struct pipe_resource**)&rquery->buffer.buf, NULL);
- FREE(query);
+ rquery->ops->destroy(rctx, rquery);
}
static boolean r600_begin_query(struct pipe_context *ctx,
@@ -459,48 +648,14 @@ static boolean r600_begin_query(struct pipe_context *ctx,
{
struct r600_common_context *rctx = (struct r600_common_context *)ctx;
struct r600_query *rquery = (struct r600_query *)query;
- struct r600_query_buffer *prev = rquery->buffer.previous;
- if (!r600_query_needs_begin(rquery->type)) {
- assert(0);
- return false;
- }
+ return rquery->ops->begin(rctx, rquery);
+}
- /* Non-GPU queries. */
- switch (rquery->type) {
- case PIPE_QUERY_TIMESTAMP_DISJOINT:
- return true;
- case R600_QUERY_DRAW_CALLS:
- rquery->begin_result = rctx->num_draw_calls;
- return true;
- case R600_QUERY_REQUESTED_VRAM:
- case R600_QUERY_REQUESTED_GTT:
- case R600_QUERY_VRAM_USAGE:
- case R600_QUERY_GTT_USAGE:
- case R600_QUERY_GPU_TEMPERATURE:
- case R600_QUERY_CURRENT_GPU_SCLK:
- case R600_QUERY_CURRENT_GPU_MCLK:
- rquery->begin_result = 0;
- return true;
- case R600_QUERY_BUFFER_WAIT_TIME:
- rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS) / 1000;
- return true;
- case R600_QUERY_NUM_CS_FLUSHES:
- rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_CS_FLUSHES);
- return true;
- case R600_QUERY_NUM_BYTES_MOVED:
- rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_BYTES_MOVED);
- return true;
- case R600_QUERY_GPU_LOAD:
- rquery->begin_result = r600_gpu_load_begin(rctx->screen);
- return true;
- case R600_QUERY_NUM_COMPILATIONS:
- rquery->begin_result = p_atomic_read(&rctx->screen->num_compilations);
- return true;
- case R600_QUERY_NUM_SHADERS_CREATED:
- rquery->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
- return true;
- }
+static void r600_query_hw_reset_buffers(struct r600_common_context *rctx,
+ struct r600_query_hw *query)
+{
+ struct r600_query_buffer *prev = query->buffer.previous;
/* Discard the old query buffers. */
while (prev) {
@@ -510,22 +665,39 @@ static boolean r600_begin_query(struct pipe_context *ctx,
FREE(qbuf);
}
- /* Obtain a new buffer if the current one can't be mapped without a stall. */
- if (r600_rings_is_buffer_referenced(rctx, rquery->buffer.buf->cs_buf, RADEON_USAGE_READWRITE) ||
- !rctx->ws->buffer_wait(rquery->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
- pipe_resource_reference((struct pipe_resource**)&rquery->buffer.buf, NULL);
- rquery->buffer.buf = r600_new_query_buffer(rctx, rquery->type);
+ if (query->flags & R600_QUERY_HW_FLAG_PREDICATE) {
+ /* Obtain a new buffer if the current one can't be mapped without a stall. */
+ if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->cs_buf, RADEON_USAGE_READWRITE) ||
+ !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
+ pipe_resource_reference((struct pipe_resource**)&query->buffer.buf, NULL);
+ query->buffer.buf = r600_new_query_buffer(rctx, query);
+ } else {
+ query->ops->prepare_buffer(rctx, query, query->buffer.buf);
+ }
}
- rquery->buffer.results_end = 0;
- rquery->buffer.previous = NULL;
+ query->buffer.results_end = 0;
+ query->buffer.previous = NULL;
+}
- r600_emit_query_begin(rctx, rquery);
+boolean r600_query_hw_begin(struct r600_common_context *rctx,
+ struct r600_query *rquery)
+{
+ struct r600_query_hw *query = (struct r600_query_hw *)rquery;
- if (r600_is_timer_query(rquery->type))
- LIST_ADDTAIL(&rquery->list, &rctx->active_timer_queries);
+ if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
+ assert(0);
+ return false;
+ }
+
+ r600_query_hw_reset_buffers(rctx, query);
+
+ r600_query_hw_emit_start(rctx, query);
+
+ if (query->flags & R600_QUERY_HW_FLAG_TIMER)
+ LIST_ADDTAIL(&query->list, &rctx->active_timer_queries);
else
- LIST_ADDTAIL(&rquery->list, &rctx->active_nontimer_queries);
+ LIST_ADDTAIL(&query->list, &rctx->active_nontimer_queries);
return true;
}
@@ -534,64 +706,24 @@ static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
struct r600_common_context *rctx = (struct r600_common_context *)ctx;
struct r600_query *rquery = (struct r600_query *)query;
- /* Non-GPU queries. */
- switch (rquery->type) {
- case PIPE_QUERY_TIMESTAMP_DISJOINT:
- return;
- case PIPE_QUERY_GPU_FINISHED:
- ctx->flush(ctx, &rquery->fence, 0);
- return;
- case R600_QUERY_DRAW_CALLS:
- rquery->end_result = rctx->num_draw_calls;
- return;
- case R600_QUERY_REQUESTED_VRAM:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_REQUESTED_VRAM_MEMORY);
- return;
- case R600_QUERY_REQUESTED_GTT:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_REQUESTED_GTT_MEMORY);
- return;
- case R600_QUERY_BUFFER_WAIT_TIME:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS) / 1000;
- return;
- case R600_QUERY_NUM_CS_FLUSHES:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_CS_FLUSHES);
- return;
- case R600_QUERY_NUM_BYTES_MOVED:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_BYTES_MOVED);
- return;
- case R600_QUERY_VRAM_USAGE:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_VRAM_USAGE);
- return;
- case R600_QUERY_GTT_USAGE:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_GTT_USAGE);
- return;
- case R600_QUERY_GPU_TEMPERATURE:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_GPU_TEMPERATURE) / 1000;
- return;
- case R600_QUERY_CURRENT_GPU_SCLK:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_CURRENT_SCLK) * 1000000;
- return;
- case R600_QUERY_CURRENT_GPU_MCLK:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_CURRENT_MCLK) * 1000000;
- return;
- case R600_QUERY_GPU_LOAD:
- rquery->end_result = r600_gpu_load_end(rctx->screen, rquery->begin_result);
- return;
- case R600_QUERY_NUM_COMPILATIONS:
- rquery->end_result = p_atomic_read(&rctx->screen->num_compilations);
- return;
- case R600_QUERY_NUM_SHADERS_CREATED:
- rquery->end_result = p_atomic_read(&rctx->screen->num_shaders_created);
- return;
- }
+ rquery->ops->end(rctx, rquery);
+}
- r600_emit_query_end(rctx, rquery);
+void r600_query_hw_end(struct r600_common_context *rctx,
+ struct r600_query *rquery)
+{
+ struct r600_query_hw *query = (struct r600_query_hw *)rquery;
+
+ if (query->flags & R600_QUERY_HW_FLAG_NO_START)
+ r600_query_hw_reset_buffers(rctx, query);
- if (r600_query_needs_begin(rquery->type))
- LIST_DELINIT(&rquery->list);
+ r600_query_hw_emit_stop(rctx, query);
+
+ if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
+ LIST_DELINIT(&query->list);
}
-static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
+static unsigned r600_query_read_result(void *map, unsigned start_index, unsigned end_index,
bool test_status_bit)
{
uint32_t *current_result = (uint32_t*)map;
@@ -609,80 +741,36 @@ static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned
return 0;
}
-static boolean r600_get_query_buffer_result(struct r600_common_context *ctx,
- struct r600_query *query,
- struct r600_query_buffer *qbuf,
- boolean wait,
- union pipe_query_result *result)
+static void r600_query_hw_add_result(struct r600_common_context *ctx,
+ struct r600_query_hw *query,
+ void *buffer,
+ union pipe_query_result *result)
{
- struct pipe_screen *screen = ctx->b.screen;
- unsigned results_base = 0;
- char *map;
-
- /* Non-GPU queries. */
- switch (query->type) {
- case PIPE_QUERY_TIMESTAMP_DISJOINT:
- /* Convert from cycles per millisecond to cycles per second (Hz). */
- result->timestamp_disjoint.frequency =
- (uint64_t)ctx->screen->info.r600_clock_crystal_freq * 1000;
- result->timestamp_disjoint.disjoint = FALSE;
- return TRUE;
- case PIPE_QUERY_GPU_FINISHED:
- result->b = screen->fence_finish(screen, query->fence,
- wait ? PIPE_TIMEOUT_INFINITE : 0);
- return result->b;
- case R600_QUERY_DRAW_CALLS:
- case R600_QUERY_REQUESTED_VRAM:
- case R600_QUERY_REQUESTED_GTT:
- case R600_QUERY_BUFFER_WAIT_TIME:
- case R600_QUERY_NUM_CS_FLUSHES:
- case R600_QUERY_NUM_BYTES_MOVED:
- case R600_QUERY_VRAM_USAGE:
- case R600_QUERY_GTT_USAGE:
- case R600_QUERY_GPU_TEMPERATURE:
- case R600_QUERY_CURRENT_GPU_SCLK:
- case R600_QUERY_CURRENT_GPU_MCLK:
- case R600_QUERY_NUM_COMPILATIONS:
- case R600_QUERY_NUM_SHADERS_CREATED:
- result->u64 = query->end_result - query->begin_result;
- return TRUE;
- case R600_QUERY_GPU_LOAD:
- result->u64 = query->end_result;
- return TRUE;
- }
-
- map = r600_buffer_map_sync_with_rings(ctx, qbuf->buf,
- PIPE_TRANSFER_READ |
- (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
- if (!map)
- return FALSE;
-
- /* count all results across all data blocks */
- switch (query->type) {
- case PIPE_QUERY_OCCLUSION_COUNTER:
- while (results_base != qbuf->results_end) {
+ switch (query->b.type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER: {
+ unsigned results_base = 0;
+ while (results_base != query->result_size) {
result->u64 +=
- r600_query_read_result(map + results_base, 0, 2, true);
+ r600_query_read_result(buffer + results_base, 0, 2, true);
results_base += 16;
}
break;
- case PIPE_QUERY_OCCLUSION_PREDICATE:
- while (results_base != qbuf->results_end) {
+ }
+ case PIPE_QUERY_OCCLUSION_PREDICATE: {
+ unsigned results_base = 0;
+ while (results_base != query->result_size) {
result->b = result->b ||
- r600_query_read_result(map + results_base, 0, 2, true) != 0;
+ r600_query_read_result(buffer + results_base, 0, 2, true) != 0;
results_base += 16;
}
break;
+ }
case PIPE_QUERY_TIME_ELAPSED:
- while (results_base != qbuf->results_end) {
- result->u64 +=
- r600_query_read_result(map + results_base, 0, 2, false);
- results_base += query->result_size;
- }
+ result->u64 += r600_query_read_result(buffer, 0, 2, false);
break;
case PIPE_QUERY_TIMESTAMP:
{
- uint32_t *current_result = (uint32_t*)map;
+ uint32_t *current_result = (uint32_t*)buffer;
result->u64 = (uint64_t)current_result[0] |
(uint64_t)current_result[1] << 32;
break;
@@ -694,84 +782,64 @@ static boolean r600_get_query_buffer_result(struct r600_common_context *ctx,
* u64 PrimitiveStorageNeeded;
* }
* We only need NumPrimitivesWritten here. */
- while (results_base != qbuf->results_end) {
- result->u64 +=
- r600_query_read_result(map + results_base, 2, 6, true);
- results_base += query->result_size;
- }
+ result->u64 += r600_query_read_result(buffer, 2, 6, true);
break;
case PIPE_QUERY_PRIMITIVES_GENERATED:
/* Here we read PrimitiveStorageNeeded. */
- while (results_base != qbuf->results_end) {
- result->u64 +=
- r600_query_read_result(map + results_base, 0, 4, true);
- results_base += query->result_size;
- }
+ result->u64 += r600_query_read_result(buffer, 0, 4, true);
break;
case PIPE_QUERY_SO_STATISTICS:
- while (results_base != qbuf->results_end) {
- result->so_statistics.num_primitives_written +=
- r600_query_read_result(map + results_base, 2, 6, true);
- result->so_statistics.primitives_storage_needed +=
- r600_query_read_result(map + results_base, 0, 4, true);
- results_base += query->result_size;
- }
+ result->so_statistics.num_primitives_written +=
+ r600_query_read_result(buffer, 2, 6, true);
+ result->so_statistics.primitives_storage_needed +=
+ r600_query_read_result(buffer, 0, 4, true);
break;
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- while (results_base != qbuf->results_end) {
- result->b = result->b ||
- r600_query_read_result(map + results_base, 2, 6, true) !=
- r600_query_read_result(map + results_base, 0, 4, true);
- results_base += query->result_size;
- }
+ result->b = result->b ||
+ r600_query_read_result(buffer, 2, 6, true) !=
+ r600_query_read_result(buffer, 0, 4, true);
break;
case PIPE_QUERY_PIPELINE_STATISTICS:
if (ctx->chip_class >= EVERGREEN) {
- while (results_base != qbuf->results_end) {
- result->pipeline_statistics.ps_invocations +=
- r600_query_read_result(map + results_base, 0, 22, false);
- result->pipeline_statistics.c_primitives +=
- r600_query_read_result(map + results_base, 2, 24, false);
- result->pipeline_statistics.c_invocations +=
- r600_query_read_result(map + results_base, 4, 26, false);
- result->pipeline_statistics.vs_invocations +=
- r600_query_read_result(map + results_base, 6, 28, false);
- result->pipeline_statistics.gs_invocations +=
- r600_query_read_result(map + results_base, 8, 30, false);
- result->pipeline_statistics.gs_primitives +=
- r600_query_read_result(map + results_base, 10, 32, false);
- result->pipeline_statistics.ia_primitives +=
- r600_query_read_result(map + results_base, 12, 34, false);
- result->pipeline_statistics.ia_vertices +=
- r600_query_read_result(map + results_base, 14, 36, false);
- result->pipeline_statistics.hs_invocations +=
- r600_query_read_result(map + results_base, 16, 38, false);
- result->pipeline_statistics.ds_invocations +=
- r600_query_read_result(map + results_base, 18, 40, false);
- result->pipeline_statistics.cs_invocations +=
- r600_query_read_result(map + results_base, 20, 42, false);
- results_base += query->result_size;
- }
+ result->pipeline_statistics.ps_invocations +=
+ r600_query_read_result(buffer, 0, 22, false);
+ result->pipeline_statistics.c_primitives +=
+ r600_query_read_result(buffer, 2, 24, false);
+ result->pipeline_statistics.c_invocations +=
+ r600_query_read_result(buffer, 4, 26, false);
+ result->pipeline_statistics.vs_invocations +=
+ r600_query_read_result(buffer, 6, 28, false);
+ result->pipeline_statistics.gs_invocations +=
+ r600_query_read_result(buffer, 8, 30, false);
+ result->pipeline_statistics.gs_primitives +=
+ r600_query_read_result(buffer, 10, 32, false);
+ result->pipeline_statistics.ia_primitives +=
+ r600_query_read_result(buffer, 12, 34, false);
+ result->pipeline_statistics.ia_vertices +=
+ r600_query_read_result(buffer, 14, 36, false);
+ result->pipeline_statistics.hs_invocations +=
+ r600_query_read_result(buffer, 16, 38, false);
+ result->pipeline_statistics.ds_invocations +=
+ r600_query_read_result(buffer, 18, 40, false);
+ result->pipeline_statistics.cs_invocations +=
+ r600_query_read_result(buffer, 20, 42, false);
} else {
- while (results_base != qbuf->results_end) {
- result->pipeline_statistics.ps_invocations +=
- r600_query_read_result(map + results_base, 0, 16, false);
- result->pipeline_statistics.c_primitives +=
- r600_query_read_result(map + results_base, 2, 18, false);
- result->pipeline_statistics.c_invocations +=
- r600_query_read_result(map + results_base, 4, 20, false);
- result->pipeline_statistics.vs_invocations +=
- r600_query_read_result(map + results_base, 6, 22, false);
- result->pipeline_statistics.gs_invocations +=
- r600_query_read_result(map + results_base, 8, 24, false);
- result->pipeline_statistics.gs_primitives +=
- r600_query_read_result(map + results_base, 10, 26, false);
- result->pipeline_statistics.ia_primitives +=
- r600_query_read_result(map + results_base, 12, 28, false);
- result->pipeline_statistics.ia_vertices +=
- r600_query_read_result(map + results_base, 14, 30, false);
- results_base += query->result_size;
- }
+ result->pipeline_statistics.ps_invocations +=
+ r600_query_read_result(buffer, 0, 16, false);
+ result->pipeline_statistics.c_primitives +=
+ r600_query_read_result(buffer, 2, 18, false);
+ result->pipeline_statistics.c_invocations +=
+ r600_query_read_result(buffer, 4, 20, false);
+ result->pipeline_statistics.vs_invocations +=
+ r600_query_read_result(buffer, 6, 22, false);
+ result->pipeline_statistics.gs_invocations +=
+ r600_query_read_result(buffer, 8, 24, false);
+ result->pipeline_statistics.gs_primitives +=
+ r600_query_read_result(buffer, 10, 26, false);
+ result->pipeline_statistics.ia_primitives +=
+ r600_query_read_result(buffer, 12, 28, false);
+ result->pipeline_statistics.ia_vertices +=
+ r600_query_read_result(buffer, 14, 30, false);
}
#if 0 /* for testing */
printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
@@ -793,23 +861,47 @@ static boolean r600_get_query_buffer_result(struct r600_common_context *ctx,
default:
assert(0);
}
-
- return TRUE;
}
static boolean r600_get_query_result(struct pipe_context *ctx,
- struct pipe_query *query,
- boolean wait, union pipe_query_result *result)
+ struct pipe_query *query, boolean wait,
+ union pipe_query_result *result)
{
struct r600_common_context *rctx = (struct r600_common_context *)ctx;
struct r600_query *rquery = (struct r600_query *)query;
+
+ return rquery->ops->get_result(rctx, rquery, wait, result);
+}
+
+static void r600_query_hw_clear_result(struct r600_query_hw *query,
+ union pipe_query_result *result)
+{
+ util_query_clear_result(result, query->b.type);
+}
+
+boolean r600_query_hw_get_result(struct r600_common_context *rctx,
+ struct r600_query *rquery,
+ boolean wait, union pipe_query_result *result)
+{
+ struct r600_query_hw *query = (struct r600_query_hw *)rquery;
struct r600_query_buffer *qbuf;
- util_query_clear_result(result, rquery->type);
+ query->ops->clear_result(query, result);
- for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous) {
- if (!r600_get_query_buffer_result(rctx, rquery, qbuf, wait, result)) {
+ for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+ unsigned results_base = 0;
+ void *map;
+
+ map = r600_buffer_map_sync_with_rings(rctx, qbuf->buf,
+ PIPE_TRANSFER_READ |
+ (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
+ if (!map)
return FALSE;
+
+ while (results_base != qbuf->results_end) {
+ query->ops->add_result(rctx, query, map + results_base,
+ result);
+ results_base += query->result_size;
}
}
@@ -827,7 +919,7 @@ static void r600_render_condition(struct pipe_context *ctx,
uint mode)
{
struct r600_common_context *rctx = (struct r600_common_context *)ctx;
- struct r600_query *rquery = (struct r600_query*)query;
+ struct r600_query_hw *rquery = (struct r600_query_hw *)query;
struct r600_query_buffer *qbuf;
struct r600_atom *atom = &rctx->render_cond_atom;
@@ -837,8 +929,10 @@ static void r600_render_condition(struct pipe_context *ctx,
/* Compute the size of SET_PREDICATION packets. */
atom->num_dw = 0;
- for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous)
- atom->num_dw += (qbuf->results_end / rquery->result_size) * 5;
+ if (query) {
+ for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous)
+ atom->num_dw += (qbuf->results_end / rquery->result_size) * 5;
+ }
rctx->set_atom_dirty(rctx, atom, query != NULL);
}
@@ -847,10 +941,10 @@ static void r600_suspend_queries(struct r600_common_context *ctx,
struct list_head *query_list,
unsigned *num_cs_dw_queries_suspend)
{
- struct r600_query *query;
+ struct r600_query_hw *query;
LIST_FOR_EACH_ENTRY(query, query_list, list) {
- r600_emit_query_end(ctx, query);
+ r600_query_hw_emit_stop(ctx, query);
}
assert(*num_cs_dw_queries_suspend == 0);
}
@@ -870,19 +964,19 @@ void r600_suspend_timer_queries(struct r600_common_context *ctx)
static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx,
struct list_head *query_list)
{
- struct r600_query *query;
+ struct r600_query_hw *query;
unsigned num_dw = 0;
LIST_FOR_EACH_ENTRY(query, query_list, list) {
/* begin + end */
- num_dw += query->num_cs_dw * 2;
+ num_dw += query->num_cs_dw_begin + query->num_cs_dw_end;
/* Workaround for the fact that
* num_cs_dw_nontimer_queries_suspend is incremented for every
* resumed query, which raises the bar in need_cs_space for
* queries about to be resumed.
*/
- num_dw += query->num_cs_dw;
+ num_dw += query->num_cs_dw_end;
}
/* primitives generated query */
num_dw += ctx->streamout.enable_atom.num_dw;
@@ -896,7 +990,7 @@ static void r600_resume_queries(struct r600_common_context *ctx,
struct list_head *query_list,
unsigned *num_cs_dw_queries_suspend)
{
- struct r600_query *query;
+ struct r600_query_hw *query;
unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, query_list);
assert(*num_cs_dw_queries_suspend == 0);
@@ -905,7 +999,7 @@ static void r600_resume_queries(struct r600_common_context *ctx,
ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, TRUE);
LIST_FOR_EACH_ENTRY(query, query_list, list) {
- r600_emit_query_begin(ctx, query);
+ r600_query_hw_emit_start(ctx, query);
}
}
@@ -1002,6 +1096,76 @@ err:
return;
}
+#define X(name_, query_type_, type_, result_type_) \
+ { \
+ .name = name_, \
+ .query_type = R600_QUERY_##query_type_, \
+ .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
+ .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \
+ .group_id = ~(unsigned)0 \
+ }
+
+static struct pipe_driver_query_info r600_driver_query_list[] = {
+ X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
+ X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
+ X("draw-calls", DRAW_CALLS, UINT64, CUMULATIVE),
+ X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
+ X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
+ X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
+ X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, CUMULATIVE),
+ X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
+ X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
+ X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
+ X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
+ X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
+ X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
+ X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
+};
+
+#undef X
+
+static unsigned r600_get_num_queries(struct r600_common_screen *rscreen)
+{
+ if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42)
+ return Elements(r600_driver_query_list);
+ else if (rscreen->info.drm_major == 3)
+ return Elements(r600_driver_query_list) - 3;
+ else
+ return Elements(r600_driver_query_list) - 4;
+}
+
+static int r600_get_driver_query_info(struct pipe_screen *screen,
+ unsigned index,
+ struct pipe_driver_query_info *info)
+{
+ struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
+ unsigned num_queries = r600_get_num_queries(rscreen);
+
+ if (!info)
+ return num_queries;
+
+ if (index >= num_queries)
+ return 0;
+
+ *info = r600_driver_query_list[index];
+
+ switch (info->query_type) {
+ case R600_QUERY_REQUESTED_VRAM:
+ case R600_QUERY_VRAM_USAGE:
+ info->max_value.u64 = rscreen->info.vram_size;
+ break;
+ case R600_QUERY_REQUESTED_GTT:
+ case R600_QUERY_GTT_USAGE:
+ info->max_value.u64 = rscreen->info.gart_size;
+ break;
+ case R600_QUERY_GPU_TEMPERATURE:
+ info->max_value.u64 = 125;
+ break;
+ }
+
+ return 1;
+}
+
void r600_query_init(struct r600_common_context *rctx)
{
rctx->b.create_query = r600_create_query;
@@ -1017,3 +1181,8 @@ void r600_query_init(struct r600_common_context *rctx)
LIST_INITHEAD(&rctx->active_nontimer_queries);
LIST_INITHEAD(&rctx->active_timer_queries);
}
+
+void r600_init_screen_query_functions(struct r600_common_screen *rscreen)
+{
+ rscreen->b.get_driver_query_info = r600_get_driver_query_info;
+}
diff --git a/src/gallium/drivers/radeon/r600_query.h b/src/gallium/drivers/radeon/r600_query.h
new file mode 100644
index 00000000000..0ea5707ca45
--- /dev/null
+++ b/src/gallium/drivers/radeon/r600_query.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Nicolai Hähnle <[email protected]>
+ *
+ */
+
+#ifndef R600_QUERY_H
+#define R600_QUERY_H
+
+#include "pipe/p_defines.h"
+#include "util/list.h"
+
+struct r600_common_context;
+struct r600_query;
+struct r600_query_hw;
+struct r600_resource;
+
+#define R600_QUERY_DRAW_CALLS (PIPE_QUERY_DRIVER_SPECIFIC + 0)
+#define R600_QUERY_REQUESTED_VRAM (PIPE_QUERY_DRIVER_SPECIFIC + 1)
+#define R600_QUERY_REQUESTED_GTT (PIPE_QUERY_DRIVER_SPECIFIC + 2)
+#define R600_QUERY_BUFFER_WAIT_TIME (PIPE_QUERY_DRIVER_SPECIFIC + 3)
+#define R600_QUERY_NUM_CS_FLUSHES (PIPE_QUERY_DRIVER_SPECIFIC + 4)
+#define R600_QUERY_NUM_BYTES_MOVED (PIPE_QUERY_DRIVER_SPECIFIC + 5)
+#define R600_QUERY_VRAM_USAGE (PIPE_QUERY_DRIVER_SPECIFIC + 6)
+#define R600_QUERY_GTT_USAGE (PIPE_QUERY_DRIVER_SPECIFIC + 7)
+#define R600_QUERY_GPU_TEMPERATURE (PIPE_QUERY_DRIVER_SPECIFIC + 8)
+#define R600_QUERY_CURRENT_GPU_SCLK (PIPE_QUERY_DRIVER_SPECIFIC + 9)
+#define R600_QUERY_CURRENT_GPU_MCLK (PIPE_QUERY_DRIVER_SPECIFIC + 10)
+#define R600_QUERY_GPU_LOAD (PIPE_QUERY_DRIVER_SPECIFIC + 11)
+#define R600_QUERY_NUM_COMPILATIONS (PIPE_QUERY_DRIVER_SPECIFIC + 12)
+#define R600_QUERY_NUM_SHADERS_CREATED (PIPE_QUERY_DRIVER_SPECIFIC + 13)
+#define R600_QUERY_FIRST_PERFCOUNTER (PIPE_QUERY_DRIVER_SPECIFIC + 100)
+
+struct r600_query_ops {
+ void (*destroy)(struct r600_common_context *, struct r600_query *);
+ boolean (*begin)(struct r600_common_context *, struct r600_query *);
+ void (*end)(struct r600_common_context *, struct r600_query *);
+ boolean (*get_result)(struct r600_common_context *,
+ struct r600_query *, boolean wait,
+ union pipe_query_result *result);
+};
+
+struct r600_query {
+ struct r600_query_ops *ops;
+
+ /* The type of query */
+ unsigned type;
+};
+
+enum {
+ R600_QUERY_HW_FLAG_NO_START = (1 << 0),
+ R600_QUERY_HW_FLAG_TIMER = (1 << 1),
+ R600_QUERY_HW_FLAG_PREDICATE = (1 << 2),
+};
+
+struct r600_query_hw_ops {
+ void (*prepare_buffer)(struct r600_common_context *,
+ struct r600_query_hw *,
+ struct r600_resource *);
+ void (*emit_start)(struct r600_common_context *,
+ struct r600_query_hw *,
+ struct r600_resource *buffer, uint64_t va);
+ void (*emit_stop)(struct r600_common_context *,
+ struct r600_query_hw *,
+ struct r600_resource *buffer, uint64_t va);
+ void (*clear_result)(struct r600_query_hw *, union pipe_query_result *);
+ void (*add_result)(struct r600_common_context *ctx,
+ struct r600_query_hw *, void *buffer,
+ union pipe_query_result *result);
+};
+
+struct r600_query_buffer {
+ /* The buffer where query results are stored. */
+ struct r600_resource *buf;
+ /* Offset of the next free result after current query data */
+ unsigned results_end;
+ /* If a query buffer is full, a new buffer is created and the old one
+ * is put in here. When we calculate the result, we sum up the samples
+ * from all buffers. */
+ struct r600_query_buffer *previous;
+};
+
+struct r600_query_hw {
+ struct r600_query b;
+ struct r600_query_hw_ops *ops;
+ unsigned flags;
+
+ /* The query buffer and how many results are in it. */
+ struct r600_query_buffer buffer;
+ /* Size of the result in memory for both begin_query and end_query,
+ * this can be one or two numbers, or it could even be a size of a structure. */
+ unsigned result_size;
+ /* The number of dwords for begin_query or end_query. */
+ unsigned num_cs_dw_begin;
+ unsigned num_cs_dw_end;
+ /* Linked list of queries */
+ struct list_head list;
+ /* For transform feedback: which stream the query is for */
+ unsigned stream;
+};
+
+boolean r600_query_hw_init(struct r600_common_context *rctx,
+ struct r600_query_hw *query);
+void r600_query_hw_destroy(struct r600_common_context *rctx,
+ struct r600_query *rquery);
+boolean r600_query_hw_begin(struct r600_common_context *rctx,
+ struct r600_query *rquery);
+void r600_query_hw_end(struct r600_common_context *rctx,
+ struct r600_query *rquery);
+boolean r600_query_hw_get_result(struct r600_common_context *rctx,
+ struct r600_query *rquery,
+ boolean wait,
+ union pipe_query_result *result);
+
+#endif /* R600_QUERY_H */
diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index 0dac6fbbdce..8a60441c056 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -49,6 +49,7 @@
#define FW_50_1_2 ((50 << 24) | (1 << 16) | (2 << 8))
#define FW_50_10_2 ((50 << 24) | (10 << 16) | (2 << 8))
#define FW_50_17_3 ((50 << 24) | (17 << 16) | (3 << 8))
+#define FW_52_0_3 ((52 << 24) | (0 << 16) | (3 << 8))
/**
* flush commands to the hardware
@@ -405,7 +406,8 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
enc->use_vm = true;
if ((rscreen->info.drm_major > 2) || (rscreen->info.drm_minor >= 42))
enc->use_vui = true;
- if (rscreen->info.family >= CHIP_TONGA)
+ if (rscreen->info.family >= CHIP_TONGA &&
+ rscreen->info.family != CHIP_STONEY)
enc->dual_pipe = true;
/* TODO enable B frame with dual instance */
if ((rscreen->info.family >= CHIP_TONGA) &&
@@ -478,6 +480,10 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
radeon_vce_50_init(enc);
break;
+ case FW_52_0_3:
+ radeon_vce_52_init(enc);
+ break;
+
default:
goto error;
}
@@ -500,11 +506,17 @@ error:
*/
bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen)
{
- return rscreen->info.vce_fw_version == FW_40_2_2 ||
- rscreen->info.vce_fw_version == FW_50_0_1 ||
- rscreen->info.vce_fw_version == FW_50_1_2 ||
- rscreen->info.vce_fw_version == FW_50_10_2 ||
- rscreen->info.vce_fw_version == FW_50_17_3;
+ switch (rscreen->info.vce_fw_version) {
+ case FW_40_2_2:
+ case FW_50_0_1:
+ case FW_50_1_2:
+ case FW_50_10_2:
+ case FW_50_17_3:
+ case FW_52_0_3:
+ return true;
+ default:
+ return false;
+ }
}
/**
diff --git a/src/gallium/drivers/radeon/radeon_vce.h b/src/gallium/drivers/radeon/radeon_vce.h
index 624bda479f8..25e2133521f 100644
--- a/src/gallium/drivers/radeon/radeon_vce.h
+++ b/src/gallium/drivers/radeon/radeon_vce.h
@@ -140,4 +140,7 @@ void radeon_vce_40_2_2_init(struct rvce_encoder *enc);
/* init vce fw 50 specific callbacks */
void radeon_vce_50_init(struct rvce_encoder *enc);
+/* init vce fw 52 specific callbacks */
+void radeon_vce_52_init(struct rvce_encoder *enc);
+
#endif
diff --git a/src/gallium/drivers/radeon/radeon_vce_52.c b/src/gallium/drivers/radeon/radeon_vce_52.c
new file mode 100644
index 00000000000..fbae1f97f41
--- /dev/null
+++ b/src/gallium/drivers/radeon/radeon_vce_52.c
@@ -0,0 +1,242 @@
+/**************************************************************************
+ *
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <stdio.h>
+
+#include "pipe/p_video_codec.h"
+
+#include "util/u_video.h"
+#include "util/u_memory.h"
+
+#include "vl/vl_video_buffer.h"
+
+#include "r600_pipe_common.h"
+#include "radeon_video.h"
+#include "radeon_vce.h"
+
+static const unsigned profiles[7] = { 66, 77, 88, 100, 110, 122, 244 };
+
+static void create(struct rvce_encoder *enc)
+{
+ enc->task_info(enc, 0x00000000, 0, 0, 0);
+
+ RVCE_BEGIN(0x01000001); // create cmd
+ RVCE_CS(0x00000000); // encUseCircularBuffer
+ RVCE_CS(profiles[enc->base.profile -
+ PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE]); // encProfile
+ RVCE_CS(enc->base.level); // encLevel
+ RVCE_CS(0x00000000); // encPicStructRestriction
+ RVCE_CS(enc->base.width); // encImageWidth
+ RVCE_CS(enc->base.height); // encImageHeight
+ RVCE_CS(enc->luma->level[0].pitch_bytes); // encRefPicLumaPitch
+ RVCE_CS(enc->chroma->level[0].pitch_bytes); // encRefPicChromaPitch
+ RVCE_CS(align(enc->luma->npix_y, 16) / 8); // encRefYHeightInQw
+ RVCE_CS(0x00000000); // encRefPic(Addr|Array)Mode, encPicStructRestriction, disableRDO
+
+ RVCE_CS(0x00000000); // encPreEncodeContextBufferOffset
+ RVCE_CS(0x00000000); // encPreEncodeInputLumaBufferOffset
+ RVCE_CS(0x00000000); // encPreEncodeInputChromaBufferOffs
+ RVCE_CS(0x00000000); // encPreEncodeMode|ChromaFlag|VBAQMode|SceneChangeSensitivity
+ RVCE_END();
+}
+
+static void encode(struct rvce_encoder *enc)
+{
+ signed luma_offset, chroma_offset, bs_offset;
+ unsigned dep, bs_idx = enc->bs_idx++;
+ int i;
+
+ if (enc->dual_inst) {
+ if (bs_idx == 0)
+ dep = 1;
+ else if (enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR)
+ dep = 0;
+ else
+ dep = 2;
+ } else
+ dep = 0;
+
+ enc->task_info(enc, 0x00000003, dep, 0, bs_idx);
+
+ RVCE_BEGIN(0x05000001); // context buffer
+ RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo
+ RVCE_END();
+
+ bs_offset = -(signed)(bs_idx * enc->bs_size);
+
+ RVCE_BEGIN(0x05000004); // video bitstream buffer
+ RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT, bs_offset); // videoBitstreamRingAddressHi/Lo
+ RVCE_CS(enc->bs_size); // videoBitstreamRingSize
+ RVCE_END();
+
+ if (enc->dual_pipe) {
+ unsigned aux_offset = enc->cpb.res->buf->size -
+ RVCE_MAX_AUX_BUFFER_NUM * RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE * 2;
+ RVCE_BEGIN(0x05000002); // auxiliary buffer
+ for (i = 0; i < 8; ++i) {
+ RVCE_CS(aux_offset);
+ aux_offset += RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE;
+ }
+ for (i = 0; i < 8; ++i)
+ RVCE_CS(RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE);
+ RVCE_END();
+ }
+
+ RVCE_BEGIN(0x03000001); // encode
+ RVCE_CS(enc->pic.frame_num ? 0x0 : 0x11); // insertHeaders
+ RVCE_CS(0x00000000); // pictureStructure
+ RVCE_CS(enc->bs_size); // allowedMaxBitstreamSize
+ RVCE_CS(0x00000000); // forceRefreshMap
+ RVCE_CS(0x00000000); // insertAUD
+ RVCE_CS(0x00000000); // endOfSequence
+ RVCE_CS(0x00000000); // endOfStream
+ RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+ enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo
+ RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+ enc->chroma->level[0].offset); // inputPictureChromaAddressHi/Lo
+ RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch
+ RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch
+ RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch
+ if (enc->dual_pipe)
+ RVCE_CS(0x00000000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading)
+ else
+ RVCE_CS(0x00010000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading)
+ RVCE_CS(0x00000000); // encInputPicTileConfig
+ RVCE_CS(enc->pic.picture_type); // encPicType
+ RVCE_CS(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR); // encIdrFlag
+ RVCE_CS(0x00000000); // encIdrPicId
+ RVCE_CS(0x00000000); // encMGSKeyPic
+ RVCE_CS(!enc->pic.not_referenced); // encReferenceFlag
+ RVCE_CS(0x00000000); // encTemporalLayerIndex
+ RVCE_CS(0x00000000); // num_ref_idx_active_override_flag
+ RVCE_CS(0x00000000); // num_ref_idx_l0_active_minus1
+ RVCE_CS(0x00000000); // num_ref_idx_l1_active_minus1
+
+ i = enc->pic.frame_num - enc->pic.ref_idx_l0;
+ if (i > 1 && enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P) {
+ RVCE_CS(0x00000001); // encRefListModificationOp
+ RVCE_CS(i - 1); // encRefListModificationNum
+ } else {
+ RVCE_CS(0x00000000); // encRefListModificationOp
+ RVCE_CS(0x00000000); // encRefListModificationNum
+ }
+
+ for (i = 0; i < 3; ++i) {
+ RVCE_CS(0x00000000); // encRefListModificationOp
+ RVCE_CS(0x00000000); // encRefListModificationNum
+ }
+ for (i = 0; i < 4; ++i) {
+ RVCE_CS(0x00000000); // encDecodedPictureMarkingOp
+ RVCE_CS(0x00000000); // encDecodedPictureMarkingNum
+ RVCE_CS(0x00000000); // encDecodedPictureMarkingIdx
+ RVCE_CS(0x00000000); // encDecodedRefBasePictureMarkingOp
+ RVCE_CS(0x00000000); // encDecodedRefBasePictureMarkingNum
+ }
+
+ // encReferencePictureL0[0]
+ RVCE_CS(0x00000000); // pictureStructure
+ if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P ||
+ enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) {
+ struct rvce_cpb_slot *l0 = l0_slot(enc);
+ rvce_frame_offset(enc, l0, &luma_offset, &chroma_offset);
+ RVCE_CS(l0->picture_type); // encPicType
+ RVCE_CS(l0->frame_num); // frameNumber
+ RVCE_CS(l0->pic_order_cnt); // pictureOrderCount
+ RVCE_CS(luma_offset); // lumaOffset
+ RVCE_CS(chroma_offset); // chromaOffset
+ } else {
+ RVCE_CS(0x00000000); // encPicType
+ RVCE_CS(0x00000000); // frameNumber
+ RVCE_CS(0x00000000); // pictureOrderCount
+ RVCE_CS(0xffffffff); // lumaOffset
+ RVCE_CS(0xffffffff); // chromaOffset
+ }
+
+ // encReferencePictureL0[1]
+ RVCE_CS(0x00000000); // pictureStructure
+ RVCE_CS(0x00000000); // encPicType
+ RVCE_CS(0x00000000); // frameNumber
+ RVCE_CS(0x00000000); // pictureOrderCount
+ RVCE_CS(0xffffffff); // lumaOffset
+ RVCE_CS(0xffffffff); // chromaOffset
+
+ // encReferencePictureL1[0]
+ RVCE_CS(0x00000000); // pictureStructure
+ if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) {
+ struct rvce_cpb_slot *l1 = l1_slot(enc);
+ rvce_frame_offset(enc, l1, &luma_offset, &chroma_offset);
+ RVCE_CS(l1->picture_type); // encPicType
+ RVCE_CS(l1->frame_num); // frameNumber
+ RVCE_CS(l1->pic_order_cnt); // pictureOrderCount
+ RVCE_CS(luma_offset); // lumaOffset
+ RVCE_CS(chroma_offset); // chromaOffset
+ } else {
+ RVCE_CS(0x00000000); // encPicType
+ RVCE_CS(0x00000000); // frameNumber
+ RVCE_CS(0x00000000); // pictureOrderCount
+ RVCE_CS(0xffffffff); // lumaOffset
+ RVCE_CS(0xffffffff); // chromaOffset
+ }
+
+ rvce_frame_offset(enc, current_slot(enc), &luma_offset, &chroma_offset);
+ RVCE_CS(luma_offset); // encReconstructedLumaOffset
+ RVCE_CS(chroma_offset); // encReconstructedChromaOffset
+ RVCE_CS(0x00000000); // encColocBufferOffset
+ RVCE_CS(0x00000000); // encReconstructedRefBasePictureLumaOffset
+ RVCE_CS(0x00000000); // encReconstructedRefBasePictureChromaOffset
+ RVCE_CS(0x00000000); // encReferenceRefBasePictureLumaOffset
+ RVCE_CS(0x00000000); // encReferenceRefBasePictureChromaOffset
+ RVCE_CS(0x00000000); // pictureCount
+ RVCE_CS(enc->pic.frame_num); // frameNumber
+ RVCE_CS(enc->pic.pic_order_cnt); // pictureOrderCount
+ RVCE_CS(0x00000000); // numIPicRemainInRCGOP
+ RVCE_CS(0x00000000); // numPPicRemainInRCGOP
+ RVCE_CS(0x00000000); // numBPicRemainInRCGOP
+ RVCE_CS(0x00000000); // numIRPicRemainInRCGOP
+ RVCE_CS(0x00000000); // enableIntraRefresh
+
+ RVCE_CS(0x00000000); // aq_variance_en
+ RVCE_CS(0x00000000); // aq_block_size
+ RVCE_CS(0x00000000); // aq_mb_variance_sel
+ RVCE_CS(0x00000000); // aq_frame_variance_sel
+ RVCE_CS(0x00000000); // aq_param_a
+ RVCE_CS(0x00000000); // aq_param_b
+ RVCE_CS(0x00000000); // aq_param_c
+ RVCE_CS(0x00000000); // aq_param_d
+ RVCE_CS(0x00000000); // aq_param_e
+
+ RVCE_CS(0x00000000); // contextInSFB
+ RVCE_END();
+}
+
+void radeon_vce_52_init(struct rvce_encoder *enc)
+{
+ radeon_vce_50_init(enc);
+
+ enc->create = create;
+ enc->encode = encode;
+}
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 93847d5ec2f..209b940aa11 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3256,25 +3256,34 @@ si_write_harvested_raster_configs(struct si_context *sctx,
}
}
- /* GRBM_GFX_INDEX is privileged on VI */
- if (sctx->b.chip_class <= CIK)
+ /* GRBM_GFX_INDEX has a different offset on SI and CI+ */
+ if (sctx->b.chip_class < CIK)
si_pm4_set_reg(pm4, GRBM_GFX_INDEX,
SE_INDEX(se) | SH_BROADCAST_WRITES |
INSTANCE_BROADCAST_WRITES);
+ else
+ si_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX,
+ S_030800_SE_INDEX(se) | S_030800_SH_BROADCAST_WRITES(1) |
+ S_030800_INSTANCE_BROADCAST_WRITES(1));
si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se);
if (sctx->b.chip_class >= CIK)
si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
}
- /* GRBM_GFX_INDEX is privileged on VI */
- if (sctx->b.chip_class <= CIK)
+ /* GRBM_GFX_INDEX has a different offset on SI and CI+ */
+ if (sctx->b.chip_class < CIK)
si_pm4_set_reg(pm4, GRBM_GFX_INDEX,
SE_BROADCAST_WRITES | SH_BROADCAST_WRITES |
INSTANCE_BROADCAST_WRITES);
+ else
+ si_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX,
+ S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
+ S_030800_INSTANCE_BROADCAST_WRITES(1));
}
static void si_init_config(struct si_context *sctx)
{
+ struct si_screen *sscreen = sctx->screen;
unsigned num_rb = MIN2(sctx->screen->b.info.r600_num_backends, 16);
unsigned rb_mask = sctx->screen->b.info.si_backend_enabled_mask;
unsigned raster_config, raster_config_1;
@@ -3345,9 +3354,14 @@ static void si_init_config(struct si_context *sctx)
raster_config_1 = 0x0000002e;
break;
case CHIP_FIJI:
- /* Fiji should be same as Hawaii, but that causes corruption in some cases */
- raster_config = 0x16000012; /* 0x3a00161a */
- raster_config_1 = 0x0000002a; /* 0x0000002e */
+ if (sscreen->b.info.cik_macrotile_mode_array[0] == 0x000000e8) {
+ /* old kernels with old tiling config */
+ raster_config = 0x16000012;
+ raster_config_1 = 0x0000002a;
+ } else {
+ raster_config = 0x3a00161a;
+ raster_config_1 = 0x0000002e;
+ }
break;
case CHIP_TONGA:
raster_config = 0x16000012;
diff --git a/src/gallium/drivers/softpipe/Automake.inc b/src/gallium/drivers/softpipe/Automake.inc
index 6455f3caa3d..bd3c2eead16 100644
--- a/src/gallium/drivers/softpipe/Automake.inc
+++ b/src/gallium/drivers/softpipe/Automake.inc
@@ -3,13 +3,10 @@ if HAVE_GALLIUM_SOFTPIPE
TARGET_DRIVERS += swrast
TARGET_CPPFLAGS += -DGALLIUM_SOFTPIPE
TARGET_LIB_DEPS += \
- $(top_builddir)/src/gallium/winsys/sw/dri/libswdri.la \
$(top_builddir)/src/gallium/drivers/softpipe/libsoftpipe.la
-if HAVE_DRI2
+if HAVE_DRISW_KMS
TARGET_DRIVERS += kms_swrast
-TARGET_LIB_DEPS += \
- $(top_builddir)/src/gallium/winsys/sw/kms-dri/libswkmsdri.la
endif
endif
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index bcce18a3502..6a4f9d8d076 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -51,14 +51,16 @@
#define SVGA_QUERY_NUM_VALIDATIONS (PIPE_QUERY_DRIVER_SPECIFIC + 3)
#define SVGA_QUERY_MAP_BUFFER_TIME (PIPE_QUERY_DRIVER_SPECIFIC + 4)
#define SVGA_QUERY_NUM_RESOURCES_MAPPED (PIPE_QUERY_DRIVER_SPECIFIC + 5)
+#define SVGA_QUERY_NUM_BYTES_UPLOADED (PIPE_QUERY_DRIVER_SPECIFIC + 6)
+
/* running total counters */
-#define SVGA_QUERY_MEMORY_USED (PIPE_QUERY_DRIVER_SPECIFIC + 6)
-#define SVGA_QUERY_NUM_SHADERS (PIPE_QUERY_DRIVER_SPECIFIC + 7)
-#define SVGA_QUERY_NUM_RESOURCES (PIPE_QUERY_DRIVER_SPECIFIC + 8)
-#define SVGA_QUERY_NUM_STATE_OBJECTS (PIPE_QUERY_DRIVER_SPECIFIC + 9)
-#define SVGA_QUERY_NUM_SURFACE_VIEWS (PIPE_QUERY_DRIVER_SPECIFIC + 10)
+#define SVGA_QUERY_MEMORY_USED (PIPE_QUERY_DRIVER_SPECIFIC + 7)
+#define SVGA_QUERY_NUM_SHADERS (PIPE_QUERY_DRIVER_SPECIFIC + 8)
+#define SVGA_QUERY_NUM_RESOURCES (PIPE_QUERY_DRIVER_SPECIFIC + 9)
+#define SVGA_QUERY_NUM_STATE_OBJECTS (PIPE_QUERY_DRIVER_SPECIFIC + 10)
+#define SVGA_QUERY_NUM_SURFACE_VIEWS (PIPE_QUERY_DRIVER_SPECIFIC + 11)
/*SVGA_QUERY_MAX has to be last because it is size of an array*/
-#define SVGA_QUERY_MAX (PIPE_QUERY_DRIVER_SPECIFIC + 11)
+#define SVGA_QUERY_MAX (PIPE_QUERY_DRIVER_SPECIFIC + 12)
/**
* Maximum supported number of constant buffers per shader
@@ -485,6 +487,7 @@ struct svga_context
uint64_t num_shaders; /**< SVGA_QUERY_NUM_SHADERS */
uint64_t num_state_objects; /**< SVGA_QUERY_NUM_STATE_OBJECTS */
uint64_t num_surface_views; /**< SVGA_QUERY_NUM_SURFACE_VIEWS */
+ uint64_t num_bytes_uploaded; /**< SVGA_QUERY_NUM_BYTES_UPLOADED */
} hud;
/** The currently bound stream output targets */
diff --git a/src/gallium/drivers/svga/svga_format.c b/src/gallium/drivers/svga/svga_format.c
index 28b8064bf70..2b549dfa5bb 100644
--- a/src/gallium/drivers/svga/svga_format.c
+++ b/src/gallium/drivers/svga/svga_format.c
@@ -53,17 +53,17 @@ static const struct vgpu10_format_entry format_conversion_table[] =
{ PIPE_FORMAT_A8R8G8B8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_X8R8G8B8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_B5G5R5A1_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_B5G5R5A1_UNORM, 0 },
- { PIPE_FORMAT_B4G4R4A4_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_A4R4G4B4, 0 },
+ { PIPE_FORMAT_B4G4R4A4_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_B5G6R5_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_B5G6R5_UNORM, 0 },
{ PIPE_FORMAT_R10G10B10A2_UNORM, SVGA3D_R10G10B10A2_UNORM, SVGA3D_R10G10B10A2_UNORM, 0 },
- { PIPE_FORMAT_L8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_LUMINANCE8, 0 },
+ { PIPE_FORMAT_L8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_A8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_A8_UNORM, 0 },
{ PIPE_FORMAT_I8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_L8A8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_L16_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_UYVY, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_YUYV, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
- { PIPE_FORMAT_Z16_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_Z_D16, 0 },
+ { PIPE_FORMAT_Z16_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_D16_UNORM, 0 },
{ PIPE_FORMAT_Z32_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_Z32_FLOAT, SVGA3D_FORMAT_INVALID, SVGA3D_D32_FLOAT, 0 },
{ PIPE_FORMAT_Z24_UNORM_S8_UINT, SVGA3D_FORMAT_INVALID, SVGA3D_D24_UNORM_S8_UINT, 0 },
@@ -152,14 +152,14 @@ static const struct vgpu10_format_entry format_conversion_table[] =
{ PIPE_FORMAT_A8R8G8B8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_X8R8G8B8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_R8G8B8A8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_R8G8B8A8_UNORM_SRGB, 0 },
- { PIPE_FORMAT_DXT1_RGB, SVGA3D_FORMAT_INVALID, SVGA3D_DXT1, 0 },
- { PIPE_FORMAT_DXT1_RGBA, SVGA3D_FORMAT_INVALID, SVGA3D_DXT1, 0 },
- { PIPE_FORMAT_DXT3_RGBA, SVGA3D_FORMAT_INVALID, SVGA3D_DXT3, 0 },
- { PIPE_FORMAT_DXT5_RGBA, SVGA3D_FORMAT_INVALID, SVGA3D_DXT5, 0 },
- { PIPE_FORMAT_DXT1_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_DXT1, 0 },
- { PIPE_FORMAT_DXT1_SRGBA, SVGA3D_FORMAT_INVALID, SVGA3D_DXT1, 0 },
- { PIPE_FORMAT_DXT3_SRGBA, SVGA3D_FORMAT_INVALID, SVGA3D_DXT3, 0 },
- { PIPE_FORMAT_DXT5_SRGBA, SVGA3D_FORMAT_INVALID, SVGA3D_DXT5, 0 },
+ { PIPE_FORMAT_DXT1_RGB, SVGA3D_FORMAT_INVALID, SVGA3D_BC1_UNORM, 0 },
+ { PIPE_FORMAT_DXT1_RGBA, SVGA3D_FORMAT_INVALID, SVGA3D_BC1_UNORM, 0 },
+ { PIPE_FORMAT_DXT3_RGBA, SVGA3D_FORMAT_INVALID, SVGA3D_BC2_UNORM, 0 },
+ { PIPE_FORMAT_DXT5_RGBA, SVGA3D_FORMAT_INVALID, SVGA3D_BC3_UNORM, 0 },
+ { PIPE_FORMAT_DXT1_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_BC1_UNORM_SRGB, 0 },
+ { PIPE_FORMAT_DXT1_SRGBA, SVGA3D_FORMAT_INVALID, SVGA3D_BC1_UNORM_SRGB, 0 },
+ { PIPE_FORMAT_DXT3_SRGBA, SVGA3D_FORMAT_INVALID, SVGA3D_BC2_UNORM_SRGB, 0 },
+ { PIPE_FORMAT_DXT5_SRGBA, SVGA3D_FORMAT_INVALID, SVGA3D_BC3_UNORM_SRGB, 0 },
{ PIPE_FORMAT_RGTC1_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_BC4_UNORM, 0 },
{ PIPE_FORMAT_RGTC1_SNORM, SVGA3D_FORMAT_INVALID, SVGA3D_BC4_SNORM, 0 },
{ PIPE_FORMAT_RGTC2_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_BC5_UNORM, 0 },
@@ -326,6 +326,34 @@ static const struct vgpu10_format_entry format_conversion_table[] =
{ PIPE_FORMAT_ETC2_R11_SNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_ETC2_RG11_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_ETC2_RG11_SNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_4x4, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_5x4, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_5x5, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_6x5, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_6x6, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_8x5, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_8x6, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_8x8, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_10x5, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_10x6, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_10x8, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_10x10, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_12x10, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_12x12, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_4x4_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_5x4_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_5x5_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_6x5_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_6x6_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_8x5_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_8x6_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_8x8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_10x5_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_10x6_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_10x8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_10x10_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_12x10_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ { PIPE_FORMAT_ASTC_12x12_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
};
@@ -472,7 +500,7 @@ struct format_cap {
* PIPE_FORMAT_Z24_UNORM_S8_UINT is converted to SVGA3D_D24_UNORM_S8_UINT
* for rendering but converted to SVGA3D_R24_UNORM_X8_TYPELESS for sampling.
* If we want to query if a format supports both rendering and sampling the
- * host will tell us no for both SVGA3D_D24_UNORM_S8_UINT and
+ * host will tell us no for SVGA3D_D24_UNORM_S8_UINT, SVGA3D_D16_UNORM and
* SVGA3D_R24_UNORM_X8_TYPELESS. So we override the host query for those
* formats and report that both can do rendering and sampling.
*/
@@ -1410,27 +1438,50 @@ static const struct format_cap format_cap_table[] = {
},
{
"SVGA3D_BC1_TYPELESS",
- SVGA3D_BC1_TYPELESS, 0, 0, 0, 0, 0
+ SVGA3D_BC1_TYPELESS,
+ SVGA3D_DEVCAP_DXFMT_BC1_TYPELESS,
+ 4, 4, 8,
+ SVGA3DFORMAT_OP_TEXTURE |
+ SVGA3DFORMAT_OP_CUBETEXTURE
},
{
"SVGA3D_BC1_UNORM_SRGB",
- SVGA3D_BC1_UNORM_SRGB, 0, 0, 0, 0, 0
+ SVGA3D_BC1_UNORM_SRGB,
+ SVGA3D_DEVCAP_DXFMT_BC1_UNORM_SRGB,
+ 4, 4, 8,
+ SVGA3DFORMAT_OP_TEXTURE |
+ SVGA3DFORMAT_OP_CUBETEXTURE
},
{
"SVGA3D_BC2_TYPELESS",
- SVGA3D_BC2_TYPELESS, 0, 0, 0, 0, 0
+ SVGA3D_BC2_TYPELESS,
+ SVGA3D_DEVCAP_DXFMT_BC2_TYPELESS,
+ 4, 4, 16,
+ SVGA3DFORMAT_OP_TEXTURE |
+ SVGA3DFORMAT_OP_CUBETEXTURE
},
{
"SVGA3D_BC2_UNORM_SRGB",
- SVGA3D_BC2_UNORM_SRGB, 0, 0, 0, 0, 0
+ SVGA3D_BC2_UNORM_SRGB,
+ SVGA3D_DEVCAP_DXFMT_BC2_UNORM_SRGB,
+ 4, 4, 16,
+ SVGA3DFORMAT_OP_TEXTURE |
+ SVGA3DFORMAT_OP_CUBETEXTURE
},
{
"SVGA3D_BC3_TYPELESS",
- SVGA3D_BC3_TYPELESS, 0, 0, 0, 0, 0
+ SVGA3D_BC3_TYPELESS,
+ SVGA3D_DEVCAP_DXFMT_BC3_TYPELESS,
+ 4, 4, 16,
+ SVGA3DFORMAT_OP_TEXTURE |
+ SVGA3DFORMAT_OP_CUBETEXTURE
},
{
"SVGA3D_BC3_UNORM_SRGB",
- SVGA3D_BC3_UNORM_SRGB, 0, 0, 0, 0, 0
+ SVGA3D_BC3_UNORM_SRGB,
+ 4, 4, 16,
+ SVGA3DFORMAT_OP_TEXTURE |
+ SVGA3DFORMAT_OP_CUBETEXTURE
},
{
"SVGA3D_BC4_TYPELESS",
@@ -1671,7 +1722,7 @@ static const struct format_cap format_cap_table[] = {
{
"SVGA3D_D16_UNORM",
SVGA3D_D16_UNORM,
- SVGA3D_DEVCAP_DXFMT_D16_UNORM,
+ 0, /*SVGA3D_DEVCAP_DXFMT_D16_UNORM*/
1, 1, 2,
SVGA3DFORMAT_OP_TEXTURE |
SVGA3DFORMAT_OP_CUBETEXTURE |
@@ -1690,15 +1741,27 @@ static const struct format_cap format_cap_table[] = {
},
{
"SVGA3D_BC1_UNORM",
- SVGA3D_BC1_UNORM, 0, 0, 0, 0, 0
+ SVGA3D_BC1_UNORM,
+ SVGA3D_DEVCAP_DXFMT_BC1_UNORM,
+ 4, 4, 8,
+ SVGA3DFORMAT_OP_TEXTURE |
+ SVGA3DFORMAT_OP_CUBETEXTURE
},
{
"SVGA3D_BC2_UNORM",
- SVGA3D_BC2_UNORM, 0, 0, 0, 0, 0
+ SVGA3D_BC2_UNORM,
+ SVGA3D_DEVCAP_DXFMT_BC2_UNORM,
+ 4, 4, 16,
+ SVGA3DFORMAT_OP_TEXTURE |
+ SVGA3DFORMAT_OP_CUBETEXTURE
},
{
"SVGA3D_BC3_UNORM",
- SVGA3D_BC3_UNORM, 0, 0, 0, 0, 0
+ SVGA3D_BC3_UNORM,
+ SVGA3D_DEVCAP_DXFMT_BC3_UNORM,
+ 4, 4, 16,
+ SVGA3DFORMAT_OP_TEXTURE |
+ SVGA3DFORMAT_OP_CUBETEXTURE
},
{
"SVGA3D_B5G6R5_UNORM",
@@ -2053,6 +2116,7 @@ svga_typeless_format(SVGA3dSurfaceFormat format)
case SVGA3D_R8G8_UINT:
case SVGA3D_R8G8_SINT:
return SVGA3D_R8G8_TYPELESS;
+ case SVGA3D_D16_UNORM:
case SVGA3D_R16_UNORM:
case SVGA3D_R16_UINT:
case SVGA3D_R16_SNORM:
@@ -2070,6 +2134,15 @@ svga_typeless_format(SVGA3dSurfaceFormat format)
case SVGA3D_B8G8R8X8_UNORM_SRGB:
case SVGA3D_B8G8R8X8_UNORM:
return SVGA3D_B8G8R8X8_TYPELESS;
+ case SVGA3D_BC1_UNORM:
+ case SVGA3D_BC1_UNORM_SRGB:
+ return SVGA3D_BC1_TYPELESS;
+ case SVGA3D_BC2_UNORM:
+ case SVGA3D_BC2_UNORM_SRGB:
+ return SVGA3D_BC2_TYPELESS;
+ case SVGA3D_BC3_UNORM:
+ case SVGA3D_BC3_UNORM_SRGB:
+ return SVGA3D_BC3_TYPELESS;
case SVGA3D_BC4_UNORM:
case SVGA3D_BC4_SNORM:
return SVGA3D_BC4_TYPELESS;
@@ -2079,18 +2152,10 @@ svga_typeless_format(SVGA3dSurfaceFormat format)
/* Special cases (no corresponding _TYPELESS formats) */
case SVGA3D_A8_UNORM:
- case SVGA3D_A4R4G4B4:
case SVGA3D_B5G5R5A1_UNORM:
case SVGA3D_B5G6R5_UNORM:
- case SVGA3D_DXT1:
- case SVGA3D_DXT2:
- case SVGA3D_DXT3:
- case SVGA3D_DXT4:
- case SVGA3D_DXT5:
case SVGA3D_R11G11B10_FLOAT:
case SVGA3D_R9G9B9E5_SHAREDEXP:
- case SVGA3D_Z_D32:
- case SVGA3D_Z_D16:
return format;
default:
debug_printf("Unexpected format %s in %s\n",
@@ -2098,3 +2163,26 @@ svga_typeless_format(SVGA3dSurfaceFormat format)
return format;
}
}
+
+
+/**
+ * Given a surface format, return the corresponding format to use for
+ * a texture sampler. In most cases, it's the format unchanged, but there
+ * are some special cases.
+ */
+SVGA3dSurfaceFormat
+svga_sampler_format(SVGA3dSurfaceFormat format)
+{
+ switch (format) {
+ case SVGA3D_D16_UNORM:
+ return SVGA3D_R16_UNORM;
+ case SVGA3D_D24_UNORM_S8_UINT:
+ return SVGA3D_R24_UNORM_X8_TYPELESS;
+ case SVGA3D_D32_FLOAT:
+ return SVGA3D_R32_FLOAT;
+ case SVGA3D_D32_FLOAT_S8X24_UINT:
+ return SVGA3D_R32_FLOAT_X8X24_TYPELESS;
+ default:
+ return format;
+ }
+}
diff --git a/src/gallium/drivers/svga/svga_format.h b/src/gallium/drivers/svga/svga_format.h
index 0af218cb01a..9f9a530d473 100644
--- a/src/gallium/drivers/svga/svga_format.h
+++ b/src/gallium/drivers/svga/svga_format.h
@@ -93,4 +93,8 @@ SVGA3dSurfaceFormat
svga_typeless_format(SVGA3dSurfaceFormat format);
+SVGA3dSurfaceFormat
+svga_sampler_format(SVGA3dSurfaceFormat format);
+
+
#endif /* SVGA_FORMAT_H_ */
diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c
index 8b9818334ca..5416a009dcb 100644
--- a/src/gallium/drivers/svga/svga_pipe_query.c
+++ b/src/gallium/drivers/svga/svga_pipe_query.c
@@ -731,6 +731,7 @@ svga_create_query(struct pipe_context *pipe,
case SVGA_QUERY_MAP_BUFFER_TIME:
case SVGA_QUERY_NUM_SURFACE_VIEWS:
case SVGA_QUERY_NUM_RESOURCES_MAPPED:
+ case SVGA_QUERY_NUM_BYTES_UPLOADED:
break;
default:
assert(!"unexpected query type in svga_create_query()");
@@ -797,6 +798,7 @@ svga_destroy_query(struct pipe_context *pipe, struct pipe_query *q)
case SVGA_QUERY_MAP_BUFFER_TIME:
case SVGA_QUERY_NUM_SURFACE_VIEWS:
case SVGA_QUERY_NUM_RESOURCES_MAPPED:
+ case SVGA_QUERY_NUM_BYTES_UPLOADED:
/* nothing */
break;
default:
@@ -876,6 +878,9 @@ svga_begin_query(struct pipe_context *pipe, struct pipe_query *q)
case SVGA_QUERY_NUM_RESOURCES_MAPPED:
sq->begin_count = svga->hud.num_resources_mapped;
break;
+ case SVGA_QUERY_NUM_BYTES_UPLOADED:
+ sq->begin_count = svga->hud.num_bytes_uploaded;
+ break;
case SVGA_QUERY_MEMORY_USED:
case SVGA_QUERY_NUM_SHADERS:
case SVGA_QUERY_NUM_RESOURCES:
@@ -966,6 +971,9 @@ svga_end_query(struct pipe_context *pipe, struct pipe_query *q)
case SVGA_QUERY_NUM_RESOURCES_MAPPED:
sq->end_count = svga->hud.num_resources_mapped;
break;
+ case SVGA_QUERY_NUM_BYTES_UPLOADED:
+ sq->end_count = svga->hud.num_bytes_uploaded;
+ break;
case SVGA_QUERY_MEMORY_USED:
case SVGA_QUERY_NUM_SHADERS:
case SVGA_QUERY_NUM_RESOURCES:
@@ -1061,6 +1069,7 @@ svga_get_query_result(struct pipe_context *pipe,
case SVGA_QUERY_NUM_FLUSHES:
case SVGA_QUERY_NUM_VALIDATIONS:
case SVGA_QUERY_NUM_RESOURCES_MAPPED:
+ case SVGA_QUERY_NUM_BYTES_UPLOADED:
case SVGA_QUERY_MAP_BUFFER_TIME:
vresult->u64 = sq->end_count - sq->begin_count;
break;
diff --git a/src/gallium/drivers/svga/svga_resource_buffer.c b/src/gallium/drivers/svga/svga_resource_buffer.c
index 71f2f4f2779..449cc149a81 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer.c
@@ -80,6 +80,11 @@ svga_buffer_transfer_map(struct pipe_context *pipe,
uint8_t *map;
int64_t begin = os_time_get();
+ assert(box->y == 0);
+ assert(box->z == 0);
+ assert(box->height == 1);
+ assert(box->depth == 1);
+
transfer = CALLOC_STRUCT(pipe_transfer);
if (transfer == NULL) {
return NULL;
diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.c b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
index 69e5f75e208..8c5cff5abc1 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer_upload.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
@@ -429,6 +429,8 @@ svga_buffer_upload_flush(struct svga_context *svga,
assert(box->x <= sbuf->b.b.width0);
assert(box->x + box->w <= sbuf->b.b.width0);
+
+ svga->hud.num_bytes_uploaded += box->w;
}
}
else {
@@ -454,6 +456,8 @@ svga_buffer_upload_flush(struct svga_context *svga,
assert(box->x <= sbuf->b.b.width0);
assert(box->x + box->w <= sbuf->b.b.width0);
+
+ svga->hud.num_bytes_uploaded += box->w;
}
}
diff --git a/src/gallium/drivers/svga/svga_resource_texture.c b/src/gallium/drivers/svga/svga_resource_texture.c
index a02d1e495ff..81594777258 100644
--- a/src/gallium/drivers/svga/svga_resource_texture.c
+++ b/src/gallium/drivers/svga/svga_resource_texture.c
@@ -380,6 +380,12 @@ svga_texture_transfer_map(struct pipe_context *pipe,
break;
}
+ if (usage & PIPE_TRANSFER_WRITE) {
+ /* record texture upload for HUD */
+ svga->hud.num_bytes_uploaded +=
+ nblocksx * nblocksy * d * util_format_get_blocksize(texture->format);
+ }
+
if (!use_direct_map) {
/* Use a DMA buffer */
st->hw_nblocksy = nblocksy;
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index a80bc9b9119..09a3d33552b 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -789,6 +789,8 @@ svga_get_driver_query_info(struct pipe_screen *screen,
{"map-buffer-time", SVGA_QUERY_MAP_BUFFER_TIME, {0},
PIPE_DRIVER_QUERY_TYPE_MICROSECONDS},
{"num-resources-mapped", SVGA_QUERY_NUM_RESOURCES_MAPPED, {0}},
+ {"num-bytes-uploaded", SVGA_QUERY_NUM_BYTES_UPLOADED, {0},
+ PIPE_DRIVER_QUERY_TYPE_BYTES, PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE},
/* running total counters */
{"memory-used", SVGA_QUERY_MEMORY_USED, {0},
diff --git a/src/gallium/drivers/svga/svga_state_sampler.c b/src/gallium/drivers/svga/svga_state_sampler.c
index 611d2c6102f..c5d52bbfd14 100644
--- a/src/gallium/drivers/svga/svga_state_sampler.c
+++ b/src/gallium/drivers/svga/svga_state_sampler.c
@@ -108,6 +108,9 @@ svga_validate_pipe_sampler_view(struct svga_context *svga,
PIPE_BIND_SAMPLER_VIEW);
assert(format != SVGA3D_FORMAT_INVALID);
+ /* Convert the format to a sampler-friendly format, if needed */
+ format = svga_sampler_format(format);
+
if (texture->target == PIPE_BUFFER) {
viewDesc.buffer.firstElement = sv->base.u.buf.first_element;
viewDesc.buffer.numElements = (sv->base.u.buf.last_element -
diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c
index 8b02680c77e..62a51e9a94d 100644
--- a/src/gallium/drivers/trace/tr_screen.c
+++ b/src/gallium/drivers/trace/tr_screen.c
@@ -456,9 +456,6 @@ trace_screen_create(struct pipe_screen *screen)
{
struct trace_screen *tr_scr;
- if(!screen)
- goto error1;
-
if (!trace_enabled())
goto error1;
diff --git a/src/gallium/drivers/vc4/Automake.inc b/src/gallium/drivers/vc4/Automake.inc
index 6fa3e190cac..5664c2ab14e 100644
--- a/src/gallium/drivers/vc4/Automake.inc
+++ b/src/gallium/drivers/vc4/Automake.inc
@@ -6,8 +6,4 @@ TARGET_LIB_DEPS += \
$(top_builddir)/src/gallium/winsys/vc4/drm/libvc4drm.la \
$(top_builddir)/src/gallium/drivers/vc4/libvc4.la
-if USE_VC4_SIMULATOR
-TARGET_CPPFLAGS += -DUSE_VC4_SIMULATOR
-endif
-
endif
diff --git a/src/gallium/drivers/vc4/Makefile.am b/src/gallium/drivers/vc4/Makefile.am
index f4a57ba3404..a3bf72fc72a 100644
--- a/src/gallium/drivers/vc4/Makefile.am
+++ b/src/gallium/drivers/vc4/Makefile.am
@@ -23,7 +23,6 @@ include Makefile.sources
include $(top_srcdir)/src/gallium/Automake.inc
if USE_VC4_SIMULATOR
-SIM_CFLAGS = -DUSE_VC4_SIMULATOR=1
SIM_LDFLAGS = -lsimpenrose
endif
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
index 373c9e12d11..0672a92226f 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -509,8 +509,8 @@ vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b,
nir_ssa_def *dst_vec4 = nir_unpack_unorm_4x8(b, packed_dst_color);
nir_ssa_def *src_color[4], *unpacked_dst_color[4];
for (unsigned i = 0; i < 4; i++) {
- src_color[i] = nir_swizzle(b, intr->src[0].ssa, &i, 1, false);
- unpacked_dst_color[i] = nir_swizzle(b, dst_vec4, &i, 1, false);
+ src_color[i] = nir_channel(b, intr->src[0].ssa, i);
+ unpacked_dst_color[i] = nir_channel(b, dst_vec4, i);
}
vc4_nir_emit_alpha_test_discard(c, b, src_color[3]);
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index 7ea263afb68..1afe52a63f4 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -84,7 +84,7 @@ vc4_nir_unpack_16u(nir_builder *b, nir_ssa_def *src, unsigned chan)
static nir_ssa_def *
vc4_nir_unpack_8f(nir_builder *b, nir_ssa_def *src, unsigned chan)
{
- return nir_swizzle(b, nir_unpack_unorm_4x8(b, src), &chan, 1, false);
+ return nir_channel(b, nir_unpack_unorm_4x8(b, src), chan);
}
static nir_ssa_def *
@@ -326,9 +326,8 @@ vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b,
intr_comp->const_index[0] = intr->const_index[0] * 4 + i;
assert(intr->src[0].is_ssa);
- intr_comp->src[0] = nir_src_for_ssa(nir_swizzle(b,
- intr->src[0].ssa,
- &i, 1, false));
+ intr_comp->src[0] =
+ nir_src_for_ssa(nir_channel(b, intr->src[0].ssa, i));
nir_builder_instr_insert(b, &intr_comp->instr);
}
diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
index f1bab810eff..07a92266dd2 100644
--- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c
+++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
@@ -144,6 +144,8 @@ qir_opt_algebraic(struct vc4_compile *c)
case QOP_SEL_X_Y_ZC:
case QOP_SEL_X_Y_NS:
case QOP_SEL_X_Y_NC:
+ case QOP_SEL_X_Y_CS:
+ case QOP_SEL_X_Y_CC:
if (is_zero(c, inst->src[1])) {
/* Replace references to a 0 uniform value
* with the SEL_X_0 equivalent.
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index a48dad804e2..197577b6c20 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -987,6 +987,10 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
qir_SF(c, qir_SUB(c, src[0], src[1]));
*dest = qir_SEL_X_0_NC(c, qir_uniform_ui(c, ~0));
break;
+ case nir_op_uge:
+ qir_SF(c, qir_SUB(c, src[0], src[1]));
+ *dest = qir_SEL_X_0_CC(c, qir_uniform_ui(c, ~0));
+ break;
case nir_op_ilt:
qir_SF(c, qir_SUB(c, src[0], src[1]));
*dest = qir_SEL_X_0_NS(c, qir_uniform_ui(c, ~0));
@@ -1167,7 +1171,7 @@ emit_point_size_write(struct vc4_compile *c)
struct qreg point_size;
if (c->output_point_size_index != -1)
- point_size = c->outputs[c->output_point_size_index + 3];
+ point_size = c->outputs[c->output_point_size_index];
else
point_size = qir_uniform_f(c, 1.0);
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index 7894b081b19..f2855e159fc 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -69,10 +69,14 @@ static const struct qir_op_info qir_op_info[] = {
[QOP_SEL_X_0_NC] = { "fsel_x_0_nc", 1, 1, false, true },
[QOP_SEL_X_0_ZS] = { "fsel_x_0_zs", 1, 1, false, true },
[QOP_SEL_X_0_ZC] = { "fsel_x_0_zc", 1, 1, false, true },
+ [QOP_SEL_X_0_CS] = { "fsel_x_0_cs", 1, 1, false, true },
+ [QOP_SEL_X_0_CC] = { "fsel_x_0_cc", 1, 1, false, true },
[QOP_SEL_X_Y_NS] = { "fsel_x_y_ns", 1, 2, false, true },
[QOP_SEL_X_Y_NC] = { "fsel_x_y_nc", 1, 2, false, true },
[QOP_SEL_X_Y_ZS] = { "fsel_x_y_zs", 1, 2, false, true },
[QOP_SEL_X_Y_ZC] = { "fsel_x_y_zc", 1, 2, false, true },
+ [QOP_SEL_X_Y_CS] = { "fsel_x_y_cs", 1, 2, false, true },
+ [QOP_SEL_X_Y_CC] = { "fsel_x_y_cc", 1, 2, false, true },
[QOP_RCP] = { "rcp", 1, 1, false, true },
[QOP_RSQ] = { "rsq", 1, 1, false, true },
@@ -218,10 +222,14 @@ qir_depends_on_flags(struct qinst *inst)
case QOP_SEL_X_0_NC:
case QOP_SEL_X_0_ZS:
case QOP_SEL_X_0_ZC:
+ case QOP_SEL_X_0_CS:
+ case QOP_SEL_X_0_CC:
case QOP_SEL_X_Y_NS:
case QOP_SEL_X_Y_NC:
case QOP_SEL_X_Y_ZS:
case QOP_SEL_X_Y_ZC:
+ case QOP_SEL_X_Y_CS:
+ case QOP_SEL_X_Y_CC:
return true;
default:
return false;
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index a92ad93ee07..ddb35e41fcf 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -99,11 +99,15 @@ enum qop {
QOP_SEL_X_0_ZC,
QOP_SEL_X_0_NS,
QOP_SEL_X_0_NC,
+ QOP_SEL_X_0_CS,
+ QOP_SEL_X_0_CC,
/* Selects the src[0] if the ns flag bit is set, otherwise src[1]. */
QOP_SEL_X_Y_ZS,
QOP_SEL_X_Y_ZC,
QOP_SEL_X_Y_NS,
QOP_SEL_X_Y_NC,
+ QOP_SEL_X_Y_CS,
+ QOP_SEL_X_Y_CC,
QOP_FTOI,
QOP_ITOF,
@@ -567,10 +571,14 @@ QIR_ALU1(SEL_X_0_ZS)
QIR_ALU1(SEL_X_0_ZC)
QIR_ALU1(SEL_X_0_NS)
QIR_ALU1(SEL_X_0_NC)
+QIR_ALU1(SEL_X_0_CS)
+QIR_ALU1(SEL_X_0_CC)
QIR_ALU2(SEL_X_Y_ZS)
QIR_ALU2(SEL_X_Y_ZC)
QIR_ALU2(SEL_X_Y_NS)
QIR_ALU2(SEL_X_Y_NC)
+QIR_ALU2(SEL_X_Y_CS)
+QIR_ALU2(SEL_X_Y_CC)
QIR_ALU2(FMIN)
QIR_ALU2(FMAX)
QIR_ALU2(FMINABS)
diff --git a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
index f087c3b81b5..a57e100593c 100644
--- a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
@@ -22,14 +22,10 @@
*/
/**
- * @file vc4_opt_algebraic.c
+ * @file vc4_qir_lower_uniforms.c
*
- * This is the optimization pass for miscellaneous changes to instructions
- * where we can simplify the operation by some knowledge about the specific
- * operations.
- *
- * Mostly this will be a matter of turning things into MOVs so that they can
- * later be copy-propagated out.
+ * This is the pre-code-generation pass for fixing up instructions that try to
+ * read from multiple uniform values.
*/
#include "vc4_qir.h"
@@ -85,6 +81,33 @@ is_lowerable_uniform(struct qinst *inst, int i)
return true;
}
+/* Returns the number of different uniform values referenced by the
+ * instruction.
+ */
+static uint32_t
+qir_get_instruction_uniform_count(struct qinst *inst)
+{
+ uint32_t count = 0;
+
+ for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ if (inst->src[i].file != QFILE_UNIF)
+ continue;
+
+ bool is_duplicate = false;
+ for (int j = 0; j < i; j++) {
+ if (inst->src[j].file == QFILE_UNIF &&
+ inst->src[j].index == inst->src[i].index) {
+ is_duplicate = true;
+ break;
+ }
+ }
+ if (!is_duplicate)
+ count++;
+ }
+
+ return count;
+}
+
void
qir_lower_uniforms(struct vc4_compile *c)
{
@@ -98,13 +121,7 @@ qir_lower_uniforms(struct vc4_compile *c)
list_for_each_entry(struct qinst, inst, &c->instructions, link) {
uint32_t nsrc = qir_get_op_nsrc(inst->op);
- uint32_t count = 0;
- for (int i = 0; i < nsrc; i++) {
- if (inst->src[i].file == QFILE_UNIF)
- count++;
- }
-
- if (count <= 1)
+ if (qir_get_instruction_uniform_count(inst) <= 1)
continue;
for (int i = 0; i < nsrc; i++) {
@@ -140,23 +157,22 @@ qir_lower_uniforms(struct vc4_compile *c)
list_for_each_entry(struct qinst, inst, &c->instructions, link) {
uint32_t nsrc = qir_get_op_nsrc(inst->op);
- uint32_t count = 0;
- for (int i = 0; i < nsrc; i++) {
- if (inst->src[i].file == QFILE_UNIF)
- count++;
- }
+ uint32_t count = qir_get_instruction_uniform_count(inst);
if (count <= 1)
continue;
+ bool removed = false;
for (int i = 0; i < nsrc; i++) {
if (is_lowerable_uniform(inst, i) &&
inst->src[i].index == max_index) {
inst->src[i] = temp;
remove_uniform(ht, unif);
- count--;
+ removed = true;
}
}
+ if (removed)
+ count--;
/* If the instruction doesn't need lowering any more,
* then drop it from the list.
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index 133e1385178..e0d3633da42 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -311,6 +311,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
case QOP_SEL_X_0_ZC:
case QOP_SEL_X_0_NS:
case QOP_SEL_X_0_NC:
+ case QOP_SEL_X_0_CS:
+ case QOP_SEL_X_0_CC:
queue(c, qpu_a_MOV(dst, src[0]) | unpack);
set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
QPU_COND_ZS);
@@ -324,6 +326,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
case QOP_SEL_X_Y_ZC:
case QOP_SEL_X_Y_NS:
case QOP_SEL_X_Y_NC:
+ case QOP_SEL_X_Y_CS:
+ case QOP_SEL_X_Y_CC:
queue(c, qpu_a_MOV(dst, src[0]));
if (qinst->src[0].pack)
*(last_inst(c)) |= unpack;
diff --git a/src/gallium/drivers/vc4/vc4_reorder_uniforms.c b/src/gallium/drivers/vc4/vc4_reorder_uniforms.c
index 7f11fba2340..85a0c95e851 100644
--- a/src/gallium/drivers/vc4/vc4_reorder_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_reorder_uniforms.c
@@ -44,18 +44,28 @@ qir_reorder_uniforms(struct vc4_compile *c)
uint32_t next_uniform = 0;
list_for_each_entry(struct qinst, inst, &c->instructions, link) {
+ uint32_t new = ~0;
+
for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
if (inst->src[i].file != QFILE_UNIF)
continue;
- uint32_t new = next_uniform++;
- if (uniform_index_size <= new) {
- uniform_index_size =
- MAX2(uniform_index_size * 2, 16);
- uniform_index =
- realloc(uniform_index,
- uniform_index_size *
- sizeof(uint32_t));
+ if (new == ~0) {
+ new = next_uniform++;
+ if (uniform_index_size <= new) {
+ uniform_index_size =
+ MAX2(uniform_index_size * 2, 16);
+ uniform_index =
+ realloc(uniform_index,
+ uniform_index_size *
+ sizeof(uint32_t));
+ }
+ } else {
+ /* If we've got two uniform references in this
+ * instruction, they need to be the same
+ * uniform value.
+ */
+ assert(inst->src[i].index == uniform_index[new]);
}
uniform_index[new] = inst->src[i].index;
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index 27f358f8fb9..be7447de67d 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -116,6 +116,25 @@ struct pipe_context {
unsigned query_type,
unsigned index );
+ /**
+ * Create a query object that queries all given query types simultaneously.
+ *
+ * This can only be used for those query types for which
+ * get_driver_query_info indicates that it must be used. Only one batch
+ * query object may be active at a time.
+ *
+ * There may be additional constraints on which query types can be used
+ * together, in particular those that are implied by
+ * get_driver_query_group_info.
+ *
+ * \param num_queries the number of query types
+ * \param query_types array of \p num_queries query types
+ * \return a query object, or NULL on error.
+ */
+ struct pipe_query *(*create_batch_query)( struct pipe_context *pipe,
+ unsigned num_queries,
+ unsigned *query_types );
+
void (*destroy_query)(struct pipe_context *pipe,
struct pipe_query *q);
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 7240154727e..b3c8b9f7360 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -776,6 +776,16 @@ struct pipe_query_data_pipeline_statistics
};
/**
+ * For batch queries.
+ */
+union pipe_numeric_type_union
+{
+ uint64_t u64;
+ uint32_t u32;
+ float f;
+};
+
+/**
* Query result (returned by pipe_context::get_query_result).
*/
union pipe_query_result
@@ -791,6 +801,8 @@ union pipe_query_result
/* PIPE_QUERY_PRIMITIVES_GENERATED */
/* PIPE_QUERY_PRIMITIVES_EMITTED */
/* PIPE_DRIVER_QUERY_TYPE_UINT64 */
+ /* PIPE_DRIVER_QUERY_TYPE_BYTES */
+ /* PIPE_DRIVER_QUERY_TYPE_MICROSECONDS */
/* PIPE_DRIVER_QUERY_TYPE_HZ */
uint64_t u64;
@@ -809,6 +821,9 @@ union pipe_query_result
/* PIPE_QUERY_PIPELINE_STATISTICS */
struct pipe_query_data_pipeline_statistics pipeline_statistics;
+
+ /* batch queries */
+ union pipe_numeric_type_union batch[0];
};
union pipe_color_union
@@ -829,12 +844,6 @@ enum pipe_driver_query_type
PIPE_DRIVER_QUERY_TYPE_HZ = 6,
};
-enum pipe_driver_query_group_type
-{
- PIPE_DRIVER_QUERY_GROUP_TYPE_CPU = 0,
- PIPE_DRIVER_QUERY_GROUP_TYPE_GPU = 1,
-};
-
/* Whether an average value per frame or a cumulative value should be
* displayed.
*/
@@ -844,12 +853,13 @@ enum pipe_driver_query_result_type
PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE = 1,
};
-union pipe_numeric_type_union
-{
- uint64_t u64;
- uint32_t u32;
- float f;
-};
+/**
+ * Some hardware requires some hardware-specific queries to be submitted
+ * as batched queries. The corresponding query objects are created using
+ * create_batch_query, and at most one such query may be active at
+ * any time.
+ */
+#define PIPE_DRIVER_QUERY_FLAG_BATCH (1 << 0)
struct pipe_driver_query_info
{
@@ -859,12 +869,12 @@ struct pipe_driver_query_info
enum pipe_driver_query_type type;
enum pipe_driver_query_result_type result_type;
unsigned group_id;
+ unsigned flags;
};
struct pipe_driver_query_group_info
{
const char *name;
- enum pipe_driver_query_group_type type;
unsigned max_active_queries;
unsigned num_queries;
};
diff --git a/src/gallium/include/pipe/p_format.h b/src/gallium/include/pipe/p_format.h
index 5f0690e5ae6..d9c9f9b5cc2 100644
--- a/src/gallium/include/pipe/p_format.h
+++ b/src/gallium/include/pipe/p_format.h
@@ -359,6 +359,36 @@ enum pipe_format {
PIPE_FORMAT_ETC2_RG11_UNORM = 277,
PIPE_FORMAT_ETC2_RG11_SNORM = 278,
+ PIPE_FORMAT_ASTC_4x4 = 279,
+ PIPE_FORMAT_ASTC_5x4 = 280,
+ PIPE_FORMAT_ASTC_5x5 = 281,
+ PIPE_FORMAT_ASTC_6x5 = 282,
+ PIPE_FORMAT_ASTC_6x6 = 283,
+ PIPE_FORMAT_ASTC_8x5 = 284,
+ PIPE_FORMAT_ASTC_8x6 = 285,
+ PIPE_FORMAT_ASTC_8x8 = 286,
+ PIPE_FORMAT_ASTC_10x5 = 287,
+ PIPE_FORMAT_ASTC_10x6 = 288,
+ PIPE_FORMAT_ASTC_10x8 = 289,
+ PIPE_FORMAT_ASTC_10x10 = 290,
+ PIPE_FORMAT_ASTC_12x10 = 291,
+ PIPE_FORMAT_ASTC_12x12 = 292,
+
+ PIPE_FORMAT_ASTC_4x4_SRGB = 293,
+ PIPE_FORMAT_ASTC_5x4_SRGB = 294,
+ PIPE_FORMAT_ASTC_5x5_SRGB = 295,
+ PIPE_FORMAT_ASTC_6x5_SRGB = 296,
+ PIPE_FORMAT_ASTC_6x6_SRGB = 297,
+ PIPE_FORMAT_ASTC_8x5_SRGB = 298,
+ PIPE_FORMAT_ASTC_8x6_SRGB = 299,
+ PIPE_FORMAT_ASTC_8x8_SRGB = 300,
+ PIPE_FORMAT_ASTC_10x5_SRGB = 301,
+ PIPE_FORMAT_ASTC_10x6_SRGB = 302,
+ PIPE_FORMAT_ASTC_10x8_SRGB = 303,
+ PIPE_FORMAT_ASTC_10x10_SRGB = 304,
+ PIPE_FORMAT_ASTC_12x10_SRGB = 305,
+ PIPE_FORMAT_ASTC_12x12_SRGB = 306,
+
PIPE_FORMAT_COUNT
};
diff --git a/src/gallium/include/state_tracker/drm_driver.h b/src/gallium/include/state_tracker/drm_driver.h
index 740c4bbe1a6..959a7625e30 100644
--- a/src/gallium/include/state_tracker/drm_driver.h
+++ b/src/gallium/include/state_tracker/drm_driver.h
@@ -117,10 +117,4 @@ struct drm_driver_descriptor driver_descriptor = { \
.configuration = (conf), \
};
-extern struct pipe_screen *dd_create_screen(int fd);
-
-extern const char *dd_driver_name(void);
-
-extern const struct drm_conf_ret *dd_configuration(enum drm_conf conf);
-
#endif
diff --git a/src/gallium/include/state_tracker/sw_driver.h b/src/gallium/include/state_tracker/sw_driver.h
new file mode 100644
index 00000000000..0eb2b44d6fd
--- /dev/null
+++ b/src/gallium/include/state_tracker/sw_driver.h
@@ -0,0 +1,21 @@
+
+#ifndef _SW_DRIVER_H_
+#define _SW_DRIVER_H_
+
+#include "pipe/p_compiler.h"
+
+struct pipe_screen;
+struct sw_winsys;
+
+struct sw_driver_descriptor
+{
+ struct pipe_screen *(*create_screen)(struct sw_winsys *ws);
+ struct {
+ const char * const name;
+ struct sw_winsys *(*create_winsys)();
+ } winsys[];
+};
+
+extern struct sw_driver_descriptor swrast_driver_descriptor;
+
+#endif
diff --git a/src/gallium/state_trackers/clover/Makefile.am b/src/gallium/state_trackers/clover/Makefile.am
index fd0ccf88cc5..3c9421692fc 100644
--- a/src/gallium/state_trackers/clover/Makefile.am
+++ b/src/gallium/state_trackers/clover/Makefile.am
@@ -1,8 +1,6 @@
include Makefile.sources
AM_CPPFLAGS = \
- $(GALLIUM_PIPE_LOADER_DEFINES) \
- -DPIPE_SEARCH_DIR=\"$(libdir)/gallium-pipe\" \
-I$(top_srcdir)/include \
-I$(top_srcdir)/src \
-I$(top_srcdir)/src/gallium/include \
diff --git a/src/gallium/state_trackers/clover/core/device.cpp b/src/gallium/state_trackers/clover/core/device.cpp
index 6efff79c7f4..1be2f6413f4 100644
--- a/src/gallium/state_trackers/clover/core/device.cpp
+++ b/src/gallium/state_trackers/clover/core/device.cpp
@@ -41,7 +41,7 @@ namespace {
device::device(clover::platform &platform, pipe_loader_device *ldev) :
platform(platform), ldev(ldev) {
- pipe = pipe_loader_create_screen(ldev, PIPE_SEARCH_DIR);
+ pipe = pipe_loader_create_screen(ldev);
if (!pipe || !pipe->get_param(pipe, PIPE_CAP_COMPUTE)) {
if (pipe)
pipe->destroy(pipe);
diff --git a/src/gallium/state_trackers/dri/Android.mk b/src/gallium/state_trackers/dri/Android.mk
index 43f0de9b464..f0eb18dcacf 100644
--- a/src/gallium/state_trackers/dri/Android.mk
+++ b/src/gallium/state_trackers/dri/Android.mk
@@ -29,9 +29,6 @@ include $(CLEAR_VARS)
LOCAL_SRC_FILES := $(common_SOURCES)
-LOCAL_CFLAGS := \
- -DGALLIUM_STATIC_TARGETS=1 \
-
LOCAL_C_INCLUDES := \
$(MESA_TOP)/src/mapi \
$(MESA_TOP)/src/mesa \
diff --git a/src/gallium/state_trackers/dri/Makefile.am b/src/gallium/state_trackers/dri/Makefile.am
index 9f4deba0c1e..74bccaa6416 100644
--- a/src/gallium/state_trackers/dri/Makefile.am
+++ b/src/gallium/state_trackers/dri/Makefile.am
@@ -25,8 +25,6 @@ include Makefile.sources
include $(top_srcdir)/src/gallium/Automake.inc
AM_CPPFLAGS = \
- $(GALLIUM_PIPE_LOADER_DEFINES) \
- -DPIPE_SEARCH_DIR=\"$(libdir)/gallium-pipe\" \
-I$(top_srcdir)/include \
-I$(top_srcdir)/src/mapi \
-I$(top_srcdir)/src/mesa \
@@ -36,15 +34,10 @@ AM_CPPFLAGS = \
$(LIBDRM_CFLAGS) \
$(VISIBILITY_CFLAGS)
-if HAVE_GALLIUM_STATIC_TARGETS
-AM_CPPFLAGS += \
- -DGALLIUM_STATIC_TARGETS=1
-
if HAVE_GALLIUM_SOFTPIPE
AM_CPPFLAGS += \
-DGALLIUM_SOFTPIPE
endif # HAVE_GALLIUM_SOFTPIPE
-endif # HAVE_GALLIUM_STATIC_TARGETS
noinst_LTLIBRARIES = libdri.la
libdri_la_SOURCES = $(common_SOURCES)
diff --git a/src/gallium/state_trackers/dri/SConscript b/src/gallium/state_trackers/dri/SConscript
index 657300baf13..fa48fb8a0d7 100644
--- a/src/gallium/state_trackers/dri/SConscript
+++ b/src/gallium/state_trackers/dri/SConscript
@@ -15,10 +15,6 @@ env.Append(CPPPATH = [
xmlpool_options.dir.dir, # Dir to generated xmlpool/options.h
])
-env.Append(CPPDEFINES = [
- ('GALLIUM_STATIC_TARGETS', '1'),
-])
-
sources = env.ParseSourceList('Makefile.sources', 'common_SOURCES')
# XXX: if HAVE_DRISW
diff --git a/src/gallium/state_trackers/dri/dri2.c b/src/gallium/state_trackers/dri/dri2.c
index 019414b56fe..beb0866c83f 100644
--- a/src/gallium/state_trackers/dri/dri2.c
+++ b/src/gallium/state_trackers/dri/dri2.c
@@ -1444,8 +1444,8 @@ dri2_init_screen(__DRIscreen * sPriv)
const __DRIconfig **configs;
struct dri_screen *screen;
struct pipe_screen *pscreen = NULL;
- const struct drm_conf_ret *throttle_ret = NULL;
- const struct drm_conf_ret *dmabuf_ret = NULL;
+ const struct drm_conf_ret *throttle_ret;
+ const struct drm_conf_ret *dmabuf_ret;
screen = CALLOC_STRUCT(dri_screen);
if (!screen)
@@ -1457,19 +1457,14 @@ dri2_init_screen(__DRIscreen * sPriv)
sPriv->driverPrivate = (void *)screen;
-#if GALLIUM_STATIC_TARGETS
- pscreen = dd_create_screen(screen->fd);
+ if (pipe_loader_drm_probe_fd(&screen->dev, dup(screen->fd)))
+ pscreen = pipe_loader_create_screen(screen->dev);
- throttle_ret = dd_configuration(DRM_CONF_THROTTLE);
- dmabuf_ret = dd_configuration(DRM_CONF_SHARE_FD);
-#else
- if (pipe_loader_drm_probe_fd(&screen->dev, screen->fd)) {
- pscreen = pipe_loader_create_screen(screen->dev, PIPE_SEARCH_DIR);
+ if (!pscreen)
+ goto fail;
- throttle_ret = pipe_loader_configuration(screen->dev, DRM_CONF_THROTTLE);
- dmabuf_ret = pipe_loader_configuration(screen->dev, DRM_CONF_SHARE_FD);
- }
-#endif // GALLIUM_STATIC_TARGETS
+ throttle_ret = pipe_loader_configuration(screen->dev, DRM_CONF_THROTTLE);
+ dmabuf_ret = pipe_loader_configuration(screen->dev, DRM_CONF_SHARE_FD);
if (throttle_ret && throttle_ret->val.val_int != -1) {
screen->throttling_enabled = TRUE;
@@ -1486,20 +1481,14 @@ dri2_init_screen(__DRIscreen * sPriv)
}
}
- if (pscreen && pscreen->get_param(pscreen, PIPE_CAP_DEVICE_RESET_STATUS_QUERY)) {
+ if (pscreen->get_param(pscreen, PIPE_CAP_DEVICE_RESET_STATUS_QUERY)) {
sPriv->extensions = dri_robust_screen_extensions;
screen->has_reset_status_query = true;
}
else
sPriv->extensions = dri_screen_extensions;
- /* dri_init_screen_helper checks pscreen for us */
-
-#if GALLIUM_STATIC_TARGETS
- configs = dri_init_screen_helper(screen, pscreen, dd_driver_name());
-#else
configs = dri_init_screen_helper(screen, pscreen, screen->dev->driver_name);
-#endif // GALLIUM_STATIC_TARGETS
if (!configs)
goto fail;
@@ -1511,10 +1500,8 @@ dri2_init_screen(__DRIscreen * sPriv)
return configs;
fail:
dri_destroy_screen_helper(screen);
-#if !GALLIUM_STATIC_TARGETS
if (screen->dev)
pipe_loader_release(&screen->dev, 1);
-#endif // !GALLIUM_STATIC_TARGETS
FREE(screen);
return NULL;
}
@@ -1527,7 +1514,6 @@ fail:
static const __DRIconfig **
dri_kms_init_screen(__DRIscreen * sPriv)
{
-#if GALLIUM_STATIC_TARGETS
#if defined(GALLIUM_SOFTPIPE)
const __DRIconfig **configs;
struct dri_screen *screen;
@@ -1543,7 +1529,11 @@ dri_kms_init_screen(__DRIscreen * sPriv)
sPriv->driverPrivate = (void *)screen;
- pscreen = kms_swrast_create_screen(screen->fd);
+ if (pipe_loader_sw_probe_kms(&screen->dev, dup(screen->fd)))
+ pscreen = pipe_loader_create_screen(screen->dev);
+
+ if (!pscreen)
+ goto fail;
if (drmGetCap(sPriv->fd, DRM_CAP_PRIME, &cap) == 0 &&
(cap & DRM_PRIME_CAP_IMPORT)) {
@@ -1553,7 +1543,6 @@ dri_kms_init_screen(__DRIscreen * sPriv)
sPriv->extensions = dri_screen_extensions;
- /* dri_init_screen_helper checks pscreen for us */
configs = dri_init_screen_helper(screen, pscreen, "swrast");
if (!configs)
goto fail;
@@ -1566,9 +1555,10 @@ dri_kms_init_screen(__DRIscreen * sPriv)
return configs;
fail:
dri_destroy_screen_helper(screen);
+ if (screen->dev)
+ pipe_loader_release(&screen->dev, 1);
FREE(screen);
#endif // GALLIUM_SOFTPIPE
-#endif // GALLIUM_STATIC_TARGETS
return NULL;
}
diff --git a/src/gallium/state_trackers/dri/dri_screen.c b/src/gallium/state_trackers/dri/dri_screen.c
index c4c2d9c8fb1..2ac55c88926 100644
--- a/src/gallium/state_trackers/dri/dri_screen.c
+++ b/src/gallium/state_trackers/dri/dri_screen.c
@@ -390,9 +390,7 @@ dri_destroy_screen(__DRIscreen * sPriv)
dri_destroy_screen_helper(screen);
-#if !GALLIUM_STATIC_TARGETS
pipe_loader_release(&screen->dev, 1);
-#endif // !GALLIUM_STATIC_TARGETS
free(screen);
sPriv->driverPrivate = NULL;
@@ -416,11 +414,6 @@ dri_init_screen_helper(struct dri_screen *screen,
const char* driver_name)
{
screen->base.screen = pscreen;
- if (!screen->base.screen) {
- debug_printf("%s: failed to create pipe_screen\n", __FUNCTION__);
- return NULL;
- }
-
screen->base.get_egl_image = dri_get_egl_image;
screen->base.get_param = dri_get_param;
diff --git a/src/gallium/state_trackers/dri/drisw.c b/src/gallium/state_trackers/dri/drisw.c
index 753c59d696a..b85a73c57d2 100644
--- a/src/gallium/state_trackers/dri/drisw.c
+++ b/src/gallium/state_trackers/dri/drisw.c
@@ -39,6 +39,7 @@
#include "util/u_inlines.h"
#include "util/u_box.h"
#include "pipe/p_context.h"
+#include "pipe-loader/pipe_loader.h"
#include "state_tracker/drisw_api.h"
#include "state_tracker/st_context.h"
@@ -382,7 +383,7 @@ drisw_init_screen(__DRIscreen * sPriv)
{
const __DRIconfig **configs;
struct dri_screen *screen;
- struct pipe_screen *pscreen;
+ struct pipe_screen *pscreen = NULL;
screen = CALLOC_STRUCT(dri_screen);
if (!screen)
@@ -396,8 +397,11 @@ drisw_init_screen(__DRIscreen * sPriv)
sPriv->driverPrivate = (void *)screen;
sPriv->extensions = drisw_screen_extensions;
- pscreen = drisw_create_screen(&drisw_lf);
- /* dri_init_screen_helper checks pscreen for us */
+ if (pipe_loader_sw_probe_dri(&screen->dev, &drisw_lf))
+ pscreen = pipe_loader_create_screen(screen->dev);
+
+ if (!pscreen)
+ goto fail;
configs = dri_init_screen_helper(screen, pscreen, "swrast");
if (!configs)
@@ -406,6 +410,8 @@ drisw_init_screen(__DRIscreen * sPriv)
return configs;
fail:
dri_destroy_screen_helper(screen);
+ if (screen->dev)
+ pipe_loader_release(&screen->dev, 1);
FREE(screen);
return NULL;
}
diff --git a/src/gallium/state_trackers/omx/entrypoint.c b/src/gallium/state_trackers/omx/entrypoint.c
index 7df90b16a84..da9ca104d93 100644
--- a/src/gallium/state_trackers/omx/entrypoint.c
+++ b/src/gallium/state_trackers/omx/entrypoint.c
@@ -33,6 +33,7 @@
#include <assert.h>
#include <string.h>
+#include <stdbool.h>
#include <X11/Xlib.h>
@@ -73,29 +74,30 @@ int omx_component_library_Setup(stLoaderComponentType **stComponents)
struct vl_screen *omx_get_screen(void)
{
+ static bool first_time = true;
pipe_mutex_lock(omx_lock);
- if (!omx_display) {
- omx_render_node = debug_get_option("OMX_RENDER_NODE", NULL);
- if (!omx_render_node) {
- omx_display = XOpenDisplay(NULL);
- if (!omx_display)
- goto error;
- }
- }
-
if (!omx_screen) {
+ if (first_time) {
+ omx_render_node = debug_get_option("OMX_RENDER_NODE", NULL);
+ first_time = false;
+ }
if (omx_render_node) {
drm_fd = loader_open_device(omx_render_node);
if (drm_fd < 0)
goto error;
+
omx_screen = vl_drm_screen_create(drm_fd);
if (!omx_screen) {
close(drm_fd);
goto error;
}
} else {
- omx_screen = vl_screen_create(omx_display, 0);
+ omx_display = XOpenDisplay(NULL);
+ if (!omx_display)
+ goto error;
+
+ omx_screen = vl_dri2_screen_create(omx_display, 0);
if (!omx_screen) {
XCloseDisplay(omx_display);
goto error;
@@ -117,16 +119,13 @@ void omx_put_screen(void)
{
pipe_mutex_lock(omx_lock);
if ((--omx_usecount) == 0) {
- if (!omx_render_node) {
- vl_screen_destroy(omx_screen);
- if (omx_display)
- XCloseDisplay(omx_display);
- } else {
- close(drm_fd);
- vl_drm_screen_destroy(omx_screen);
- }
+ omx_screen->destroy(omx_screen);
omx_screen = NULL;
- omx_display = NULL;
+
+ if (omx_render_node)
+ close(drm_fd);
+ else
+ XCloseDisplay(omx_display);
}
pipe_mutex_unlock(omx_lock);
}
diff --git a/src/gallium/state_trackers/va/context.c b/src/gallium/state_trackers/va/context.c
index 98c4104da48..f0051e5f6a5 100644
--- a/src/gallium/state_trackers/va/context.c
+++ b/src/gallium/state_trackers/va/context.c
@@ -102,7 +102,6 @@ PUBLIC VAStatus
VA_DRIVER_INIT_FUNC(VADriverContextP ctx)
{
vlVaDriver *drv;
- int drm_fd;
struct drm_state *drm_info;
if (!ctx)
@@ -119,26 +118,20 @@ VA_DRIVER_INIT_FUNC(VADriverContextP ctx)
return VA_STATUS_ERROR_UNIMPLEMENTED;
case VA_DISPLAY_GLX:
case VA_DISPLAY_X11:
- drv->vscreen = vl_screen_create(ctx->native_dpy, ctx->x11_screen);
+ drv->vscreen = vl_dri2_screen_create(ctx->native_dpy, ctx->x11_screen);
if (!drv->vscreen)
goto error_screen;
break;
case VA_DISPLAY_DRM:
case VA_DISPLAY_DRM_RENDERNODES: {
drm_info = (struct drm_state *) ctx->drm_state;
- if (!drm_info) {
- FREE(drv);
- return VA_STATUS_ERROR_INVALID_PARAMETER;
- }
-
- drm_fd = drm_info->fd;
- if (drm_fd < 0) {
+ if (!drm_info || drm_info->fd < 0) {
FREE(drv);
return VA_STATUS_ERROR_INVALID_PARAMETER;
}
- drv->vscreen = vl_drm_screen_create(drm_fd);
+ drv->vscreen = vl_drm_screen_create(drm_info->fd);
if (!drv->vscreen)
goto error_screen;
}
@@ -182,10 +175,7 @@ error_htab:
drv->pipe->destroy(drv->pipe);
error_pipe:
- if (ctx->display_type == VA_DISPLAY_GLX || ctx->display_type == VA_DISPLAY_X11)
- vl_screen_destroy(drv->vscreen);
- else
- vl_drm_screen_destroy(drv->vscreen);
+ drv->vscreen->destroy(drv->vscreen);
error_screen:
FREE(drv);
@@ -322,10 +312,7 @@ vlVaTerminate(VADriverContextP ctx)
vl_compositor_cleanup_state(&drv->cstate);
vl_compositor_cleanup(&drv->compositor);
drv->pipe->destroy(drv->pipe);
- if (ctx->display_type == VA_DISPLAY_GLX || ctx->display_type == VA_DISPLAY_X11)
- vl_screen_destroy(drv->vscreen);
- else
- vl_drm_screen_destroy(drv->vscreen);
+ drv->vscreen->destroy(drv->vscreen);
handle_table_destroy(drv->htab);
FREE(drv);
diff --git a/src/gallium/state_trackers/va/picture.c b/src/gallium/state_trackers/va/picture.c
index 5e7841a0521..a37a9b791db 100644
--- a/src/gallium/state_trackers/va/picture.c
+++ b/src/gallium/state_trackers/va/picture.c
@@ -763,7 +763,7 @@ handleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *context, v
dst_rect.x1 = pipeline_param->output_region->x + pipeline_param->output_region->width;
dst_rect.y1 = pipeline_param->output_region->y + pipeline_param->output_region->height;
- dirty_area = vl_screen_get_dirty_area(drv->vscreen);
+ dirty_area = drv->vscreen->get_dirty_area(drv->vscreen);
vl_compositor_clear_layers(&drv->cstate);
vl_compositor_set_buffer_layer(&drv->cstate, &drv->compositor, 0, src_surface->buffer, &src_rect, NULL, VL_COMPOSITOR_WEAVE);
diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c
index 589d6860b6a..c052c8f2284 100644
--- a/src/gallium/state_trackers/va/surface.c
+++ b/src/gallium/state_trackers/va/surface.c
@@ -229,6 +229,7 @@ vlVaPutSurface(VADriverContextP ctx, VASurfaceID surface_id, void* draw, short s
struct pipe_screen *screen;
struct pipe_resource *tex;
struct pipe_surface surf_templ, *surf_draw;
+ struct vl_screen *vscreen;
struct u_rect src_rect, *dirty_area;
struct u_rect dst_rect = {destx, destx + destw, desty, desty + desth};
VAStatus status;
@@ -242,17 +243,18 @@ vlVaPutSurface(VADriverContextP ctx, VASurfaceID surface_id, void* draw, short s
return VA_STATUS_ERROR_INVALID_SURFACE;
screen = drv->pipe->screen;
+ vscreen = drv->vscreen;
if(surf->fence) {
screen->fence_finish(screen, surf->fence, PIPE_TIMEOUT_INFINITE);
screen->fence_reference(screen, &surf->fence, NULL);
}
- tex = vl_screen_texture_from_drawable(drv->vscreen, (Drawable)draw);
+ tex = vscreen->texture_from_drawable(vscreen, draw);
if (!tex)
return VA_STATUS_ERROR_INVALID_DISPLAY;
- dirty_area = vl_screen_get_dirty_area(drv->vscreen);
+ dirty_area = vscreen->get_dirty_area(vscreen);
memset(&surf_templ, 0, sizeof(surf_templ));
surf_templ.format = tex->format;
@@ -276,11 +278,8 @@ vlVaPutSurface(VADriverContextP ctx, VASurfaceID surface_id, void* draw, short s
if (status)
return status;
- screen->flush_frontbuffer
- (
- screen, tex, 0, 0,
- vl_screen_get_private(drv->vscreen), NULL
- );
+ screen->flush_frontbuffer(screen, tex, 0, 0,
+ vscreen->get_private(vscreen), NULL);
screen->fence_reference(screen, &surf->fence, NULL);
drv->pipe->flush(drv->pipe, &surf->fence, 0);
diff --git a/src/gallium/state_trackers/vdpau/device.c b/src/gallium/state_trackers/vdpau/device.c
index 31c95054f56..c70cc6e2752 100644
--- a/src/gallium/state_trackers/vdpau/device.c
+++ b/src/gallium/state_trackers/vdpau/device.c
@@ -63,7 +63,7 @@ vdp_imp_device_create_x11(Display *display, int screen, VdpDevice *device,
pipe_reference_init(&dev->reference, 1);
- dev->vscreen = vl_screen_create(display, screen);
+ dev->vscreen = vl_dri2_screen_create(display, screen);
if (!dev->vscreen) {
ret = VDP_STATUS_RESOURCES;
goto no_vscreen;
@@ -136,7 +136,7 @@ no_handle:
no_resource:
dev->context->destroy(dev->context);
no_context:
- vl_screen_destroy(dev->vscreen);
+ dev->vscreen->destroy(dev->vscreen);
no_vscreen:
FREE(dev);
no_dev:
@@ -227,7 +227,7 @@ vlVdpDeviceFree(vlVdpDevice *dev)
vl_compositor_cleanup(&dev->compositor);
pipe_sampler_view_reference(&dev->dummy_sv, NULL);
dev->context->destroy(dev->context);
- vl_screen_destroy(dev->vscreen);
+ dev->vscreen->destroy(dev->vscreen);
FREE(dev);
vlDestroyHTAB();
}
diff --git a/src/gallium/state_trackers/vdpau/presentation.c b/src/gallium/state_trackers/vdpau/presentation.c
index e53303708b2..e7f387e6173 100644
--- a/src/gallium/state_trackers/vdpau/presentation.c
+++ b/src/gallium/state_trackers/vdpau/presentation.c
@@ -186,7 +186,8 @@ vlVdpPresentationQueueGetTime(VdpPresentationQueue presentation_queue,
return VDP_STATUS_INVALID_HANDLE;
pipe_mutex_lock(pq->device->mutex);
- *current_time = vl_screen_get_timestamp(pq->device->vscreen, pq->drawable);
+ *current_time = pq->device->vscreen->get_timestamp(pq->device->vscreen,
+ (void *)pq->drawable);
pipe_mutex_unlock(pq->device->mutex);
return VDP_STATUS_OK;
@@ -214,6 +215,7 @@ vlVdpPresentationQueueDisplay(VdpPresentationQueue presentation_queue,
struct vl_compositor *compositor;
struct vl_compositor_state *cstate;
+ struct vl_screen *vscreen;
pq = vlGetDataHTAB(presentation_queue);
if (!pq)
@@ -226,15 +228,16 @@ vlVdpPresentationQueueDisplay(VdpPresentationQueue presentation_queue,
pipe = pq->device->context;
compositor = &pq->device->compositor;
cstate = &pq->cstate;
+ vscreen = pq->device->vscreen;
pipe_mutex_lock(pq->device->mutex);
- tex = vl_screen_texture_from_drawable(pq->device->vscreen, pq->drawable);
+ tex = vscreen->texture_from_drawable(vscreen, (void *)pq->drawable);
if (!tex) {
pipe_mutex_unlock(pq->device->mutex);
return VDP_STATUS_INVALID_HANDLE;
}
- dirty_area = vl_screen_get_dirty_area(pq->device->vscreen);
+ dirty_area = vscreen->get_dirty_area(vscreen);
memset(&surf_templ, 0, sizeof(surf_templ));
surf_templ.format = tex->format;
@@ -267,12 +270,9 @@ vlVdpPresentationQueueDisplay(VdpPresentationQueue presentation_queue,
vl_compositor_render(cstate, compositor, surf_draw, dirty_area, true);
}
- vl_screen_set_next_timestamp(pq->device->vscreen, earliest_presentation_time);
- pipe->screen->flush_frontbuffer
- (
- pipe->screen, tex, 0, 0,
- vl_screen_get_private(pq->device->vscreen), NULL
- );
+ vscreen->set_next_timestamp(vscreen, earliest_presentation_time);
+ pipe->screen->flush_frontbuffer(pipe->screen, tex, 0, 0,
+ vscreen->get_private(vscreen), NULL);
pipe->screen->fence_reference(pipe->screen, &surf->fence, NULL);
pipe->flush(pipe, &surf->fence, 0);
diff --git a/src/gallium/state_trackers/xa/Makefile.am b/src/gallium/state_trackers/xa/Makefile.am
index 5051e8246e3..968778f995c 100644
--- a/src/gallium/state_trackers/xa/Makefile.am
+++ b/src/gallium/state_trackers/xa/Makefile.am
@@ -28,15 +28,6 @@ AM_CFLAGS = \
$(GALLIUM_CFLAGS) \
$(VISIBILITY_CFLAGS)
-AM_CPPFLAGS = \
- $(GALLIUM_PIPE_LOADER_DEFINES) \
- -DPIPE_SEARCH_DIR=\"$(libdir)/gallium-pipe\"
-
-if HAVE_GALLIUM_STATIC_TARGETS
-AM_CPPFLAGS += \
- -DGALLIUM_STATIC_TARGETS=1
-endif
-
xa_includedir = $(includedir)
xa_include_HEADERS = \
xa_composite.h \
diff --git a/src/gallium/state_trackers/xa/xa_tracker.c b/src/gallium/state_trackers/xa/xa_tracker.c
index 4fdbdc96ae6..faa630c144b 100644
--- a/src/gallium/state_trackers/xa/xa_tracker.c
+++ b/src/gallium/state_trackers/xa/xa_tracker.c
@@ -152,21 +152,13 @@ xa_tracker_create(int drm_fd)
struct xa_tracker *xa = calloc(1, sizeof(struct xa_tracker));
enum xa_surface_type stype;
unsigned int num_formats;
- int loader_fd;
if (!xa)
return NULL;
-#if GALLIUM_STATIC_TARGETS
- xa->screen = dd_create_screen(drm_fd);
- (void) loader_fd; /* silence unused var warning */
-#else
- loader_fd = dup(drm_fd);
- if (loader_fd == -1)
- return NULL;
- if (pipe_loader_drm_probe_fd(&xa->dev, loader_fd))
- xa->screen = pipe_loader_create_screen(xa->dev, PIPE_SEARCH_DIR);
-#endif
+ if (pipe_loader_drm_probe_fd(&xa->dev, dup(drm_fd)))
+ xa->screen = pipe_loader_create_screen(xa->dev);
+
if (!xa->screen)
goto out_no_screen;
@@ -214,10 +206,8 @@ xa_tracker_create(int drm_fd)
out_no_pipe:
xa->screen->destroy(xa->screen);
out_no_screen:
-#if !GALLIUM_STATIC_TARGETS
if (xa->dev)
pipe_loader_release(&xa->dev, 1);
-#endif
free(xa);
return NULL;
}
@@ -228,9 +218,7 @@ xa_tracker_destroy(struct xa_tracker *xa)
free(xa->supported_formats);
xa_context_destroy(xa->default_ctx);
xa->screen->destroy(xa->screen);
-#if !GALLIUM_STATIC_TARGETS
pipe_loader_release(&xa->dev, 1);
-#endif
free(xa);
}
diff --git a/src/gallium/state_trackers/xvmc/context.c b/src/gallium/state_trackers/xvmc/context.c
index 4702b44d1f4..a6991ab8d61 100644
--- a/src/gallium/state_trackers/xvmc/context.c
+++ b/src/gallium/state_trackers/xvmc/context.c
@@ -229,7 +229,7 @@ Status XvMCCreateContext(Display *dpy, XvPortID port, int surface_type_id,
return BadAlloc;
/* TODO: Reuse screen if process creates another context */
- vscreen = vl_screen_create(dpy, scrn);
+ vscreen = vl_dri2_screen_create(dpy, scrn);
if (!vscreen) {
XVMC_MSG(XVMC_ERR, "[XvMC] Could not create VL screen.\n");
@@ -240,7 +240,7 @@ Status XvMCCreateContext(Display *dpy, XvPortID port, int surface_type_id,
pipe = vscreen->pscreen->context_create(vscreen->pscreen, vscreen, 0);
if (!pipe) {
XVMC_MSG(XVMC_ERR, "[XvMC] Could not create VL context.\n");
- vl_screen_destroy(vscreen);
+ vscreen->destroy(vscreen);
FREE(context_priv);
return BadAlloc;
}
@@ -258,7 +258,7 @@ Status XvMCCreateContext(Display *dpy, XvPortID port, int surface_type_id,
if (!context_priv->decoder) {
XVMC_MSG(XVMC_ERR, "[XvMC] Could not create VL decoder.\n");
pipe->destroy(pipe);
- vl_screen_destroy(vscreen);
+ vscreen->destroy(vscreen);
FREE(context_priv);
return BadAlloc;
}
@@ -267,7 +267,7 @@ Status XvMCCreateContext(Display *dpy, XvPortID port, int surface_type_id,
XVMC_MSG(XVMC_ERR, "[XvMC] Could not create VL compositor.\n");
context_priv->decoder->destroy(context_priv->decoder);
pipe->destroy(pipe);
- vl_screen_destroy(vscreen);
+ vscreen->destroy(vscreen);
FREE(context_priv);
return BadAlloc;
}
@@ -277,7 +277,7 @@ Status XvMCCreateContext(Display *dpy, XvPortID port, int surface_type_id,
vl_compositor_cleanup(&context_priv->compositor);
context_priv->decoder->destroy(context_priv->decoder);
pipe->destroy(pipe);
- vl_screen_destroy(vscreen);
+ vscreen->destroy(vscreen);
FREE(context_priv);
return BadAlloc;
}
@@ -332,7 +332,7 @@ Status XvMCDestroyContext(Display *dpy, XvMCContext *context)
vl_compositor_cleanup_state(&context_priv->cstate);
vl_compositor_cleanup(&context_priv->compositor);
context_priv->pipe->destroy(context_priv->pipe);
- vl_screen_destroy(context_priv->vscreen);
+ context_priv->vscreen->destroy(context_priv->vscreen);
FREE(context_priv);
context->privData = NULL;
diff --git a/src/gallium/state_trackers/xvmc/surface.c b/src/gallium/state_trackers/xvmc/surface.c
index 15eae59ff6e..199712ba168 100644
--- a/src/gallium/state_trackers/xvmc/surface.c
+++ b/src/gallium/state_trackers/xvmc/surface.c
@@ -355,6 +355,7 @@ Status XvMCPutSurface(Display *dpy, XvMCSurface *surface, Drawable drawable,
struct pipe_context *pipe;
struct vl_compositor *compositor;
struct vl_compositor_state *cstate;
+ struct vl_screen *vscreen;
XvMCSurfacePrivate *surface_priv;
XvMCContextPrivate *context_priv;
@@ -386,9 +387,10 @@ Status XvMCPutSurface(Display *dpy, XvMCSurface *surface, Drawable drawable,
pipe = context_priv->pipe;
compositor = &context_priv->compositor;
cstate = &context_priv->cstate;
+ vscreen = context_priv->vscreen;
- tex = vl_screen_texture_from_drawable(context_priv->vscreen, drawable);
- dirty_area = vl_screen_get_dirty_area(context_priv->vscreen);
+ tex = vscreen->texture_from_drawable(vscreen, (void *)drawable);
+ dirty_area = vscreen->get_dirty_area(vscreen);
memset(&surf_templ, 0, sizeof(surf_templ));
surf_templ.format = tex->format;
@@ -444,11 +446,8 @@ Status XvMCPutSurface(Display *dpy, XvMCSurface *surface, Drawable drawable,
XVMC_MSG(XVMC_TRACE, "[XvMC] Submitted surface %p for display. Pushing to front buffer.\n", surface);
- pipe->screen->flush_frontbuffer
- (
- pipe->screen, tex, 0, 0,
- vl_screen_get_private(context_priv->vscreen), NULL
- );
+ pipe->screen->flush_frontbuffer(pipe->screen, tex, 0, 0,
+ vscreen->get_private(vscreen), NULL);
if(dump_window == -1) {
dump_window = debug_get_num_option("XVMC_DUMP", 0);
diff --git a/src/gallium/targets/d3dadapter9/Makefile.am b/src/gallium/targets/d3dadapter9/Makefile.am
index b5221472ef0..d1d9829b6c5 100644
--- a/src/gallium/targets/d3dadapter9/Makefile.am
+++ b/src/gallium/targets/d3dadapter9/Makefile.am
@@ -34,19 +34,6 @@ AM_CFLAGS = \
$(GALLIUM_TARGET_CFLAGS) \
$(VISIBILITY_CFLAGS)
-if HAVE_GALLIUM_STATIC_TARGETS
-AM_CPPFLAGS = \
- -DNINE_TARGET \
- -DGALLIUM_STATIC_TARGETS=1
-
-else
-
-AM_CPPFLAGS = \
- -DPIPE_SEARCH_DIR=\"$(libdir)/gallium-pipe\" \
- $(GALLIUM_PIPE_LOADER_DEFINES)
-
-endif
-
ninedir = $(D3D_DRIVER_INSTALL_DIR)
nine_LTLIBRARIES = d3dadapter9.la
@@ -78,7 +65,6 @@ d3dadapter9_la_LIBADD = \
$(top_builddir)/src/glsl/libnir.la \
$(top_builddir)/src/gallium/state_trackers/nine/libninetracker.la \
$(top_builddir)/src/util/libmesautil.la \
- $(top_builddir)/src/gallium/winsys/sw/wrapper/libwsw.la \
$(EXPAT_LIBS) \
$(GALLIUM_COMMON_LIB_DEPS)
@@ -87,7 +73,7 @@ EXTRA_DIST = d3dadapter9.sym
TARGET_DRIVERS =
TARGET_CPPFLAGS =
-TARGET_LIB_DEPS = $(top_builddir)/src/loader/libloader.la
+TARGET_LIB_DEPS =
include $(top_srcdir)/src/gallium/drivers/i915/Automake.inc
@@ -111,14 +97,16 @@ include $(top_srcdir)/src/gallium/drivers/llvmpipe/Automake.inc
if HAVE_GALLIUM_STATIC_TARGETS
d3dadapter9_la_CPPFLAGS = $(AM_CPPFLAGS) $(TARGET_CPPFLAGS)
-d3dadapter9_la_LIBADD += $(TARGET_LIB_DEPS) \
+d3dadapter9_la_LIBADD += \
+ $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_static.la \
+ $(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
+ $(TARGET_LIB_DEPS) \
$(TARGET_RADEON_WINSYS) $(TARGET_RADEON_COMMON)
else # HAVE_GALLIUM_STATIC_TARGETS
d3dadapter9_la_LIBADD += \
- $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
- $(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
+ $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_dynamic.la
endif # HAVE_GALLIUM_STATIC_TARGETS
diff --git a/src/gallium/targets/d3dadapter9/drm.c b/src/gallium/targets/d3dadapter9/drm.c
index fabc820f268..ad712db05eb 100644
--- a/src/gallium/targets/d3dadapter9/drm.c
+++ b/src/gallium/targets/d3dadapter9/drm.c
@@ -20,6 +20,7 @@
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE. */
+/* XXX: header order is slightly screwy here */
#include "loader.h"
#include "adapter9.h"
@@ -29,8 +30,7 @@
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
-#include "target-helpers/inline_drm_helper.h"
-#include "target-helpers/inline_sw_helper.h"
+#include "target-helpers/drm_helper.h"
#include "state_tracker/drm_driver.h"
#include "d3dadapter/d3dadapter9.h"
@@ -91,53 +91,15 @@ drm_destroy( struct d3dadapter9_context *ctx )
else if (ctx->hal)
ctx->hal->destroy(ctx->hal);
-#if !GALLIUM_STATIC_TARGETS
if (drm->swdev)
pipe_loader_release(&drm->swdev, 1);
if (drm->dev)
pipe_loader_release(&drm->dev, 1);
-#endif
close(drm->fd);
FREE(ctx);
}
-/* read a DWORD in the form 0xnnnnnnnn, which is how sysfs pci id stuff is
- * formatted. */
-static inline DWORD
-read_file_dword( const char *name )
-{
- char buf[32];
- int fd, r;
-
- fd = open(name, O_RDONLY);
- if (fd < 0) {
- DBG("Unable to get PCI information from `%s'\n", name);
- return 0;
- }
-
- r = read(fd, buf, 32);
- close(fd);
-
- return (r > 0) ? (DWORD)strtol(buf, NULL, 0) : 0;
-}
-
-/* sysfs doesn't expose the revision as its own file, so this function grabs a
- * dword at an offset in the raw PCI header. The reason this isn't used for all
- * data is that the kernel will make corrections but not expose them in the raw
- * header bytes. */
-static inline DWORD
-read_config_dword( int fd,
- unsigned offset )
-{
- DWORD r = 0;
-
- if (lseek(fd, offset, SEEK_SET) != offset) { return 0; }
- if (read(fd, &r, 4) != 4) { return 0; }
-
- return r;
-}
-
static inline void
get_bus_info( int fd,
DWORD *vendorid,
@@ -215,26 +177,16 @@ drm_create_adapter( int fd,
driOptionCache userInitOptions;
int throttling_value_user = -2;
-#if !GALLIUM_STATIC_TARGETS
- const char *paths[] = {
- getenv("D3D9_DRIVERS_PATH"),
- getenv("D3D9_DRIVERS_DIR"),
- PIPE_SEARCH_DIR
- };
-#endif
-
if (!ctx) { return E_OUTOFMEMORY; }
ctx->base.destroy = drm_destroy;
+ /* Although the fd is provided from external source, mesa/nine
+ * takes ownership of it. */
fd = loader_get_user_preferred_fd(fd, &different_device);
ctx->fd = fd;
ctx->base.linear_framebuffer = !!different_device;
-#if GALLIUM_STATIC_TARGETS
- ctx->base.hal = dd_create_screen(fd);
-#else
- /* use pipe-loader to dlopen appropriate drm driver */
if (!pipe_loader_drm_probe_fd(&ctx->dev, fd)) {
ERR("Failed to probe drm fd %d.\n", fd);
FREE(ctx);
@@ -242,26 +194,15 @@ drm_create_adapter( int fd,
return D3DERR_DRIVERINTERNALERROR;
}
- /* use pipe-loader to create a drm screen (hal) */
- ctx->base.hal = NULL;
- for (i = 0; !ctx->base.hal && i < Elements(paths); ++i) {
- if (!paths[i]) { continue; }
- ctx->base.hal = pipe_loader_create_screen(ctx->dev, paths[i]);
- }
-#endif
+ ctx->base.hal = pipe_loader_create_screen(ctx->dev);
if (!ctx->base.hal) {
ERR("Unable to load requested driver.\n");
drm_destroy(&ctx->base);
return D3DERR_DRIVERINTERNALERROR;
}
-#if GALLIUM_STATIC_TARGETS
- dmabuf_ret = dd_configuration(DRM_CONF_SHARE_FD);
- throttle_ret = dd_configuration(DRM_CONF_THROTTLE);
-#else
dmabuf_ret = pipe_loader_configuration(ctx->dev, DRM_CONF_SHARE_FD);
throttle_ret = pipe_loader_configuration(ctx->dev, DRM_CONF_THROTTLE);
-#endif // GALLIUM_STATIC_TARGETS
if (!dmabuf_ret || !dmabuf_ret->val.val_bool) {
ERR("The driver is not capable of dma-buf sharing."
"Abandon to load nine state tracker\n");
@@ -308,18 +249,10 @@ drm_create_adapter( int fd,
driDestroyOptionCache(&userInitOptions);
driDestroyOptionInfo(&defaultInitOptions);
-#if GALLIUM_STATIC_TARGETS
- ctx->base.ref = ninesw_create_screen(ctx->base.hal);
-#else
/* wrap it to create a software screen that can share resources */
- if (pipe_loader_sw_probe_wrapped(&ctx->swdev, ctx->base.hal)) {
- ctx->base.ref = NULL;
- for (i = 0; !ctx->base.ref && i < Elements(paths); ++i) {
- if (!paths[i]) { continue; }
- ctx->base.ref = pipe_loader_create_screen(ctx->swdev, paths[i]);
- }
- }
-#endif
+ if (pipe_loader_sw_probe_wrapped(&ctx->swdev, ctx->base.hal))
+ ctx->base.ref = pipe_loader_create_screen(ctx->swdev);
+
if (!ctx->base.ref) {
ERR("Couldn't wrap drm screen to swrast screen. Software devices "
"will be unavailable.\n");
diff --git a/src/gallium/targets/dri/Android.mk b/src/gallium/targets/dri/Android.mk
index a33d7f83671..2d9610ee9ab 100644
--- a/src/gallium/targets/dri/Android.mk
+++ b/src/gallium/targets/dri/Android.mk
@@ -35,7 +35,7 @@ endif
LOCAL_SRC_FILES := target.c
-LOCAL_CFLAGS := -DDRI_TARGET
+LOCAL_CFLAGS :=
LOCAL_SHARED_LIBRARIES := \
libdl \
@@ -108,6 +108,7 @@ LOCAL_STATIC_LIBRARIES := \
libmesa_dri_common \
libmesa_megadriver_stub \
libmesa_gallium \
+ libmesa_pipe_loader \
libmesa_util \
libmesa_loader \
diff --git a/src/gallium/targets/dri/Makefile.am b/src/gallium/targets/dri/Makefile.am
index 95efdd4451c..2666524fbfe 100644
--- a/src/gallium/targets/dri/Makefile.am
+++ b/src/gallium/targets/dri/Makefile.am
@@ -10,7 +10,6 @@ AM_CFLAGS = \
AM_CPPFLAGS = \
$(DEFINES) \
- -DDRI_TARGET \
-DGALLIUM_DDEBUG \
-DGALLIUM_NOOP \
-DGALLIUM_RBUG \
@@ -65,7 +64,7 @@ EXTRA_DIST = \
TARGET_DRIVERS =
TARGET_CPPFLAGS =
-TARGET_LIB_DEPS = $(top_builddir)/src/loader/libloader.la
+TARGET_LIB_DEPS =
include $(top_srcdir)/src/gallium/drivers/i915/Automake.inc
@@ -92,14 +91,16 @@ if HAVE_GALLIUM_STATIC_TARGETS
gallium_dri_la_SOURCES += target.c
gallium_dri_la_CPPFLAGS = $(AM_CPPFLAGS) $(TARGET_CPPFLAGS)
-gallium_dri_la_LIBADD += $(TARGET_LIB_DEPS) \
+gallium_dri_la_LIBADD += \
+ $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_static.la \
+ $(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
+ $(TARGET_LIB_DEPS) \
$(TARGET_RADEON_WINSYS) $(TARGET_RADEON_COMMON)
else # HAVE_GALLIUM_STATIC_TARGETS
gallium_dri_la_LIBADD += \
- $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
- $(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
+ $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_dynamic.la
endif # HAVE_GALLIUM_STATIC_TARGETS
diff --git a/src/gallium/targets/dri/SConscript b/src/gallium/targets/dri/SConscript
index 2fb0da09200..b4516598675 100644
--- a/src/gallium/targets/dri/SConscript
+++ b/src/gallium/targets/dri/SConscript
@@ -30,7 +30,6 @@ env.PkgUseModules('DRM')
env.Append(CPPDEFINES = [
'GALLIUM_VMWGFX',
'GALLIUM_SOFTPIPE',
- 'DRI_TARGET',
])
env.Prepend(LIBS = [
@@ -39,6 +38,7 @@ env.Prepend(LIBS = [
svga,
ws_dri,
softpipe,
+ pipe_loader,
libloader,
mesautil,
mesa,
diff --git a/src/gallium/targets/dri/target.c b/src/gallium/targets/dri/target.c
index 32a11ef6281..d6fbd01b88f 100644
--- a/src/gallium/targets/dri/target.c
+++ b/src/gallium/targets/dri/target.c
@@ -1,2 +1,163 @@
-#include "target-helpers/inline_drm_helper.h"
-#include "target-helpers/inline_sw_helper.h"
+#include "target-helpers/drm_helper.h"
+
+#include "dri_screen.h"
+
+#if defined(GALLIUM_SOFTPIPE)
+
+const __DRIextension **__driDriverGetExtensions_swrast(void);
+
+PUBLIC const __DRIextension **__driDriverGetExtensions_swrast(void)
+{
+ globalDriverAPI = &galliumsw_driver_api;
+ return galliumsw_driver_extensions;
+}
+
+#if defined(HAVE_LIBDRM)
+
+const __DRIextension **__driDriverGetExtensions_kms_swrast(void);
+
+PUBLIC const __DRIextension **__driDriverGetExtensions_kms_swrast(void)
+{
+ globalDriverAPI = &dri_kms_driver_api;
+ return galliumdrm_driver_extensions;
+}
+
+#endif
+#endif
+
+#if defined(GALLIUM_I915)
+
+const __DRIextension **__driDriverGetExtensions_i915(void);
+
+PUBLIC const __DRIextension **__driDriverGetExtensions_i915(void)
+{
+ globalDriverAPI = &galliumdrm_driver_api;
+ return galliumdrm_driver_extensions;
+}
+#endif
+
+#if defined(GALLIUM_ILO)
+
+const __DRIextension **__driDriverGetExtensions_i965(void);
+
+PUBLIC const __DRIextension **__driDriverGetExtensions_i965(void)
+{
+ globalDriverAPI = &galliumdrm_driver_api;
+ return galliumdrm_driver_extensions;
+}
+#endif
+
+#if defined(GALLIUM_NOUVEAU)
+
+const __DRIextension **__driDriverGetExtensions_nouveau(void);
+
+PUBLIC const __DRIextension **__driDriverGetExtensions_nouveau(void)
+{
+ globalDriverAPI = &galliumdrm_driver_api;
+ return galliumdrm_driver_extensions;
+}
+#endif
+
+#if defined(GALLIUM_R300)
+
+const __DRIextension **__driDriverGetExtensions_r300(void);
+
+PUBLIC const __DRIextension **__driDriverGetExtensions_r300(void)
+{
+ globalDriverAPI = &galliumdrm_driver_api;
+ return galliumdrm_driver_extensions;
+}
+#endif
+
+#if defined(GALLIUM_R600)
+
+const __DRIextension **__driDriverGetExtensions_r600(void);
+
+PUBLIC const __DRIextension **__driDriverGetExtensions_r600(void)
+{
+ globalDriverAPI = &galliumdrm_driver_api;
+ return galliumdrm_driver_extensions;
+}
+#endif
+
+#if defined(GALLIUM_RADEONSI)
+
+const __DRIextension **__driDriverGetExtensions_radeonsi(void);
+
+PUBLIC const __DRIextension **__driDriverGetExtensions_radeonsi(void)
+{
+ globalDriverAPI = &galliumdrm_driver_api;
+ return galliumdrm_driver_extensions;
+}
+#endif
+
+#if defined(GALLIUM_VMWGFX)
+
+const __DRIextension **__driDriverGetExtensions_vmwgfx(void);
+
+PUBLIC const __DRIextension **__driDriverGetExtensions_vmwgfx(void)
+{
+ globalDriverAPI = &galliumdrm_driver_api;
+ return galliumdrm_driver_extensions;
+}
+#endif
+
+#if defined(GALLIUM_FREEDRENO)
+
+const __DRIextension **__driDriverGetExtensions_msm(void);
+
+PUBLIC const __DRIextension **__driDriverGetExtensions_msm(void)
+{
+ globalDriverAPI = &galliumdrm_driver_api;
+ return galliumdrm_driver_extensions;
+}
+
+const __DRIextension **__driDriverGetExtensions_kgsl(void);
+
+PUBLIC const __DRIextension **__driDriverGetExtensions_kgsl(void)
+{
+ globalDriverAPI = &galliumdrm_driver_api;
+ return galliumdrm_driver_extensions;
+}
+#endif
+
+#if defined(GALLIUM_VIRGL)
+
+const __DRIextension **__driDriverGetExtensions_virtio_gpu(void);
+
+PUBLIC const __DRIextension **__driDriverGetExtensions_virtio_gpu(void)
+{
+ globalDriverAPI = &galliumdrm_driver_api;
+ return galliumdrm_driver_extensions;
+}
+#endif
+
+#if defined(GALLIUM_VC4)
+
+const __DRIextension **__driDriverGetExtensions_vc4(void);
+
+PUBLIC const __DRIextension **__driDriverGetExtensions_vc4(void)
+{
+ globalDriverAPI = &galliumdrm_driver_api;
+ return galliumdrm_driver_extensions;
+}
+
+#if defined(USE_VC4_SIMULATOR)
+const __DRIextension **__driDriverGetExtensions_i965(void);
+
+/**
+ * When building using the simulator (on x86), we advertise ourselves as the
+ * i965 driver so that you can just make a directory with a link from
+ * i965_dri.so to the built vc4_dri.so, and point LIBGL_DRIVERS_PATH to that
+ * on your i965-using host to run the driver under simulation.
+ *
+ * This is, of course, incompatible with building with the ilo driver, but you
+ * shouldn't be building that anyway.
+ */
+PUBLIC const __DRIextension **__driDriverGetExtensions_i965(void)
+{
+ globalDriverAPI = &galliumdrm_driver_api;
+ return galliumdrm_driver_extensions;
+}
+#endif
+#endif
diff --git a/src/gallium/targets/omx/Makefile.am b/src/gallium/targets/omx/Makefile.am
index a4dff487dd8..3bdb9eb7e61 100644
--- a/src/gallium/targets/omx/Makefile.am
+++ b/src/gallium/targets/omx/Makefile.am
@@ -40,7 +40,7 @@ if HAVE_GALLIUM_STATIC_TARGETS
TARGET_DRIVERS =
TARGET_CPPFLAGS =
-TARGET_LIB_DEPS = $(top_builddir)/src/loader/libloader.la
+TARGET_LIB_DEPS =
include $(top_srcdir)/src/gallium/drivers/nouveau/Automake.inc
@@ -50,14 +50,16 @@ include $(top_srcdir)/src/gallium/drivers/radeonsi/Automake.inc
libomx_mesa_la_SOURCES += target.c
libomx_mesa_la_CPPFLAGS = $(TARGET_CPPFLAGS)
-libomx_mesa_la_LIBADD += $(TARGET_LIB_DEPS) \
+libomx_mesa_la_LIBADD += \
+ $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_static.la \
+ $(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
+ $(TARGET_LIB_DEPS) \
$(TARGET_RADEON_WINSYS) $(TARGET_RADEON_COMMON)
else # HAVE_GALLIUM_STATIC_TARGETS
libomx_mesa_la_LIBADD += \
- $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
- $(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
+ $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_dynamic.la
endif # HAVE_GALLIUM_STATIC_TARGETS
diff --git a/src/gallium/targets/omx/target.c b/src/gallium/targets/omx/target.c
index fde4a4a7dcf..42b1346d341 100644
--- a/src/gallium/targets/omx/target.c
+++ b/src/gallium/targets/omx/target.c
@@ -1 +1 @@
-#include "target-helpers/inline_drm_helper.h"
+#include "target-helpers/drm_helper.h"
diff --git a/src/gallium/targets/opencl/Makefile.am b/src/gallium/targets/opencl/Makefile.am
index c78b26832ff..3cb29766724 100644
--- a/src/gallium/targets/opencl/Makefile.am
+++ b/src/gallium/targets/opencl/Makefile.am
@@ -15,11 +15,10 @@ lib@OPENCL_LIBNAME@_la_LDFLAGS += \
endif
lib@OPENCL_LIBNAME@_la_LIBADD = \
- $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
+ $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_dynamic.la \
$(top_builddir)/src/gallium/state_trackers/clover/libclover.la \
$(top_builddir)/src/gallium/auxiliary/libgallium.la \
$(top_builddir)/src/util/libmesautil.la \
- $(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
$(ELF_LIB) \
-ldl \
-lclangCodeGen \
diff --git a/src/gallium/targets/pipe-loader/Makefile.am b/src/gallium/targets/pipe-loader/Makefile.am
index 4f25b4f6073..4bc3b55f26b 100644
--- a/src/gallium/targets/pipe-loader/Makefile.am
+++ b/src/gallium/targets/pipe-loader/Makefile.am
@@ -27,6 +27,7 @@ AM_CPPFLAGS = \
-I$(top_srcdir)/include \
-I$(top_srcdir)/src/gallium/drivers \
-I$(top_srcdir)/src/gallium/winsys \
+ $(GALLIUM_PIPE_LOADER_DEFINES) \
$(LIBDRM_CFLAGS) \
$(VISIBILITY_CFLAGS) \
-DGALLIUM_RBUG \
@@ -208,6 +209,10 @@ AM_CPPFLAGS += -DGALLIUM_LLVMPIPE
pipe_swrast_la_LIBADD += \
$(top_builddir)/src/gallium/drivers/llvmpipe/libllvmpipe.la
endif
+
+pipe_swrast_la_LIBADD += \
+ $(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
+
endif
EXTRA_DIST = pipe.sym
diff --git a/src/gallium/targets/pipe-loader/pipe.sym b/src/gallium/targets/pipe-loader/pipe.sym
index 19b1d77b040..b2fa619f7de 100644
--- a/src/gallium/targets/pipe-loader/pipe.sym
+++ b/src/gallium/targets/pipe-loader/pipe.sym
@@ -1,7 +1,7 @@
{
global:
driver_descriptor;
- swrast_create_screen;
+ swrast_driver_descriptor;
local:
*;
};
diff --git a/src/gallium/targets/pipe-loader/pipe_swrast.c b/src/gallium/targets/pipe-loader/pipe_swrast.c
index f7f354acf3f..cf617f37e20 100644
--- a/src/gallium/targets/pipe-loader/pipe_swrast.c
+++ b/src/gallium/targets/pipe-loader/pipe_swrast.c
@@ -1,7 +1,11 @@
#include "target-helpers/inline_sw_helper.h"
#include "target-helpers/inline_debug_helper.h"
-#include "state_tracker/drm_driver.h"
+#include "state_tracker/sw_driver.h"
+#include "sw/dri/dri_sw_winsys.h"
+#include "sw/kms-dri/kms_dri_sw_winsys.h"
+#include "sw/null/null_sw_winsys.h"
+#include "sw/wrapper/wrapper_sw_winsys.h"
PUBLIC struct pipe_screen *
swrast_create_screen(struct sw_winsys *ws);
@@ -17,3 +21,31 @@ swrast_create_screen(struct sw_winsys *ws)
return screen;
}
+
+PUBLIC
+struct sw_driver_descriptor swrast_driver_descriptor = {
+ .create_screen = swrast_create_screen,
+ .winsys = {
+#ifdef HAVE_PIPE_LOADER_DRI
+ {
+ .name = "dri",
+ .create_winsys = dri_create_sw_winsys,
+ },
+#endif
+#ifdef HAVE_PIPE_LOADER_KMS
+ {
+ .name = "kms_dri",
+ .create_winsys = kms_dri_create_winsys,
+ },
+#endif
+ {
+ .name = "null",
+ .create_winsys = null_sw_create,
+ },
+ {
+ .name = "wrapped",
+ .create_winsys = wrapper_sw_winsys_wrap_pipe_screen,
+ },
+ { 0 },
+ }
+};
diff --git a/src/gallium/targets/va/Makefile.am b/src/gallium/targets/va/Makefile.am
index 9613f041b58..733e7acb455 100644
--- a/src/gallium/targets/va/Makefile.am
+++ b/src/gallium/targets/va/Makefile.am
@@ -40,21 +40,23 @@ if HAVE_GALLIUM_STATIC_TARGETS
TARGET_DRIVERS =
TARGET_CPPFLAGS =
-TARGET_LIB_DEPS = $(top_builddir)/src/loader/libloader.la
+TARGET_LIB_DEPS =
include $(top_srcdir)/src/gallium/drivers/r600/Automake.inc
include $(top_srcdir)/src/gallium/drivers/radeonsi/Automake.inc
gallium_drv_video_la_SOURCES += target.c
gallium_drv_video_la_CPPFLAGS = $(TARGET_CPPFLAGS)
-gallium_drv_video_la_LIBADD += $(TARGET_LIB_DEPS) \
+gallium_drv_video_la_LIBADD += \
+ $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_static.la \
+ $(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
+ $(TARGET_LIB_DEPS) \
$(TARGET_RADEON_WINSYS) $(TARGET_RADEON_COMMON)
else # HAVE_GALLIUM_STATIC_TARGETS
gallium_drv_video_la_LIBADD += \
- $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
- $(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
+ $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_dynamic.la
endif # HAVE_GALLIUM_STATIC_TARGETS
diff --git a/src/gallium/targets/va/target.c b/src/gallium/targets/va/target.c
index fde4a4a7dcf..42b1346d341 100644
--- a/src/gallium/targets/va/target.c
+++ b/src/gallium/targets/va/target.c
@@ -1 +1 @@
-#include "target-helpers/inline_drm_helper.h"
+#include "target-helpers/drm_helper.h"
diff --git a/src/gallium/targets/vdpau/Makefile.am b/src/gallium/targets/vdpau/Makefile.am
index 7eb62c1cc78..d388f8b5014 100644
--- a/src/gallium/targets/vdpau/Makefile.am
+++ b/src/gallium/targets/vdpau/Makefile.am
@@ -47,7 +47,7 @@ EXTRA_DIST = \
TARGET_DRIVERS =
TARGET_CPPFLAGS =
-TARGET_LIB_DEPS = $(top_builddir)/src/loader/libloader.la
+TARGET_LIB_DEPS =
include $(top_srcdir)/src/gallium/drivers/nouveau/Automake.inc
@@ -59,14 +59,16 @@ if HAVE_GALLIUM_STATIC_TARGETS
libvdpau_gallium_la_SOURCES += target.c
libvdpau_gallium_la_CPPFLAGS = $(TARGET_CPPFLAGS)
-libvdpau_gallium_la_LIBADD += $(TARGET_LIB_DEPS) \
+libvdpau_gallium_la_LIBADD += \
+ $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_static.la \
+ $(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
+ $(TARGET_LIB_DEPS) \
$(TARGET_RADEON_WINSYS) $(TARGET_RADEON_COMMON)
else # HAVE_GALLIUM_STATIC_TARGETS
libvdpau_gallium_la_LIBADD += \
- $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
- $(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
+ $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_dynamic.la
endif # HAVE_GALLIUM_STATIC_TARGETS
diff --git a/src/gallium/targets/vdpau/target.c b/src/gallium/targets/vdpau/target.c
index fde4a4a7dcf..42b1346d341 100644
--- a/src/gallium/targets/vdpau/target.c
+++ b/src/gallium/targets/vdpau/target.c
@@ -1 +1 @@
-#include "target-helpers/inline_drm_helper.h"
+#include "target-helpers/drm_helper.h"
diff --git a/src/gallium/targets/xa/Makefile.am b/src/gallium/targets/xa/Makefile.am
index 02c42c665ed..a63fd6903a4 100644
--- a/src/gallium/targets/xa/Makefile.am
+++ b/src/gallium/targets/xa/Makefile.am
@@ -60,7 +60,7 @@ if HAVE_GALLIUM_STATIC_TARGETS
TARGET_DRIVERS =
TARGET_CPPFLAGS =
-TARGET_LIB_DEPS = $(top_builddir)/src/loader/libloader.la
+TARGET_LIB_DEPS =
include $(top_srcdir)/src/gallium/drivers/i915/Automake.inc
@@ -74,13 +74,15 @@ include $(top_srcdir)/src/gallium/drivers/freedreno/Automake.inc
libxatracker_la_SOURCES += target.c
libxatracker_la_CPPFLAGS = $(TARGET_CPPFLAGS)
-libxatracker_la_LIBADD += $(TARGET_LIB_DEPS)
+libxatracker_la_LIBADD += \
+ $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_static.la \
+ $(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
+ $(TARGET_LIB_DEPS)
else # HAVE_GALLIUM_STATIC_TARGETS
libxatracker_la_LIBADD += \
- $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
- $(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
+ $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_dynamic.la
endif # HAVE_GALLIUM_STATIC_TARGETS
diff --git a/src/gallium/targets/xa/target.c b/src/gallium/targets/xa/target.c
index fde4a4a7dcf..42b1346d341 100644
--- a/src/gallium/targets/xa/target.c
+++ b/src/gallium/targets/xa/target.c
@@ -1 +1 @@
-#include "target-helpers/inline_drm_helper.h"
+#include "target-helpers/drm_helper.h"
diff --git a/src/gallium/targets/xvmc/Makefile.am b/src/gallium/targets/xvmc/Makefile.am
index b3285890822..fdc5f4b7318 100644
--- a/src/gallium/targets/xvmc/Makefile.am
+++ b/src/gallium/targets/xvmc/Makefile.am
@@ -38,7 +38,7 @@ EXTRA_DIST = xvmc.sym
TARGET_DRIVERS =
TARGET_CPPFLAGS =
-TARGET_LIB_DEPS = $(top_builddir)/src/loader/libloader.la
+TARGET_LIB_DEPS =
include $(top_srcdir)/src/gallium/drivers/nouveau/Automake.inc
@@ -48,13 +48,15 @@ if HAVE_GALLIUM_STATIC_TARGETS
libXvMCgallium_la_SOURCES += target.c
libXvMCgallium_la_CPPFLAGS = $(TARGET_CPPFLAGS)
-libXvMCgallium_la_LIBADD += $(TARGET_LIB_DEPS) \
+libXvMCgallium_la_LIBADD += \
+ $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_static.la \
+ $(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
+ $(TARGET_LIB_DEPS) \
$(TARGET_RADEON_WINSYS) $(TARGET_RADEON_COMMON)
else # HAVE_GALLIUM_STATIC_TARGETS
libXvMCgallium_la_LIBADD += \
- $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
- $(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
+ $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_dynamic.la
endif # HAVE_GALLIUM_STATIC_TARGETS
diff --git a/src/gallium/targets/xvmc/target.c b/src/gallium/targets/xvmc/target.c
index fde4a4a7dcf..42b1346d341 100644
--- a/src/gallium/targets/xvmc/target.c
+++ b/src/gallium/targets/xvmc/target.c
@@ -1 +1 @@
-#include "target-helpers/inline_drm_helper.h"
+#include "target-helpers/drm_helper.h"
diff --git a/src/gallium/tests/trivial/Makefile.am b/src/gallium/tests/trivial/Makefile.am
index 56b7f3ffc66..585fb699e6c 100644
--- a/src/gallium/tests/trivial/Makefile.am
+++ b/src/gallium/tests/trivial/Makefile.am
@@ -5,17 +5,10 @@ PIPE_SRC_DIR = $(top_builddir)/src/gallium/targets/pipe-loader
AM_CFLAGS = \
$(GALLIUM_CFLAGS)
-AM_CPPFLAGS = \
- -I$(top_srcdir)/src/gallium/drivers \
- -I$(top_srcdir)/src/gallium/winsys \
- -DPIPE_SEARCH_DIR=\"$(PIPE_SRC_DIR)/.libs\" \
- $(GALLIUM_PIPE_LOADER_DEFINES)
-
LDADD = \
- $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
+ $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_dynamic.la \
$(top_builddir)/src/gallium/auxiliary/libgallium.la \
$(top_builddir)/src/util/libmesautil.la \
- $(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
$(GALLIUM_COMMON_LIB_DEPS)
noinst_PROGRAMS = compute tri quad-tex
diff --git a/src/gallium/tests/trivial/compute.c b/src/gallium/tests/trivial/compute.c
index b344f78b25c..bcdfb11c4f1 100644
--- a/src/gallium/tests/trivial/compute.c
+++ b/src/gallium/tests/trivial/compute.c
@@ -74,7 +74,7 @@ static void init_ctx(struct context *ctx)
ret = pipe_loader_probe(&ctx->dev, 1);
assert(ret);
- ctx->screen = pipe_loader_create_screen(ctx->dev, PIPE_SEARCH_DIR);
+ ctx->screen = pipe_loader_create_screen(ctx->dev);
assert(ctx->screen);
ctx->pipe = ctx->screen->context_create(ctx->screen, NULL, 0);
diff --git a/src/gallium/tests/trivial/quad-tex.c b/src/gallium/tests/trivial/quad-tex.c
index f66f63043da..4c5a9200a52 100644
--- a/src/gallium/tests/trivial/quad-tex.c
+++ b/src/gallium/tests/trivial/quad-tex.c
@@ -96,7 +96,7 @@ static void init_prog(struct program *p)
assert(ret);
/* init a pipe screen */
- p->screen = pipe_loader_create_screen(p->dev, PIPE_SEARCH_DIR);
+ p->screen = pipe_loader_create_screen(p->dev);
assert(p->screen);
/* create the pipe driver context and cso context */
diff --git a/src/gallium/tests/trivial/tri.c b/src/gallium/tests/trivial/tri.c
index a555200842e..c71a63f44e5 100644
--- a/src/gallium/tests/trivial/tri.c
+++ b/src/gallium/tests/trivial/tri.c
@@ -91,7 +91,7 @@ static void init_prog(struct program *p)
assert(ret);
/* init a pipe screen */
- p->screen = pipe_loader_create_screen(p->dev, PIPE_SEARCH_DIR);
+ p->screen = pipe_loader_create_screen(p->dev);
assert(p->screen);
/* create the pipe driver context and cso context */
diff --git a/src/glsl/Android.gen.mk b/src/glsl/Android.gen.mk
index 6898fb0d492..59cc8577a6e 100644
--- a/src/glsl/Android.gen.mk
+++ b/src/glsl/Android.gen.mk
@@ -38,7 +38,8 @@ LOCAL_C_INCLUDES += \
$(MESA_TOP)/src/glsl/nir
LOCAL_EXPORT_C_INCLUDE_DIRS += \
- $(intermediates)/nir
+ $(intermediates)/nir \
+ $(MESA_TOP)/src/glsl/nir
LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/, \
$(LIBGLCPP_GENERATED_FILES) \
diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index 957fd6b90ba..0c9fd75d206 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -22,10 +22,12 @@ NIR_FILES = \
nir/glsl_to_nir.h \
nir/glsl_types.cpp \
nir/glsl_types.h \
+ nir/builtin_type_macros.h \
nir/nir.c \
nir/nir.h \
nir/nir_array.h \
nir/nir_builder.h \
+ nir/nir_clone.c \
nir/nir_constant_expressions.h \
nir/nir_control_flow.c \
nir/nir_control_flow.h \
@@ -102,7 +104,6 @@ LIBGLSL_FILES = \
blob.c \
blob.h \
builtin_functions.cpp \
- builtin_type_macros.h \
builtin_types.cpp \
builtin_variables.cpp \
glsl_parser_extras.cpp \
diff --git a/src/glsl/ast.h b/src/glsl/ast.h
index 1b75234d578..3bea63ea0ed 100644
--- a/src/glsl/ast.h
+++ b/src/glsl/ast.h
@@ -336,7 +336,7 @@ public:
array_dimensions.push_tail(&dim->link);
}
- const bool is_single_dimension()
+ bool is_single_dimension() const
{
return this->array_dimensions.tail_pred->prev != NULL &&
this->array_dimensions.tail_pred->prev->is_head_sentinel();
@@ -350,6 +350,26 @@ public:
exec_list array_dimensions;
};
+class ast_layout_expression : public ast_node {
+public:
+ ast_layout_expression(const struct YYLTYPE &locp, ast_expression *expr)
+ {
+ set_location(locp);
+ layout_const_expressions.push_tail(&expr->link);
+ }
+
+ bool process_qualifier_constant(struct _mesa_glsl_parse_state *state,
+ const char *qual_indentifier,
+ unsigned *value, bool can_be_zero);
+
+ void merge_qualifier(ast_layout_expression *l_expr)
+ {
+ layout_const_expressions.append_list(&l_expr->layout_const_expressions);
+ }
+
+ exec_list layout_const_expressions;
+};
+
/**
* C-style aggregate initialization class
*
@@ -558,7 +578,7 @@ struct ast_type_qualifier {
unsigned precision:2;
/** Geometry shader invocations for GL_ARB_gpu_shader5. */
- int invocations;
+ ast_layout_expression *invocations;
/**
* Location specified via GL_ARB_explicit_attrib_location layout
@@ -566,20 +586,20 @@ struct ast_type_qualifier {
* \note
* This field is only valid if \c explicit_location is set.
*/
- int location;
+ ast_expression *location;
/**
* Index specified via GL_ARB_explicit_attrib_location layout
*
* \note
* This field is only valid if \c explicit_index is set.
*/
- int index;
+ ast_expression *index;
/** Maximum output vertices in GLSL 1.50 geometry shaders. */
- int max_vertices;
+ ast_layout_expression *max_vertices;
/** Stream in GLSL 1.50 geometry shaders. */
- unsigned stream;
+ ast_expression *stream;
/**
* Input or output primitive type in GLSL 1.50 geometry shaders
@@ -593,7 +613,7 @@ struct ast_type_qualifier {
* \note
* This field is only valid if \c explicit_binding is set.
*/
- int binding;
+ ast_expression *binding;
/**
* Offset specified via GL_ARB_shader_atomic_counter's "offset"
@@ -602,14 +622,14 @@ struct ast_type_qualifier {
* \note
* This field is only valid if \c explicit_offset is set.
*/
- int offset;
+ ast_expression *offset;
/**
* Local size specified via GL_ARB_compute_shader's "local_size_{x,y,z}"
* layout qualifier. Element i of this array is only valid if
* flags.q.local_size & (1 << i) is set.
*/
- int local_size[3];
+ ast_layout_expression *local_size[3];
/** Tessellation evaluation shader: vertex spacing (equal, fractional even/odd) */
GLenum vertex_spacing;
@@ -621,7 +641,7 @@ struct ast_type_qualifier {
bool point_mode;
/** Tessellation control shader: number of output vertices */
- int vertices;
+ ast_layout_expression *vertices;
/**
* Image format specified with an ARB_shader_image_load_store
@@ -752,7 +772,7 @@ public:
class ast_fully_specified_type : public ast_node {
public:
virtual void print(void) const;
- bool has_qualifiers() const;
+ bool has_qualifiers(_mesa_glsl_parse_state *state) const;
ast_fully_specified_type() : qualifier(), specifier(NULL)
{
@@ -1093,17 +1113,13 @@ public:
class ast_tcs_output_layout : public ast_node
{
public:
- ast_tcs_output_layout(const struct YYLTYPE &locp, int vertices)
- : vertices(vertices)
+ ast_tcs_output_layout(const struct YYLTYPE &locp)
{
set_location(locp);
}
virtual ir_rvalue *hir(exec_list *instructions,
struct _mesa_glsl_parse_state *state);
-
-private:
- const int vertices;
};
@@ -1135,9 +1151,12 @@ private:
class ast_cs_input_layout : public ast_node
{
public:
- ast_cs_input_layout(const struct YYLTYPE &locp, const unsigned *local_size)
+ ast_cs_input_layout(const struct YYLTYPE &locp,
+ ast_layout_expression **local_size)
{
- memcpy(this->local_size, local_size, sizeof(this->local_size));
+ for (int i = 0; i < 3; i++) {
+ this->local_size[i] = local_size[i];
+ }
set_location(locp);
}
@@ -1145,7 +1164,7 @@ public:
struct _mesa_glsl_parse_state *state);
private:
- unsigned local_size[3];
+ ast_layout_expression *local_size[3];
};
/*@}*/
diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 65db2618895..52881a4da7a 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -2491,7 +2491,7 @@ validate_matrix_layout_for_type(struct _mesa_glsl_parse_state *state,
"uniform block layout qualifiers row_major and "
"column_major may not be applied to variables "
"outside of uniform blocks");
- } else if (!type->is_matrix()) {
+ } else if (!type->without_array()->is_matrix()) {
/* The OpenGL ES 3.0 conformance tests did not originally allow
* matrix layout qualifiers on non-matrices. However, the OpenGL
* 4.4 and OpenGL ES 3.0 (revision TBD) specifications were
@@ -2502,39 +2502,88 @@ validate_matrix_layout_for_type(struct _mesa_glsl_parse_state *state,
"uniform block layout qualifiers row_major and "
"column_major applied to non-matrix types may "
"be rejected by older compilers");
- } else if (type->is_record()) {
- /* We allow 'layout(row_major)' on structure types because it's the only
- * way to get row-major layouts on matrices contained in structures.
- */
- _mesa_glsl_warning(loc, state,
- "uniform block layout qualifiers row_major and "
- "column_major applied to structure types is not "
- "strictly conformant and may be rejected by other "
- "compilers");
}
}
static bool
-validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
+process_qualifier_constant(struct _mesa_glsl_parse_state *state,
YYLTYPE *loc,
- const glsl_type *type,
- const ast_type_qualifier *qual)
+ const char *qual_indentifier,
+ ast_expression *const_expression,
+ unsigned *value)
+{
+ exec_list dummy_instructions;
+
+ if (const_expression == NULL) {
+ *value = 0;
+ return true;
+ }
+
+ ir_rvalue *const ir = const_expression->hir(&dummy_instructions, state);
+
+ ir_constant *const const_int = ir->constant_expression_value();
+ if (const_int == NULL || !const_int->type->is_integer()) {
+ _mesa_glsl_error(loc, state, "%s must be an integral constant "
+ "expression", qual_indentifier);
+ return false;
+ }
+
+ if (const_int->value.i[0] < 0) {
+ _mesa_glsl_error(loc, state, "%s layout qualifier is invalid (%d < 0)",
+ qual_indentifier, const_int->value.u[0]);
+ return false;
+ }
+
+ /* If the location is const (and we've verified that
+ * it is) then no instructions should have been emitted
+ * when we converted it to HIR. If they were emitted,
+ * then either the location isn't const after all, or
+ * we are emitting unnecessary instructions.
+ */
+ assert(dummy_instructions.is_empty());
+
+ *value = const_int->value.u[0];
+ return true;
+}
+
+static bool
+validate_stream_qualifier(YYLTYPE *loc, struct _mesa_glsl_parse_state *state,
+ unsigned stream)
+{
+ if (stream >= state->ctx->Const.MaxVertexStreams) {
+ _mesa_glsl_error(loc, state,
+ "invalid stream specified %d is larger than "
+ "MAX_VERTEX_STREAMS - 1 (%d).",
+ stream, state->ctx->Const.MaxVertexStreams - 1);
+ return false;
+ }
+
+ return true;
+}
+
+static void
+apply_explicit_binding(struct _mesa_glsl_parse_state *state,
+ YYLTYPE *loc,
+ ir_variable *var,
+ const glsl_type *type,
+ const ast_type_qualifier *qual)
{
if (!qual->flags.q.uniform && !qual->flags.q.buffer) {
_mesa_glsl_error(loc, state,
"the \"binding\" qualifier only applies to uniforms and "
"shader storage buffer objects");
- return false;
+ return;
}
- if (qual->binding < 0) {
- _mesa_glsl_error(loc, state, "binding values must be >= 0");
- return false;
+ unsigned qual_binding;
+ if (!process_qualifier_constant(state, loc, "binding", qual->binding,
+ &qual_binding)) {
+ return;
}
const struct gl_context *const ctx = state->ctx;
unsigned elements = type->is_array() ? type->arrays_of_arrays_size() : 1;
- unsigned max_index = qual->binding + elements - 1;
+ unsigned max_index = qual_binding + elements - 1;
const glsl_type *base_type = type->without_array();
if (base_type->is_interface()) {
@@ -2550,11 +2599,11 @@ validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
*/
if (qual->flags.q.uniform &&
max_index >= ctx->Const.MaxUniformBufferBindings) {
- _mesa_glsl_error(loc, state, "layout(binding = %d) for %d UBOs exceeds "
+ _mesa_glsl_error(loc, state, "layout(binding = %u) for %d UBOs exceeds "
"the maximum number of UBO binding points (%d)",
- qual->binding, elements,
+ qual_binding, elements,
ctx->Const.MaxUniformBufferBindings);
- return false;
+ return;
}
/* SSBOs. From page 67 of the GLSL 4.30 specification:
@@ -2568,11 +2617,11 @@ validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
*/
if (qual->flags.q.buffer &&
max_index >= ctx->Const.MaxShaderStorageBufferBindings) {
- _mesa_glsl_error(loc, state, "layout(binding = %d) for %d SSBOs exceeds "
+ _mesa_glsl_error(loc, state, "layout(binding = %u) for %d SSBOs exceeds "
"the maximum number of SSBO binding points (%d)",
- qual->binding, elements,
+ qual_binding, elements,
ctx->Const.MaxShaderStorageBufferBindings);
- return false;
+ return;
}
} else if (base_type->is_sampler()) {
/* Samplers. From page 63 of the GLSL 4.20 specification:
@@ -2587,19 +2636,19 @@ validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
if (max_index >= limit) {
_mesa_glsl_error(loc, state, "layout(binding = %d) for %d samplers "
"exceeds the maximum number of texture image units "
- "(%d)", qual->binding, elements, limit);
+ "(%u)", qual_binding, elements, limit);
- return false;
+ return;
}
} else if (base_type->contains_atomic()) {
assert(ctx->Const.MaxAtomicBufferBindings <= MAX_COMBINED_ATOMIC_BUFFERS);
- if (unsigned(qual->binding) >= ctx->Const.MaxAtomicBufferBindings) {
+ if (qual_binding >= ctx->Const.MaxAtomicBufferBindings) {
_mesa_glsl_error(loc, state, "layout(binding = %d) exceeds the "
" maximum number of atomic counter buffer bindings"
- "(%d)", qual->binding,
+ "(%u)", qual_binding,
ctx->Const.MaxAtomicBufferBindings);
- return false;
+ return;
}
} else if (state->is_version(420, 310) && base_type->is_image()) {
assert(ctx->Const.MaxImageUnits <= MAX_IMAGE_UNITS);
@@ -2607,17 +2656,20 @@ validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
_mesa_glsl_error(loc, state, "Image binding %d exceeds the "
" maximum number of image units (%d)", max_index,
ctx->Const.MaxImageUnits);
- return false;
+ return;
}
} else {
_mesa_glsl_error(loc, state,
"the \"binding\" qualifier only applies to uniform "
"blocks, opaque variables, or arrays thereof");
- return false;
+ return;
}
- return true;
+ var->data.explicit_binding = true;
+ var->data.binding = qual_binding;
+
+ return;
}
@@ -2660,20 +2712,26 @@ interpret_interpolation_qualifier(const struct ast_type_qualifier *qual,
static void
-validate_explicit_location(const struct ast_type_qualifier *qual,
- ir_variable *var,
- struct _mesa_glsl_parse_state *state,
- YYLTYPE *loc)
+apply_explicit_location(const struct ast_type_qualifier *qual,
+ ir_variable *var,
+ struct _mesa_glsl_parse_state *state,
+ YYLTYPE *loc)
{
bool fail = false;
+ unsigned qual_location;
+ if (!process_qualifier_constant(state, loc, "location", qual->location,
+ &qual_location)) {
+ return;
+ }
+
/* Checks for GL_ARB_explicit_uniform_location. */
if (qual->flags.q.uniform) {
if (!state->check_explicit_uniform_location_allowed(loc, var))
return;
const struct gl_context *const ctx = state->ctx;
- unsigned max_loc = qual->location + var->type->uniform_locations() - 1;
+ unsigned max_loc = qual_location + var->type->uniform_locations() - 1;
if (max_loc >= ctx->Const.MaxUserAssignableUniformLocations) {
_mesa_glsl_error(loc, state, "location(s) consumed by uniform %s "
@@ -2683,7 +2741,7 @@ validate_explicit_location(const struct ast_type_qualifier *qual,
}
var->data.explicit_location = true;
- var->data.location = qual->location;
+ var->data.location = qual_location;
return;
}
@@ -2768,30 +2826,40 @@ validate_explicit_location(const struct ast_type_qualifier *qual,
switch (state->stage) {
case MESA_SHADER_VERTEX:
var->data.location = (var->data.mode == ir_var_shader_in)
- ? (qual->location + VERT_ATTRIB_GENERIC0)
- : (qual->location + VARYING_SLOT_VAR0);
+ ? (qual_location + VERT_ATTRIB_GENERIC0)
+ : (qual_location + VARYING_SLOT_VAR0);
break;
case MESA_SHADER_TESS_CTRL:
case MESA_SHADER_TESS_EVAL:
case MESA_SHADER_GEOMETRY:
if (var->data.patch)
- var->data.location = qual->location + VARYING_SLOT_PATCH0;
+ var->data.location = qual_location + VARYING_SLOT_PATCH0;
else
- var->data.location = qual->location + VARYING_SLOT_VAR0;
+ var->data.location = qual_location + VARYING_SLOT_VAR0;
break;
case MESA_SHADER_FRAGMENT:
var->data.location = (var->data.mode == ir_var_shader_out)
- ? (qual->location + FRAG_RESULT_DATA0)
- : (qual->location + VARYING_SLOT_VAR0);
+ ? (qual_location + FRAG_RESULT_DATA0)
+ : (qual_location + VARYING_SLOT_VAR0);
break;
case MESA_SHADER_COMPUTE:
assert(!"Unexpected shader type");
break;
}
- if (qual->flags.q.explicit_index) {
+ /* Check if index was set for the uniform instead of the function */
+ if (qual->flags.q.explicit_index && qual->flags.q.subroutine) {
+ _mesa_glsl_error(loc, state, "an index qualifier can only be "
+ "used with subroutine functions");
+ return;
+ }
+
+ unsigned qual_index;
+ if (qual->flags.q.explicit_index &&
+ process_qualifier_constant(state, loc, "index", qual->index,
+ &qual_index)) {
/* From the GLSL 4.30 specification, section 4.4.2 (Output
* Layout Qualifiers):
*
@@ -2801,12 +2869,12 @@ validate_explicit_location(const struct ast_type_qualifier *qual,
* Older specifications don't mandate a behavior; we take
* this as a clarification and always generate the error.
*/
- if (qual->index < 0 || qual->index > 1) {
+ if (qual_index > 1) {
_mesa_glsl_error(loc, state,
"explicit index may only be 0 or 1");
} else {
var->data.explicit_index = true;
- var->data.index = qual->index;
+ var->data.index = qual_index;
}
}
}
@@ -2939,6 +3007,221 @@ validate_array_dimensions(const glsl_type *t,
}
static void
+apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual,
+ ir_variable *var,
+ struct _mesa_glsl_parse_state *state,
+ YYLTYPE *loc)
+{
+ if (var->name != NULL && strcmp(var->name, "gl_FragCoord") == 0) {
+
+ /* Section 4.3.8.1, page 39 of GLSL 1.50 spec says:
+ *
+ * "Within any shader, the first redeclarations of gl_FragCoord
+ * must appear before any use of gl_FragCoord."
+ *
+ * Generate a compiler error if above condition is not met by the
+ * fragment shader.
+ */
+ ir_variable *earlier = state->symbols->get_variable("gl_FragCoord");
+ if (earlier != NULL &&
+ earlier->data.used &&
+ !state->fs_redeclares_gl_fragcoord) {
+ _mesa_glsl_error(loc, state,
+ "gl_FragCoord used before its first redeclaration "
+ "in fragment shader");
+ }
+
+ /* Make sure all gl_FragCoord redeclarations specify the same layout
+ * qualifiers.
+ */
+ if (is_conflicting_fragcoord_redeclaration(state, qual)) {
+ const char *const qual_string =
+ get_layout_qualifier_string(qual->flags.q.origin_upper_left,
+ qual->flags.q.pixel_center_integer);
+
+ const char *const state_string =
+ get_layout_qualifier_string(state->fs_origin_upper_left,
+ state->fs_pixel_center_integer);
+
+ _mesa_glsl_error(loc, state,
+ "gl_FragCoord redeclared with different layout "
+ "qualifiers (%s) and (%s) ",
+ state_string,
+ qual_string);
+ }
+ state->fs_origin_upper_left = qual->flags.q.origin_upper_left;
+ state->fs_pixel_center_integer = qual->flags.q.pixel_center_integer;
+ state->fs_redeclares_gl_fragcoord_with_no_layout_qualifiers =
+ !qual->flags.q.origin_upper_left && !qual->flags.q.pixel_center_integer;
+ state->fs_redeclares_gl_fragcoord =
+ state->fs_origin_upper_left ||
+ state->fs_pixel_center_integer ||
+ state->fs_redeclares_gl_fragcoord_with_no_layout_qualifiers;
+ }
+
+ var->data.pixel_center_integer = qual->flags.q.pixel_center_integer;
+ var->data.origin_upper_left = qual->flags.q.origin_upper_left;
+ if ((qual->flags.q.origin_upper_left || qual->flags.q.pixel_center_integer)
+ && (strcmp(var->name, "gl_FragCoord") != 0)) {
+ const char *const qual_string = (qual->flags.q.origin_upper_left)
+ ? "origin_upper_left" : "pixel_center_integer";
+
+ _mesa_glsl_error(loc, state,
+ "layout qualifier `%s' can only be applied to "
+ "fragment shader input `gl_FragCoord'",
+ qual_string);
+ }
+
+ if (qual->flags.q.explicit_location) {
+ apply_explicit_location(qual, var, state, loc);
+ } else if (qual->flags.q.explicit_index) {
+ if (!qual->flags.q.subroutine_def)
+ _mesa_glsl_error(loc, state,
+ "explicit index requires explicit location");
+ }
+
+ if (qual->flags.q.explicit_binding) {
+ apply_explicit_binding(state, loc, var, var->type, qual);
+ }
+
+ if (state->stage == MESA_SHADER_GEOMETRY &&
+ qual->flags.q.out && qual->flags.q.stream) {
+ unsigned qual_stream;
+ if (process_qualifier_constant(state, loc, "stream", qual->stream,
+ &qual_stream) &&
+ validate_stream_qualifier(loc, state, qual_stream)) {
+ var->data.stream = qual_stream;
+ }
+ }
+
+ if (var->type->contains_atomic()) {
+ if (var->data.mode == ir_var_uniform) {
+ if (var->data.explicit_binding) {
+ unsigned *offset =
+ &state->atomic_counter_offsets[var->data.binding];
+
+ if (*offset % ATOMIC_COUNTER_SIZE)
+ _mesa_glsl_error(loc, state,
+ "misaligned atomic counter offset");
+
+ var->data.atomic.offset = *offset;
+ *offset += var->type->atomic_size();
+
+ } else {
+ _mesa_glsl_error(loc, state,
+ "atomic counters require explicit binding point");
+ }
+ } else if (var->data.mode != ir_var_function_in) {
+ _mesa_glsl_error(loc, state, "atomic counters may only be declared as "
+ "function parameters or uniform-qualified "
+ "global variables");
+ }
+ }
+
+ /* Is the 'layout' keyword used with parameters that allow relaxed checking.
+ * Many implementations of GL_ARB_fragment_coord_conventions_enable and some
+ * implementations (only Mesa?) GL_ARB_explicit_attrib_location_enable
+ * allowed the layout qualifier to be used with 'varying' and 'attribute'.
+ * These extensions and all following extensions that add the 'layout'
+ * keyword have been modified to require the use of 'in' or 'out'.
+ *
+ * The following extension do not allow the deprecated keywords:
+ *
+ * GL_AMD_conservative_depth
+ * GL_ARB_conservative_depth
+ * GL_ARB_gpu_shader5
+ * GL_ARB_separate_shader_objects
+ * GL_ARB_tessellation_shader
+ * GL_ARB_transform_feedback3
+ * GL_ARB_uniform_buffer_object
+ *
+ * It is unknown whether GL_EXT_shader_image_load_store or GL_NV_gpu_shader5
+ * allow layout with the deprecated keywords.
+ */
+ const bool relaxed_layout_qualifier_checking =
+ state->ARB_fragment_coord_conventions_enable;
+
+ const bool uses_deprecated_qualifier = qual->flags.q.attribute
+ || qual->flags.q.varying;
+ if (qual->has_layout() && uses_deprecated_qualifier) {
+ if (relaxed_layout_qualifier_checking) {
+ _mesa_glsl_warning(loc, state,
+ "`layout' qualifier may not be used with "
+ "`attribute' or `varying'");
+ } else {
+ _mesa_glsl_error(loc, state,
+ "`layout' qualifier may not be used with "
+ "`attribute' or `varying'");
+ }
+ }
+
+ /* Layout qualifiers for gl_FragDepth, which are enabled by extension
+ * AMD_conservative_depth.
+ */
+ int depth_layout_count = qual->flags.q.depth_any
+ + qual->flags.q.depth_greater
+ + qual->flags.q.depth_less
+ + qual->flags.q.depth_unchanged;
+ if (depth_layout_count > 0
+ && !state->AMD_conservative_depth_enable
+ && !state->ARB_conservative_depth_enable) {
+ _mesa_glsl_error(loc, state,
+ "extension GL_AMD_conservative_depth or "
+ "GL_ARB_conservative_depth must be enabled "
+ "to use depth layout qualifiers");
+ } else if (depth_layout_count > 0
+ && strcmp(var->name, "gl_FragDepth") != 0) {
+ _mesa_glsl_error(loc, state,
+ "depth layout qualifiers can be applied only to "
+ "gl_FragDepth");
+ } else if (depth_layout_count > 1
+ && strcmp(var->name, "gl_FragDepth") == 0) {
+ _mesa_glsl_error(loc, state,
+ "at most one depth layout qualifier can be applied to "
+ "gl_FragDepth");
+ }
+ if (qual->flags.q.depth_any)
+ var->data.depth_layout = ir_depth_layout_any;
+ else if (qual->flags.q.depth_greater)
+ var->data.depth_layout = ir_depth_layout_greater;
+ else if (qual->flags.q.depth_less)
+ var->data.depth_layout = ir_depth_layout_less;
+ else if (qual->flags.q.depth_unchanged)
+ var->data.depth_layout = ir_depth_layout_unchanged;
+ else
+ var->data.depth_layout = ir_depth_layout_none;
+
+ if (qual->flags.q.std140 ||
+ qual->flags.q.std430 ||
+ qual->flags.q.packed ||
+ qual->flags.q.shared) {
+ _mesa_glsl_error(loc, state,
+ "uniform and shader storage block layout qualifiers "
+ "std140, std430, packed, and shared can only be "
+ "applied to uniform or shader storage blocks, not "
+ "members");
+ }
+
+ if (qual->flags.q.row_major || qual->flags.q.column_major) {
+ validate_matrix_layout_for_type(state, loc, var->type, var);
+ }
+
+ /* From section 4.4.1.3 of the GLSL 4.50 specification (Fragment Shader
+ * Inputs):
+ *
+ * "Fragment shaders also allow the following layout qualifier on in only
+ * (not with variable declarations)
+ * layout-qualifier-id
+ * early_fragment_tests
+ * [...]"
+ */
+ if (qual->flags.q.early_fragment_tests) {
+ _mesa_glsl_error(loc, state, "early_fragment_tests layout qualifier only "
+ "valid in fragment shader input layout declaration.");
+ }
+}
+
+static void
apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
ir_variable *var,
struct _mesa_glsl_parse_state *state,
@@ -2992,11 +3275,6 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
select_gles_precision(qual->precision, var->type, state, loc);
}
- if (state->stage == MESA_SHADER_GEOMETRY &&
- qual->flags.q.out && qual->flags.q.stream) {
- var->data.stream = qual->stream;
- }
-
if (qual->flags.q.patch)
var->data.patch = 1;
@@ -3136,102 +3414,6 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
interpret_interpolation_qualifier(qual, (ir_variable_mode) var->data.mode,
state, loc);
- var->data.pixel_center_integer = qual->flags.q.pixel_center_integer;
- var->data.origin_upper_left = qual->flags.q.origin_upper_left;
- if ((qual->flags.q.origin_upper_left || qual->flags.q.pixel_center_integer)
- && (strcmp(var->name, "gl_FragCoord") != 0)) {
- const char *const qual_string = (qual->flags.q.origin_upper_left)
- ? "origin_upper_left" : "pixel_center_integer";
-
- _mesa_glsl_error(loc, state,
- "layout qualifier `%s' can only be applied to "
- "fragment shader input `gl_FragCoord'",
- qual_string);
- }
-
- if (var->name != NULL && strcmp(var->name, "gl_FragCoord") == 0) {
-
- /* Section 4.3.8.1, page 39 of GLSL 1.50 spec says:
- *
- * "Within any shader, the first redeclarations of gl_FragCoord
- * must appear before any use of gl_FragCoord."
- *
- * Generate a compiler error if above condition is not met by the
- * fragment shader.
- */
- ir_variable *earlier = state->symbols->get_variable("gl_FragCoord");
- if (earlier != NULL &&
- earlier->data.used &&
- !state->fs_redeclares_gl_fragcoord) {
- _mesa_glsl_error(loc, state,
- "gl_FragCoord used before its first redeclaration "
- "in fragment shader");
- }
-
- /* Make sure all gl_FragCoord redeclarations specify the same layout
- * qualifiers.
- */
- if (is_conflicting_fragcoord_redeclaration(state, qual)) {
- const char *const qual_string =
- get_layout_qualifier_string(qual->flags.q.origin_upper_left,
- qual->flags.q.pixel_center_integer);
-
- const char *const state_string =
- get_layout_qualifier_string(state->fs_origin_upper_left,
- state->fs_pixel_center_integer);
-
- _mesa_glsl_error(loc, state,
- "gl_FragCoord redeclared with different layout "
- "qualifiers (%s) and (%s) ",
- state_string,
- qual_string);
- }
- state->fs_origin_upper_left = qual->flags.q.origin_upper_left;
- state->fs_pixel_center_integer = qual->flags.q.pixel_center_integer;
- state->fs_redeclares_gl_fragcoord_with_no_layout_qualifiers =
- !qual->flags.q.origin_upper_left && !qual->flags.q.pixel_center_integer;
- state->fs_redeclares_gl_fragcoord =
- state->fs_origin_upper_left ||
- state->fs_pixel_center_integer ||
- state->fs_redeclares_gl_fragcoord_with_no_layout_qualifiers;
- }
-
- if (qual->flags.q.explicit_location) {
- validate_explicit_location(qual, var, state, loc);
- } else if (qual->flags.q.explicit_index) {
- _mesa_glsl_error(loc, state, "explicit index requires explicit location");
- }
-
- if (qual->flags.q.explicit_binding &&
- validate_binding_qualifier(state, loc, var->type, qual)) {
- var->data.explicit_binding = true;
- var->data.binding = qual->binding;
- }
-
- if (var->type->contains_atomic()) {
- if (var->data.mode == ir_var_uniform) {
- if (var->data.explicit_binding) {
- unsigned *offset =
- &state->atomic_counter_offsets[var->data.binding];
-
- if (*offset % ATOMIC_COUNTER_SIZE)
- _mesa_glsl_error(loc, state,
- "misaligned atomic counter offset");
-
- var->data.atomic.offset = *offset;
- *offset += var->type->atomic_size();
-
- } else {
- _mesa_glsl_error(loc, state,
- "atomic counters require explicit binding point");
- }
- } else if (var->data.mode != ir_var_function_in) {
- _mesa_glsl_error(loc, state, "atomic counters may only be declared as "
- "function parameters or uniform-qualified "
- "global variables");
- }
- }
-
/* Does the declaration use the deprecated 'attribute' or 'varying'
* keywords?
*/
@@ -3267,114 +3449,13 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
"`out' or `varying' variables between shader stages");
}
-
- /* Is the 'layout' keyword used with parameters that allow relaxed checking.
- * Many implementations of GL_ARB_fragment_coord_conventions_enable and some
- * implementations (only Mesa?) GL_ARB_explicit_attrib_location_enable
- * allowed the layout qualifier to be used with 'varying' and 'attribute'.
- * These extensions and all following extensions that add the 'layout'
- * keyword have been modified to require the use of 'in' or 'out'.
- *
- * The following extension do not allow the deprecated keywords:
- *
- * GL_AMD_conservative_depth
- * GL_ARB_conservative_depth
- * GL_ARB_gpu_shader5
- * GL_ARB_separate_shader_objects
- * GL_ARB_tessellation_shader
- * GL_ARB_transform_feedback3
- * GL_ARB_uniform_buffer_object
- *
- * It is unknown whether GL_EXT_shader_image_load_store or GL_NV_gpu_shader5
- * allow layout with the deprecated keywords.
- */
- const bool relaxed_layout_qualifier_checking =
- state->ARB_fragment_coord_conventions_enable;
-
- if (qual->has_layout() && uses_deprecated_qualifier) {
- if (relaxed_layout_qualifier_checking) {
- _mesa_glsl_warning(loc, state,
- "`layout' qualifier may not be used with "
- "`attribute' or `varying'");
- } else {
- _mesa_glsl_error(loc, state,
- "`layout' qualifier may not be used with "
- "`attribute' or `varying'");
- }
- }
-
- /* Layout qualifiers for gl_FragDepth, which are enabled by extension
- * AMD_conservative_depth.
- */
- int depth_layout_count = qual->flags.q.depth_any
- + qual->flags.q.depth_greater
- + qual->flags.q.depth_less
- + qual->flags.q.depth_unchanged;
- if (depth_layout_count > 0
- && !state->AMD_conservative_depth_enable
- && !state->ARB_conservative_depth_enable) {
- _mesa_glsl_error(loc, state,
- "extension GL_AMD_conservative_depth or "
- "GL_ARB_conservative_depth must be enabled "
- "to use depth layout qualifiers");
- } else if (depth_layout_count > 0
- && strcmp(var->name, "gl_FragDepth") != 0) {
- _mesa_glsl_error(loc, state,
- "depth layout qualifiers can be applied only to "
- "gl_FragDepth");
- } else if (depth_layout_count > 1
- && strcmp(var->name, "gl_FragDepth") == 0) {
- _mesa_glsl_error(loc, state,
- "at most one depth layout qualifier can be applied to "
- "gl_FragDepth");
- }
- if (qual->flags.q.depth_any)
- var->data.depth_layout = ir_depth_layout_any;
- else if (qual->flags.q.depth_greater)
- var->data.depth_layout = ir_depth_layout_greater;
- else if (qual->flags.q.depth_less)
- var->data.depth_layout = ir_depth_layout_less;
- else if (qual->flags.q.depth_unchanged)
- var->data.depth_layout = ir_depth_layout_unchanged;
- else
- var->data.depth_layout = ir_depth_layout_none;
-
- if (qual->flags.q.std140 ||
- qual->flags.q.std430 ||
- qual->flags.q.packed ||
- qual->flags.q.shared) {
- _mesa_glsl_error(loc, state,
- "uniform and shader storage block layout qualifiers "
- "std140, std430, packed, and shared can only be "
- "applied to uniform or shader storage blocks, not "
- "members");
- }
-
if (qual->flags.q.shared_storage && state->stage != MESA_SHADER_COMPUTE) {
_mesa_glsl_error(loc, state,
"the shared storage qualifiers can only be used with "
"compute shaders");
}
- if (qual->flags.q.row_major || qual->flags.q.column_major) {
- validate_matrix_layout_for_type(state, loc, var->type, var);
- }
-
apply_image_qualifier_to_variable(qual, var, state, loc);
-
- /* From section 4.4.1.3 of the GLSL 4.50 specification (Fragment Shader
- * Inputs):
- *
- * "Fragment shaders also allow the following layout qualifier on in only
- * (not with variable declarations)
- * layout-qualifier-id
- * early_fragment_tests
- * [...]"
- */
- if (qual->flags.q.early_fragment_tests) {
- _mesa_glsl_error(loc, state, "early_fragment_tests layout qualifier only "
- "valid in fragment shader input layout declaration.");
- }
}
/**
@@ -3798,7 +3879,17 @@ handle_tess_ctrl_shader_output_decl(struct _mesa_glsl_parse_state *state,
unsigned num_vertices = 0;
if (state->tcs_output_vertices_specified) {
- num_vertices = state->out_qualifier->vertices;
+ if (!state->out_qualifier->vertices->
+ process_qualifier_constant(state, "vertices",
+ &num_vertices, false)) {
+ return;
+ }
+
+ if (num_vertices > state->Const.MaxPatchVertices) {
+ _mesa_glsl_error(&loc, state, "vertices (%d) exceeds "
+ "GL_MAX_PATCH_VERTICES", num_vertices);
+ return;
+ }
}
if (!var->type->is_array() && !var->data.patch) {
@@ -4032,9 +4123,18 @@ ast_declarator_list::hir(exec_list *instructions,
*/
if (decl_type && decl_type->contains_atomic()) {
if (type->qualifier.flags.q.explicit_binding &&
- type->qualifier.flags.q.explicit_offset)
- state->atomic_counter_offsets[type->qualifier.binding] =
- type->qualifier.offset;
+ type->qualifier.flags.q.explicit_offset) {
+ unsigned qual_binding;
+ unsigned qual_offset;
+ if (process_qualifier_constant(state, &loc, "binding",
+ type->qualifier.binding,
+ &qual_binding)
+ && process_qualifier_constant(state, &loc, "offset",
+ type->qualifier.offset,
+ &qual_offset)) {
+ state->atomic_counter_offsets[qual_binding] = qual_offset;
+ }
+ }
}
if (this->declarations.is_empty()) {
@@ -4188,6 +4288,8 @@ ast_declarator_list::hir(exec_list *instructions,
apply_type_qualifier_to_variable(& this->type->qualifier, var, state,
& loc, false);
+ apply_layout_qualifier_to_variable(&this->type->qualifier, var, state,
+ &loc);
if (this->type->qualifier.flags.q.invariant) {
if (!is_varying_var(var, state->stage)) {
@@ -4983,7 +5085,7 @@ ast_function::hir(exec_list *instructions,
/* From page 56 (page 62 of the PDF) of the GLSL 1.30 spec:
* "No qualifier is allowed on the return type of a function."
*/
- if (this->return_type->has_qualifiers()) {
+ if (this->return_type->has_qualifiers(state)) {
YYLTYPE loc = this->get_location();
_mesa_glsl_error(& loc, state,
"function `%s' return type has qualifiers", name);
@@ -5115,6 +5217,27 @@ ast_function::hir(exec_list *instructions,
if (this->return_type->qualifier.flags.q.subroutine_def) {
int idx;
+ if (this->return_type->qualifier.flags.q.explicit_index) {
+ unsigned qual_index;
+ if (process_qualifier_constant(state, &loc, "index",
+ this->return_type->qualifier.index,
+ &qual_index)) {
+ if (!state->has_explicit_uniform_location()) {
+ _mesa_glsl_error(&loc, state, "subroutine index requires "
+ "GL_ARB_explicit_uniform_location or "
+ "GLSL 4.30");
+ } else if (qual_index >= MAX_SUBROUTINES) {
+ _mesa_glsl_error(&loc, state,
+ "invalid subroutine index (%d) index must "
+ "be a number between 0 and "
+ "GL_MAX_SUBROUTINES - 1 (%d)", qual_index,
+ MAX_SUBROUTINES - 1);
+ } else {
+ f->subroutine_index = qual_index;
+ }
+ }
+ }
+
f->num_subroutine_types = this->return_type->qualifier.subroutine_list->declarations.length();
f->subroutine_types = ralloc_array(state, const struct glsl_type *,
f->num_subroutine_types);
@@ -6046,27 +6169,19 @@ ast_type_specifier::hir(exec_list *instructions,
* stored in \c *fields_ret.
*/
unsigned
-ast_process_structure_or_interface_block(exec_list *instructions,
- struct _mesa_glsl_parse_state *state,
- exec_list *declarations,
- YYLTYPE &loc,
- glsl_struct_field **fields_ret,
- bool is_interface,
- enum glsl_matrix_layout matrix_layout,
- bool allow_reserved_names,
- ir_variable_mode var_mode,
- ast_type_qualifier *layout)
+ast_process_struct_or_iface_block_members(exec_list *instructions,
+ struct _mesa_glsl_parse_state *state,
+ exec_list *declarations,
+ glsl_struct_field **fields_ret,
+ bool is_interface,
+ enum glsl_matrix_layout matrix_layout,
+ bool allow_reserved_names,
+ ir_variable_mode var_mode,
+ ast_type_qualifier *layout,
+ unsigned block_stream)
{
unsigned decl_count = 0;
- /* For blocks that accept memory qualifiers (i.e. shader storage), verify
- * that we don't have incompatible qualifiers
- */
- if (layout && layout->flags.q.read_only && layout->flags.q.write_only) {
- _mesa_glsl_error(&loc, state,
- "Interface block sets both readonly and writeonly");
- }
-
/* Make an initial pass over the list of fields to determine how
* many there are. Each element in this list is an ast_declarator_list.
* This means that we actually need to count the number of elements in the
@@ -6087,6 +6202,7 @@ ast_process_structure_or_interface_block(exec_list *instructions,
unsigned i = 0;
foreach_list_typed (ast_declarator_list, decl_list, link, declarations) {
const char *type_name;
+ YYLTYPE loc = decl_list->get_location();
decl_list->type->specifier->hir(instructions, state);
@@ -6101,74 +6217,120 @@ ast_process_structure_or_interface_block(exec_list *instructions,
const glsl_type *decl_type =
decl_list->type->glsl_type(& type_name, state);
- foreach_list_typed (ast_declaration, decl, link,
- &decl_list->declarations) {
- if (!allow_reserved_names)
- validate_identifier(decl->identifier, loc, state);
+ const struct ast_type_qualifier *const qual =
+ &decl_list->type->qualifier;
- /* From section 4.3.9 of the GLSL 4.40 spec:
- *
- * "[In interface blocks] opaque types are not allowed."
+ /* From section 4.3.9 of the GLSL 4.40 spec:
+ *
+ * "[In interface blocks] opaque types are not allowed."
+ *
+ * It should be impossible for decl_type to be NULL here. Cases that
+ * might naturally lead to decl_type being NULL, especially for the
+ * is_interface case, will have resulted in compilation having
+ * already halted due to a syntax error.
+ */
+ assert(decl_type);
+
+ if (is_interface && decl_type->contains_opaque()) {
+ _mesa_glsl_error(&loc, state,
+ "uniform/buffer in non-default interface block contains "
+ "opaque variable");
+ }
+
+ if (decl_type->contains_atomic()) {
+ /* From section 4.1.7.3 of the GLSL 4.40 spec:
*
- * It should be impossible for decl_type to be NULL here. Cases that
- * might naturally lead to decl_type being NULL, especially for the
- * is_interface case, will have resulted in compilation having
- * already halted due to a syntax error.
+ * "Members of structures cannot be declared as atomic counter
+ * types."
*/
- assert(decl_type);
+ _mesa_glsl_error(&loc, state, "atomic counter in structure, "
+ "shader storage block or uniform block");
+ }
- if (is_interface && decl_type->contains_opaque()) {
- YYLTYPE loc = decl_list->get_location();
- _mesa_glsl_error(&loc, state,
- "uniform/buffer in non-default interface block contains "
- "opaque variable");
- }
+ if (decl_type->contains_image()) {
+ /* FINISHME: Same problem as with atomic counters.
+ * FINISHME: Request clarification from Khronos and add
+ * FINISHME: spec quotation here.
+ */
+ _mesa_glsl_error(&loc, state,
+ "image in structure, shader storage block or "
+ "uniform block");
+ }
- if (decl_type->contains_atomic()) {
- /* From section 4.1.7.3 of the GLSL 4.40 spec:
- *
- * "Members of structures cannot be declared as atomic counter
- * types."
- */
- YYLTYPE loc = decl_list->get_location();
- _mesa_glsl_error(&loc, state, "atomic counter in structure, "
- "shader storage block or uniform block");
- }
+ if (qual->flags.q.explicit_binding) {
+ _mesa_glsl_error(&loc, state,
+ "binding layout qualifier cannot be applied "
+ "to struct or interface block members");
+ }
- if (decl_type->contains_image()) {
- /* FINISHME: Same problem as with atomic counters.
- * FINISHME: Request clarification from Khronos and add
- * FINISHME: spec quotation here.
- */
- YYLTYPE loc = decl_list->get_location();
- _mesa_glsl_error(&loc, state,
- "image in structure, shader storage block or "
- "uniform block");
+ if (qual->flags.q.std140 ||
+ qual->flags.q.std430 ||
+ qual->flags.q.packed ||
+ qual->flags.q.shared) {
+ _mesa_glsl_error(&loc, state,
+ "uniform/shader storage block layout qualifiers "
+ "std140, std430, packed, and shared can only be "
+ "applied to uniform/shader storage blocks, not "
+ "members");
+ }
+
+ if (qual->flags.q.constant) {
+ _mesa_glsl_error(&loc, state,
+ "const storage qualifier cannot be applied "
+ "to struct or interface block members");
+ }
+
+ /* From Section 4.4.2.3 (Geometry Outputs) of the GLSL 4.50 spec:
+ *
+ * "A block member may be declared with a stream identifier, but
+ * the specified stream must match the stream associated with the
+ * containing block."
+ */
+ if (qual->flags.q.explicit_stream) {
+ unsigned qual_stream;
+ if (process_qualifier_constant(state, &loc, "stream",
+ qual->stream, &qual_stream) &&
+ qual_stream != block_stream) {
+ _mesa_glsl_error(&loc, state, "stream layout qualifier on "
+ "interface block member does not match "
+ "the interface block (%d vs %d)", qual->stream,
+ block_stream);
}
+ }
- const struct ast_type_qualifier *const qual =
- & decl_list->type->qualifier;
+ if (qual->flags.q.uniform && qual->has_interpolation()) {
+ _mesa_glsl_error(&loc, state,
+ "interpolation qualifiers cannot be used "
+ "with uniform interface blocks");
+ }
- if (qual->flags.q.explicit_binding)
- validate_binding_qualifier(state, &loc, decl_type, qual);
+ if ((qual->flags.q.uniform || !is_interface) &&
+ qual->has_auxiliary_storage()) {
+ _mesa_glsl_error(&loc, state,
+ "auxiliary storage qualifiers cannot be used "
+ "in uniform blocks or structures.");
+ }
- if (qual->flags.q.std140 ||
- qual->flags.q.std430 ||
- qual->flags.q.packed ||
- qual->flags.q.shared) {
+ if (qual->flags.q.row_major || qual->flags.q.column_major) {
+ if (!qual->flags.q.uniform && !qual->flags.q.buffer) {
_mesa_glsl_error(&loc, state,
- "uniform/shader storage block layout qualifiers "
- "std140, std430, packed, and shared can only be "
- "applied to uniform/shader storage blocks, not "
- "members");
- }
+ "row_major and column_major can only be "
+ "applied to interface blocks");
+ } else
+ validate_matrix_layout_for_type(state, &loc, decl_type, NULL);
+ }
- if (qual->flags.q.constant) {
- YYLTYPE loc = decl_list->get_location();
- _mesa_glsl_error(&loc, state,
- "const storage qualifier cannot be applied "
- "to struct or interface block members");
- }
+ if (qual->flags.q.read_only && qual->flags.q.write_only) {
+ _mesa_glsl_error(&loc, state, "buffer variable can't be both "
+ "readonly and writeonly.");
+ }
+
+ foreach_list_typed (ast_declaration, decl, link,
+ &decl_list->declarations) {
+ YYLTYPE loc = decl->get_location();
+
+ if (!allow_reserved_names)
+ validate_identifier(decl->identifier, loc, state);
const struct glsl_type *field_type =
process_array_type(&loc, decl_type, decl->array_specifier, state);
@@ -6183,42 +6345,6 @@ ast_process_structure_or_interface_block(exec_list *instructions,
fields[i].patch = qual->flags.q.patch ? 1 : 0;
fields[i].precision = qual->precision;
- /* From Section 4.4.2.3 (Geometry Outputs) of the GLSL 4.50 spec:
- *
- * "A block member may be declared with a stream identifier, but
- * the specified stream must match the stream associated with the
- * containing block."
- */
- if (qual->flags.q.explicit_stream &&
- qual->stream != layout->stream) {
- _mesa_glsl_error(&loc, state, "stream layout qualifier on "
- "interface block member `%s' does not match "
- "the interface block (%d vs %d)",
- fields[i].name, qual->stream, layout->stream);
- }
-
- if (qual->flags.q.row_major || qual->flags.q.column_major) {
- if (!qual->flags.q.uniform && !qual->flags.q.buffer) {
- _mesa_glsl_error(&loc, state,
- "row_major and column_major can only be "
- "applied to interface blocks");
- } else
- validate_matrix_layout_for_type(state, &loc, field_type, NULL);
- }
-
- if (qual->flags.q.uniform && qual->has_interpolation()) {
- _mesa_glsl_error(&loc, state,
- "interpolation qualifiers cannot be used "
- "with uniform interface blocks");
- }
-
- if ((qual->flags.q.uniform || !is_interface) &&
- qual->has_auxiliary_storage()) {
- _mesa_glsl_error(&loc, state,
- "auxiliary storage qualifiers cannot be used "
- "in uniform blocks or structures.");
- }
-
+         /* Propagate row- / column-major information down the fields of the
* structure or interface block. Structures need this data because
* the structure may contain a structure that contains ... a matrix
@@ -6248,29 +6374,20 @@ ast_process_structure_or_interface_block(exec_list *instructions,
* be defined inside shader storage buffer objects
*/
if (layout && var_mode == ir_var_shader_storage) {
- if (qual->flags.q.read_only && qual->flags.q.write_only) {
- _mesa_glsl_error(&loc, state,
- "buffer variable `%s' can't be "
- "readonly and writeonly.", fields[i].name);
- }
-
/* For readonly and writeonly qualifiers the field definition,
* if set, overwrites the layout qualifier.
*/
- bool read_only = layout->flags.q.read_only;
- bool write_only = layout->flags.q.write_only;
-
if (qual->flags.q.read_only) {
- read_only = true;
- write_only = false;
+ fields[i].image_read_only = true;
+ fields[i].image_write_only = false;
} else if (qual->flags.q.write_only) {
- read_only = false;
- write_only = true;
+ fields[i].image_read_only = false;
+ fields[i].image_write_only = true;
+ } else {
+ fields[i].image_read_only = layout->flags.q.read_only;
+ fields[i].image_write_only = layout->flags.q.write_only;
}
- fields[i].image_read_only = read_only;
- fields[i].image_write_only = write_only;
-
/* For other qualifiers, we set the flag if either the layout
* qualifier or the field qualifier are set
*/
@@ -6328,16 +6445,16 @@ ast_struct_specifier::hir(exec_list *instructions,
glsl_struct_field *fields;
unsigned decl_count =
- ast_process_structure_or_interface_block(instructions,
- state,
- &this->declarations,
- loc,
- &fields,
- false,
- GLSL_MATRIX_LAYOUT_INHERITED,
- false /* allow_reserved_names */,
- ir_var_auto,
- NULL);
+ ast_process_struct_or_iface_block_members(instructions,
+ state,
+ &this->declarations,
+ &fields,
+ false,
+ GLSL_MATRIX_LAYOUT_INHERITED,
+ false /* allow_reserved_names */,
+ ir_var_auto,
+ NULL,
+ 0 /* for interface only */);
validate_identifier(this->name, loc, state);
@@ -6483,17 +6600,36 @@ ast_interface_block::hir(exec_list *instructions,
*/
state->struct_specifier_depth++;
+ /* For blocks that accept memory qualifiers (i.e. shader storage), verify
+ * that we don't have incompatible qualifiers
+ */
+ if (this->layout.flags.q.read_only && this->layout.flags.q.write_only) {
+ _mesa_glsl_error(&loc, state,
+ "Interface block sets both readonly and writeonly");
+ }
+
+ unsigned qual_stream;
+ if (!process_qualifier_constant(state, &loc, "stream", this->layout.stream,
+ &qual_stream) ||
+ !validate_stream_qualifier(&loc, state, qual_stream)) {
+ /* If the stream qualifier is invalid it doesn't make sense to continue
+ * on and try to compare stream layouts on member variables against it
+ * so just return early.
+ */
+ return NULL;
+ }
+
unsigned int num_variables =
- ast_process_structure_or_interface_block(&declared_variables,
- state,
- &this->declarations,
- loc,
- &fields,
- true,
- matrix_layout,
- redeclaring_per_vertex,
- var_mode,
- &this->layout);
+ ast_process_struct_or_iface_block_members(&declared_variables,
+ state,
+ &this->declarations,
+ &fields,
+ true,
+ matrix_layout,
+ redeclaring_per_vertex,
+ var_mode,
+ &this->layout,
+ qual_stream);
state->struct_specifier_depth--;
@@ -6604,6 +6740,8 @@ ast_interface_block::hir(exec_list *instructions,
earlier_per_vertex->fields.structure[j].sample;
fields[i].patch =
earlier_per_vertex->fields.structure[j].patch;
+ fields[i].precision =
+ earlier_per_vertex->fields.structure[j].precision;
}
}
@@ -6633,8 +6771,6 @@ ast_interface_block::hir(exec_list *instructions,
num_variables,
packing,
this->block_name);
- if (this->layout.flags.q.explicit_binding)
- validate_binding_qualifier(state, &loc, block_type, &this->layout);
if (!state->symbols->add_interface(block_type->name, block_type, var_mode)) {
YYLTYPE loc = this->get_location();
@@ -6765,10 +6901,6 @@ ast_interface_block::hir(exec_list *instructions,
"not allowed");
}
- if (this->layout.flags.q.explicit_binding)
- validate_binding_qualifier(state, &loc, block_array_type,
- &this->layout);
-
var = new(state) ir_variable(block_array_type,
this->instance_name,
var_mode);
@@ -6830,14 +6962,12 @@ ast_interface_block::hir(exec_list *instructions,
earlier->reinit_interface_type(block_type);
delete var;
} else {
- /* Propagate the "binding" keyword into this UBO's fields;
- * the UBO declaration itself doesn't get an ir_variable unless it
- * has an instance name. This is ugly.
- */
- var->data.explicit_binding = this->layout.flags.q.explicit_binding;
- var->data.binding = this->layout.binding;
+ if (this->layout.flags.q.explicit_binding) {
+ apply_explicit_binding(state, &loc, var, var->type,
+ &this->layout);
+ }
- var->data.stream = this->layout.stream;
+ var->data.stream = qual_stream;
state->symbols->add_variable(var);
instructions->push_tail(var);
@@ -6857,7 +6987,7 @@ ast_interface_block::hir(exec_list *instructions,
var->data.centroid = fields[i].centroid;
var->data.sample = fields[i].sample;
var->data.patch = fields[i].patch;
- var->data.stream = this->layout.stream;
+ var->data.stream = qual_stream;
var->init_interface_type(block_type);
if (var_mode == ir_var_shader_in || var_mode == ir_var_uniform)
@@ -6914,8 +7044,10 @@ ast_interface_block::hir(exec_list *instructions,
* The UBO declaration itself doesn't get an ir_variable unless it
* has an instance name. This is ugly.
*/
- var->data.explicit_binding = this->layout.flags.q.explicit_binding;
- var->data.binding = this->layout.binding;
+ if (this->layout.flags.q.explicit_binding) {
+ apply_explicit_binding(state, &loc, var,
+ var->get_interface_type(), &this->layout);
+ }
if (var->type->is_unsized_array()) {
if (var->is_in_shader_storage_block()) {
@@ -6997,22 +7129,18 @@ ast_tcs_output_layout::hir(exec_list *instructions,
{
YYLTYPE loc = this->get_location();
- /* If any tessellation control output layout declaration preceded this
- * one, make sure it was consistent with this one.
- */
- if (state->tcs_output_vertices_specified &&
- state->out_qualifier->vertices != this->vertices) {
- _mesa_glsl_error(&loc, state,
- "tessellation control shader output layout does not "
- "match previous declaration");
- return NULL;
+ unsigned num_vertices;
+ if (!state->out_qualifier->vertices->
+ process_qualifier_constant(state, "vertices", &num_vertices,
+ false)) {
+ /* return here to stop cascading incorrect error messages */
+ return NULL;
}
/* If any shader outputs occurred before this declaration and specified an
* array size, make sure the size they specified is consistent with the
* primitive type.
*/
- unsigned num_vertices = this->vertices;
if (state->tcs_output_size != 0 && state->tcs_output_size != num_vertices) {
_mesa_glsl_error(&loc, state,
"this tessellation control shader output layout "
@@ -7120,20 +7248,6 @@ ast_cs_input_layout::hir(exec_list *instructions,
{
YYLTYPE loc = this->get_location();
- /* If any compute input layout declaration preceded this one, make sure it
- * was consistent with this one.
- */
- if (state->cs_input_local_size_specified) {
- for (int i = 0; i < 3; i++) {
- if (state->cs_input_local_size[i] != this->local_size[i]) {
- _mesa_glsl_error(&loc, state,
- "compute shader input layout does not match"
- " previous declaration");
- return NULL;
- }
- }
- }
-
/* From the ARB_compute_shader specification:
*
* If the local size of the shader in any dimension is greater
@@ -7146,15 +7260,30 @@ ast_cs_input_layout::hir(exec_list *instructions,
* report it at compile time as well.
*/
GLuint64 total_invocations = 1;
+ unsigned qual_local_size[3];
for (int i = 0; i < 3; i++) {
- if (this->local_size[i] > state->ctx->Const.MaxComputeWorkGroupSize[i]) {
+
+ char *local_size_str = ralloc_asprintf(NULL, "invalid local_size_%c",
+ 'x' + i);
+ /* Infer a local_size of 1 for unspecified dimensions */
+ if (this->local_size[i] == NULL) {
+ qual_local_size[i] = 1;
+ } else if (!this->local_size[i]->
+ process_qualifier_constant(state, local_size_str,
+ &qual_local_size[i], false)) {
+ ralloc_free(local_size_str);
+ return NULL;
+ }
+ ralloc_free(local_size_str);
+
+ if (qual_local_size[i] > state->ctx->Const.MaxComputeWorkGroupSize[i]) {
_mesa_glsl_error(&loc, state,
"local_size_%c exceeds MAX_COMPUTE_WORK_GROUP_SIZE"
" (%d)", 'x' + i,
state->ctx->Const.MaxComputeWorkGroupSize[i]);
break;
}
- total_invocations *= this->local_size[i];
+ total_invocations *= qual_local_size[i];
if (total_invocations >
state->ctx->Const.MaxComputeWorkGroupInvocations) {
_mesa_glsl_error(&loc, state,
@@ -7165,9 +7294,23 @@ ast_cs_input_layout::hir(exec_list *instructions,
}
}
+ /* If any compute input layout declaration preceded this one, make sure it
+ * was consistent with this one.
+ */
+ if (state->cs_input_local_size_specified) {
+ for (int i = 0; i < 3; i++) {
+ if (state->cs_input_local_size[i] != qual_local_size[i]) {
+ _mesa_glsl_error(&loc, state,
+ "compute shader input layout does not match"
+ " previous declaration");
+ return NULL;
+ }
+ }
+ }
+
state->cs_input_local_size_specified = true;
for (int i = 0; i < 3; i++)
- state->cs_input_local_size[i] = this->local_size[i];
+ state->cs_input_local_size[i] = qual_local_size[i];
/* We may now declare the built-in constant gl_WorkGroupSize (see
* builtin_variable_generator::generate_constants() for why we didn't
@@ -7182,7 +7325,7 @@ ast_cs_input_layout::hir(exec_list *instructions,
ir_constant_data data;
memset(&data, 0, sizeof(data));
for (int i = 0; i < 3; i++)
- data.u[i] = this->local_size[i];
+ data.u[i] = qual_local_size[i];
var->constant_value = new(var) ir_constant(glsl_type::uvec3_type, &data);
var->constant_initializer =
new(var) ir_constant(glsl_type::uvec3_type, &data);
@@ -7198,6 +7341,8 @@ detect_conflicting_assignments(struct _mesa_glsl_parse_state *state,
{
bool gl_FragColor_assigned = false;
bool gl_FragData_assigned = false;
+ bool gl_FragSecondaryColor_assigned = false;
+ bool gl_FragSecondaryData_assigned = false;
bool user_defined_fs_output_assigned = false;
ir_variable *user_defined_fs_output = NULL;
@@ -7215,6 +7360,10 @@ detect_conflicting_assignments(struct _mesa_glsl_parse_state *state,
gl_FragColor_assigned = true;
else if (strcmp(var->name, "gl_FragData") == 0)
gl_FragData_assigned = true;
+ else if (strcmp(var->name, "gl_SecondaryFragColorEXT") == 0)
+ gl_FragSecondaryColor_assigned = true;
+ else if (strcmp(var->name, "gl_SecondaryFragDataEXT") == 0)
+ gl_FragSecondaryData_assigned = true;
else if (!is_gl_identifier(var->name)) {
if (state->stage == MESA_SHADER_FRAGMENT &&
var->data.mode == ir_var_shader_out) {
@@ -7246,11 +7395,29 @@ detect_conflicting_assignments(struct _mesa_glsl_parse_state *state,
_mesa_glsl_error(&loc, state, "fragment shader writes to both "
"`gl_FragColor' and `%s'",
user_defined_fs_output->name);
+ } else if (gl_FragSecondaryColor_assigned && gl_FragSecondaryData_assigned) {
+ _mesa_glsl_error(&loc, state, "fragment shader writes to both "
+ "`gl_FragSecondaryColorEXT' and"
+ " `gl_FragSecondaryDataEXT'");
+ } else if (gl_FragColor_assigned && gl_FragSecondaryData_assigned) {
+ _mesa_glsl_error(&loc, state, "fragment shader writes to both "
+ "`gl_FragColor' and"
+ " `gl_FragSecondaryDataEXT'");
+ } else if (gl_FragData_assigned && gl_FragSecondaryColor_assigned) {
+ _mesa_glsl_error(&loc, state, "fragment shader writes to both "
+ "`gl_FragData' and"
+ " `gl_FragSecondaryColorEXT'");
} else if (gl_FragData_assigned && user_defined_fs_output_assigned) {
_mesa_glsl_error(&loc, state, "fragment shader writes to both "
"`gl_FragData' and `%s'",
user_defined_fs_output->name);
}
+
+ if ((gl_FragSecondaryColor_assigned || gl_FragSecondaryData_assigned) &&
+ !state->EXT_blend_func_extended_enable) {
+ _mesa_glsl_error(&loc, state,
+ "Dual source blending requires EXT_blend_func_extended");
+ }
}
diff --git a/src/glsl/ast_type.cpp b/src/glsl/ast_type.cpp
index 79134c19893..03ed4dcfa2a 100644
--- a/src/glsl/ast_type.cpp
+++ b/src/glsl/ast_type.cpp
@@ -38,13 +38,16 @@ ast_type_specifier::print(void) const
}
bool
-ast_fully_specified_type::has_qualifiers() const
+ast_fully_specified_type::has_qualifiers(_mesa_glsl_parse_state *state) const
{
/* 'subroutine' isnt a real qualifier. */
ast_type_qualifier subroutine_only;
subroutine_only.flags.i = 0;
subroutine_only.flags.q.subroutine = 1;
subroutine_only.flags.q.subroutine_def = 1;
+ if (state->has_explicit_uniform_location()) {
+ subroutine_only.flags.q.explicit_index = 1;
+ }
return (this->qualifier.flags.i & ~subroutine_only.flags.i) != 0;
}
@@ -169,41 +172,32 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
}
if (q.flags.q.max_vertices) {
- if (this->flags.q.max_vertices && this->max_vertices != q.max_vertices) {
+ if (this->max_vertices) {
+ this->max_vertices->merge_qualifier(q.max_vertices);
+ } else {
+ this->max_vertices = q.max_vertices;
+ }
+ }
+
+ if (q.flags.q.subroutine_def) {
+ if (this->flags.q.subroutine_def) {
_mesa_glsl_error(loc, state,
- "geometry shader set conflicting max_vertices "
- "(%d and %d)", this->max_vertices, q.max_vertices);
- return false;
+ "conflicting subroutine qualifiers used");
+ } else {
+ this->subroutine_list = q.subroutine_list;
}
- this->max_vertices = q.max_vertices;
}
if (q.flags.q.invocations) {
- if (this->flags.q.invocations && this->invocations != q.invocations) {
- _mesa_glsl_error(loc, state,
- "geometry shader set conflicting invocations "
- "(%d and %d)", this->invocations, q.invocations);
- return false;
+ if (this->invocations) {
+ this->invocations->merge_qualifier(q.invocations);
+ } else {
+ this->invocations = q.invocations;
}
- this->invocations = q.invocations;
}
if (state->stage == MESA_SHADER_GEOMETRY &&
state->has_explicit_attrib_stream()) {
- if (q.flags.q.stream && q.stream >= state->ctx->Const.MaxVertexStreams) {
- _mesa_glsl_error(loc, state,
- "`stream' value is larger than MAX_VERTEX_STREAMS - 1 "
- "(%d > %d)",
- q.stream, state->ctx->Const.MaxVertexStreams - 1);
- }
- if (this->flags.q.explicit_stream &&
- this->stream >= state->ctx->Const.MaxVertexStreams) {
- _mesa_glsl_error(loc, state,
- "`stream' value is larger than MAX_VERTEX_STREAMS - 1 "
- "(%d > %d)",
- this->stream, state->ctx->Const.MaxVertexStreams - 1);
- }
-
if (!this->flags.q.explicit_stream) {
if (q.flags.q.stream) {
this->flags.q.stream = 1;
@@ -222,14 +216,11 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
}
if (q.flags.q.vertices) {
- if (this->flags.q.vertices && this->vertices != q.vertices) {
- _mesa_glsl_error(loc, state,
- "tessellation control shader set conflicting "
- "vertices (%d and %d)",
- this->vertices, q.vertices);
- return false;
+ if (this->vertices) {
+ this->vertices->merge_qualifier(q.vertices);
+ } else {
+ this->vertices = q.vertices;
}
- this->vertices = q.vertices;
}
if (q.flags.q.vertex_spacing) {
@@ -266,15 +257,11 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
for (int i = 0; i < 3; i++) {
if (q.flags.q.local_size & (1 << i)) {
- if ((this->flags.q.local_size & (1 << i)) &&
- this->local_size[i] != q.local_size[i]) {
- _mesa_glsl_error(loc, state,
- "compute shader set conflicting values for "
- "local_size_%c (%d and %d)", 'x' + i,
- this->local_size[i], q.local_size[i]);
- return false;
+ if (this->local_size[i]) {
+ this->local_size[i]->merge_qualifier(q.local_size[i]);
+ } else {
+ this->local_size[i] = q.local_size[i];
}
- this->local_size[i] = q.local_size[i];
}
}
@@ -313,7 +300,7 @@ ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,
const bool r = this->merge_qualifier(loc, state, q);
if (state->stage == MESA_SHADER_TESS_CTRL) {
- node = new(mem_ctx) ast_tcs_output_layout(*loc, q.vertices);
+ node = new(mem_ctx) ast_tcs_output_layout(*loc);
}
return r;
@@ -417,15 +404,13 @@ ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc,
state->in_qualifier->prim_type = q.prim_type;
}
- if (this->flags.q.invocations &&
- q.flags.q.invocations &&
- this->invocations != q.invocations) {
- _mesa_glsl_error(loc, state,
- "conflicting invocations counts specified");
- return false;
- } else if (q.flags.q.invocations) {
+ if (q.flags.q.invocations) {
this->flags.q.invocations = 1;
- this->invocations = q.invocations;
+ if (this->invocations) {
+ this->invocations->merge_qualifier(q.invocations);
+ } else {
+ this->invocations = q.invocations;
+ }
}
if (q.flags.q.early_fragment_tests) {
@@ -468,15 +453,67 @@ ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc,
if (create_gs_ast) {
node = new(mem_ctx) ast_gs_input_layout(*loc, q.prim_type);
} else if (create_cs_ast) {
- /* Infer a local_size of 1 for every unspecified dimension */
- unsigned local_size[3];
- for (int i = 0; i < 3; i++) {
- if (q.flags.q.local_size & (1 << i))
- local_size[i] = q.local_size[i];
- else
- local_size[i] = 1;
+ node = new(mem_ctx) ast_cs_input_layout(*loc, q.local_size);
+ }
+
+ return true;
+}
+
+bool
+ast_layout_expression::process_qualifier_constant(struct _mesa_glsl_parse_state *state,
+ const char *qual_indentifier,
+ unsigned *value,
+ bool can_be_zero)
+{
+ int min_value = 0;
+ bool first_pass = true;
+ *value = 0;
+
+ if (!can_be_zero)
+ min_value = 1;
+
+ for (exec_node *node = layout_const_expressions.head;
+ !node->is_tail_sentinel(); node = node->next) {
+
+ exec_list dummy_instructions;
+ ast_node *const_expression = exec_node_data(ast_node, node, link);
+
+ ir_rvalue *const ir = const_expression->hir(&dummy_instructions, state);
+
+ ir_constant *const const_int = ir->constant_expression_value();
+ if (const_int == NULL || !const_int->type->is_integer()) {
+ YYLTYPE loc = const_expression->get_location();
+ _mesa_glsl_error(&loc, state, "%s must be an integral constant "
+ "expression", qual_indentifier);
+ return false;
+ }
+
+ if (const_int->value.i[0] < min_value) {
+ YYLTYPE loc = const_expression->get_location();
+ _mesa_glsl_error(&loc, state, "%s layout qualifier is invalid "
+ "(%d < %d)", qual_indentifier,
+ const_int->value.i[0], min_value);
+ return false;
}
- node = new(mem_ctx) ast_cs_input_layout(*loc, local_size);
+
+ if (!first_pass && *value != const_int->value.u[0]) {
+ YYLTYPE loc = const_expression->get_location();
+ _mesa_glsl_error(&loc, state, "%s layout qualifier does not "
+ "match previous declaration (%d vs %d)",
+ qual_indentifier, *value, const_int->value.i[0]);
+ return false;
+ } else {
+ first_pass = false;
+ *value = const_int->value.u[0];
+ }
+
+ /* If the location is const (and we've verified that
+ * it is) then no instructions should have been emitted
+ * when we converted it to HIR. If they were emitted,
+ * then either the location isn't const after all, or
+ * we are emitting unnecessary instructions.
+ */
+ assert(dummy_instructions.is_empty());
}
return true;
diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp
index 13494446b59..881ee2b6b55 100644
--- a/src/glsl/builtin_functions.cpp
+++ b/src/glsl/builtin_functions.cpp
@@ -290,6 +290,20 @@ texture_multisample_array(const _mesa_glsl_parse_state *state)
}
static bool
+texture_samples_identical(const _mesa_glsl_parse_state *state)
+{
+ return texture_multisample(state) &&
+ state->EXT_shader_samples_identical_enable;
+}
+
+static bool
+texture_samples_identical_array(const _mesa_glsl_parse_state *state)
+{
+ return texture_multisample_array(state) &&
+ state->EXT_shader_samples_identical_enable;
+}
+
+static bool
fs_texture_cube_map_array(const _mesa_glsl_parse_state *state)
{
return state->stage == MESA_SHADER_FRAGMENT &&
@@ -724,6 +738,7 @@ private:
BA2(textureQueryLod);
B1(textureQueryLevels);
+ BA2(textureSamplesIdentical);
B1(dFdx);
B1(dFdy);
B1(fwidth);
@@ -2210,6 +2225,16 @@ builtin_builder::create_builtins()
NULL);
+ add_function("textureSamplesIdenticalEXT",
+ _textureSamplesIdentical(texture_samples_identical, glsl_type::sampler2DMS_type, glsl_type::ivec2_type),
+ _textureSamplesIdentical(texture_samples_identical, glsl_type::isampler2DMS_type, glsl_type::ivec2_type),
+ _textureSamplesIdentical(texture_samples_identical, glsl_type::usampler2DMS_type, glsl_type::ivec2_type),
+
+ _textureSamplesIdentical(texture_samples_identical_array, glsl_type::sampler2DMSArray_type, glsl_type::ivec3_type),
+ _textureSamplesIdentical(texture_samples_identical_array, glsl_type::isampler2DMSArray_type, glsl_type::ivec3_type),
+ _textureSamplesIdentical(texture_samples_identical_array, glsl_type::usampler2DMSArray_type, glsl_type::ivec3_type),
+ NULL);
+
add_function("texture1D",
_texture(ir_tex, v110, glsl_type::vec4_type, glsl_type::sampler1D_type, glsl_type::float_type),
_texture(ir_txb, v110_fs_only, glsl_type::vec4_type, glsl_type::sampler1D_type, glsl_type::float_type),
@@ -3573,7 +3598,16 @@ builtin_builder::_isinf(builtin_available_predicate avail, const glsl_type *type
ir_constant_data infinities;
for (int i = 0; i < type->vector_elements; i++) {
- infinities.f[i] = INFINITY;
+ switch (type->base_type) {
+ case GLSL_TYPE_FLOAT:
+ infinities.f[i] = INFINITY;
+ break;
+ case GLSL_TYPE_DOUBLE:
+ infinities.d[i] = INFINITY;
+ break;
+ default:
+ unreachable("unknown type");
+ }
}
body.emit(ret(equal(abs(x), imm(type, infinities))));
@@ -4675,6 +4709,25 @@ builtin_builder::_textureQueryLevels(const glsl_type *sampler_type)
return sig;
}
+ir_function_signature *
+builtin_builder::_textureSamplesIdentical(builtin_available_predicate avail,
+ const glsl_type *sampler_type,
+ const glsl_type *coord_type)
+{
+ ir_variable *s = in_var(sampler_type, "sampler");
+ ir_variable *P = in_var(coord_type, "P");
+ const glsl_type *return_type = glsl_type::bool_type;
+ MAKE_SIG(return_type, avail, 2, s, P);
+
+ ir_texture *tex = new(mem_ctx) ir_texture(ir_samples_identical);
+ tex->coordinate = var_ref(P);
+ tex->set_sampler(var_ref(s), return_type);
+
+ body.emit(ret(tex));
+
+ return sig;
+}
+
UNOP(dFdx, ir_unop_dFdx, fs_oes_derivatives)
UNOP(dFdxCoarse, ir_unop_dFdx_coarse, fs_derivative_control)
UNOP(dFdxFine, ir_unop_dFdx_fine, fs_derivative_control)
@@ -5243,8 +5296,8 @@ builtin_builder::_image_size_prototype(const glsl_type *image_type,
ir_function_signature *
builtin_builder::_image_samples_prototype(const glsl_type *image_type,
- unsigned num_arguments,
- unsigned flags)
+ unsigned /* num_arguments */,
+ unsigned /* flags */)
{
ir_variable *image = in_var(image_type, "image");
ir_function_signature *sig =
diff --git a/src/glsl/builtin_variables.cpp b/src/glsl/builtin_variables.cpp
index b06c1bc5c12..e8eab808a19 100644
--- a/src/glsl/builtin_variables.cpp
+++ b/src/glsl/builtin_variables.cpp
@@ -327,6 +327,7 @@ per_vertex_accumulator::add_field(int slot, const glsl_type *type,
this->fields[this->num_fields].centroid = 0;
this->fields[this->num_fields].sample = 0;
this->fields[this->num_fields].patch = 0;
+ this->fields[this->num_fields].precision = GLSL_PRECISION_NONE;
this->num_fields++;
}
@@ -376,6 +377,11 @@ private:
return add_variable(name, type, ir_var_shader_out, slot);
}
+ ir_variable *add_index_output(int slot, int index, const glsl_type *type, const char *name)
+ {
+ return add_index_variable(name, type, ir_var_shader_out, slot, index);
+ }
+
ir_variable *add_system_value(int slot, const glsl_type *type,
const char *name)
{
@@ -384,6 +390,8 @@ private:
ir_variable *add_variable(const char *name, const glsl_type *type,
enum ir_variable_mode mode, int slot);
+ ir_variable *add_index_variable(const char *name, const glsl_type *type,
+ enum ir_variable_mode mode, int slot, int index);
ir_variable *add_uniform(const glsl_type *type, const char *name);
ir_variable *add_const(const char *name, int value);
ir_variable *add_const_ivec3(const char *name, int x, int y, int z);
@@ -429,6 +437,46 @@ builtin_variable_generator::builtin_variable_generator(
{
}
+ir_variable *
+builtin_variable_generator::add_index_variable(const char *name,
+ const glsl_type *type,
+ enum ir_variable_mode mode, int slot, int index)
+{
+ ir_variable *var = new(symtab) ir_variable(type, name, mode);
+ var->data.how_declared = ir_var_declared_implicitly;
+
+ switch (var->data.mode) {
+ case ir_var_auto:
+ case ir_var_shader_in:
+ case ir_var_uniform:
+ case ir_var_system_value:
+ var->data.read_only = true;
+ break;
+ case ir_var_shader_out:
+ case ir_var_shader_storage:
+ break;
+ default:
+ /* The only variables that are added using this function should be
+ * uniforms, shader storage, shader inputs, and shader outputs, constants
+ * (which use ir_var_auto), and system values.
+ */
+ assert(0);
+ break;
+ }
+
+ var->data.location = slot;
+ var->data.explicit_location = (slot >= 0);
+ var->data.explicit_index = 1;
+ var->data.index = index;
+
+ /* Once the variable is created an initialized, add it to the symbol table
+ * and add the declaration to the IR stream.
+ */
+ instructions->push_tail(var);
+
+ symtab->add_variable(var);
+ return var;
+}
ir_variable *
builtin_variable_generator::add_variable(const char *name,
@@ -580,6 +628,14 @@ builtin_variable_generator::generate_constants()
add_const("gl_MaxVaryingVectors",
state->ctx->Const.MaxVarying);
}
+
+ /* EXT_blend_func_extended brings a built in constant
+ * for determining number of dual source draw buffers
+ */
+ if (state->EXT_blend_func_extended_enable) {
+ add_const("gl_MaxDualSourceDrawBuffersEXT",
+ state->Const.MaxDualSourceDrawBuffers);
+ }
} else {
add_const("gl_MaxVertexUniformComponents",
state->Const.MaxVertexUniformComponents);
@@ -1016,6 +1072,19 @@ builtin_variable_generator::generate_fs_special_vars()
array(vec4_t, state->Const.MaxDrawBuffers), "gl_FragData");
}
+ if (state->es_shader && state->language_version == 100 && state->EXT_blend_func_extended_enable) {
+ /* We make an assumption here that there will only ever be one dual-source draw buffer
+ * In case this assumption is ever proven to be false, make sure to assert here
+ * since we don't handle this case.
+ * In practice, this issue will never arise since no hardware will support it.
+ */
+ assert(state->Const.MaxDualSourceDrawBuffers <= 1);
+ add_index_output(FRAG_RESULT_DATA0, 1, vec4_t, "gl_SecondaryFragColorEXT");
+ add_index_output(FRAG_RESULT_DATA0, 1,
+ array(vec4_t, state->Const.MaxDualSourceDrawBuffers),
+ "gl_SecondaryFragDataEXT");
+ }
+
/* gl_FragDepth has always been in desktop GLSL, but did not appear in GLSL
* ES 1.00.
*/
@@ -1186,6 +1255,7 @@ builtin_variable_generator::generate_varyings()
var->data.centroid = fields[i].centroid;
var->data.sample = fields[i].sample;
var->data.patch = fields[i].patch;
+ var->data.precision = fields[i].precision;
var->init_interface_type(per_vertex_out_type);
}
}
diff --git a/src/glsl/glcpp/glcpp-parse.y b/src/glsl/glcpp/glcpp-parse.y
index 6aa7abec00e..2fd4cf04079 100644
--- a/src/glsl/glcpp/glcpp-parse.y
+++ b/src/glsl/glcpp/glcpp-parse.y
@@ -2384,6 +2384,8 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
add_builtin_define(parser, "GL_OES_standard_derivatives", 1);
if (extensions->ARB_texture_multisample)
add_builtin_define(parser, "GL_OES_texture_storage_multisample_2d_array", 1);
+ if (extensions->ARB_blend_func_extended)
+ add_builtin_define(parser, "GL_EXT_blend_func_extended", 1);
}
} else {
add_builtin_define(parser, "GL_ARB_draw_buffers", 1);
@@ -2510,6 +2512,9 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
if (extensions != NULL) {
if (extensions->EXT_shader_integer_mix)
add_builtin_define(parser, "GL_EXT_shader_integer_mix", 1);
+
+ if (extensions->EXT_shader_samples_identical)
+ add_builtin_define(parser, "GL_EXT_shader_samples_identical", 1);
}
if (version >= 150)
diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index adf6a05acce..5a8f98019d1 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -298,7 +298,6 @@ static bool match_layout_qualifier(const char *s1, const char *s2,
%type <node> conditionopt
%type <node> for_init_statement
%type <for_rest_statement> for_rest_statement
-%type <n> integer_constant
%type <node> layout_defaults
%right THEN ELSE
@@ -1152,11 +1151,6 @@ layout_qualifier_id_list:
}
;
-integer_constant:
- INTCONSTANT { $$ = $1; }
- | UINTCONSTANT { $$ = $1; }
- ;
-
layout_qualifier_id:
any_identifier
{
@@ -1453,9 +1447,18 @@ layout_qualifier_id:
YYERROR;
}
}
- | any_identifier '=' integer_constant
+ | any_identifier '=' constant_expression
{
memset(& $$, 0, sizeof($$));
+ void *ctx = state;
+
+ if ($3->oper != ast_int_constant &&
+ $3->oper != ast_uint_constant &&
+ !state->has_enhanced_layouts()) {
+ _mesa_glsl_error(& @1, state,
+ "compile-time constant expressions require "
+ "GLSL 4.40 or ARB_enhanced_layouts");
+ }
if (match_layout_qualifier("location", $1, state) == 0) {
$$.flags.q.explicit_location = 1;
@@ -1466,24 +1469,17 @@ layout_qualifier_id:
"GL_ARB_explicit_attrib_location layout "
"identifier `%s' used", $1);
}
-
- if ($3 >= 0) {
- $$.location = $3;
- } else {
- _mesa_glsl_error(& @3, state, "invalid location %d specified", $3);
- YYERROR;
- }
+ $$.location = $3;
}
if (match_layout_qualifier("index", $1, state) == 0) {
- $$.flags.q.explicit_index = 1;
-
- if ($3 >= 0) {
- $$.index = $3;
- } else {
- _mesa_glsl_error(& @3, state, "invalid index %d specified", $3);
+ if (state->es_shader && !state->EXT_blend_func_extended_enable) {
+ _mesa_glsl_error(& @3, state, "index layout qualifier requires EXT_blend_func_extended");
YYERROR;
}
+
+ $$.flags.q.explicit_index = 1;
+ $$.index = $3;
}
if ((state->has_420pack() ||
@@ -1502,18 +1498,11 @@ layout_qualifier_id:
if (match_layout_qualifier("max_vertices", $1, state) == 0) {
$$.flags.q.max_vertices = 1;
-
- if ($3 < 0) {
+ $$.max_vertices = new(ctx) ast_layout_expression(@1, $3);
+ if (!state->is_version(150, 0)) {
_mesa_glsl_error(& @3, state,
- "invalid max_vertices %d specified", $3);
- YYERROR;
- } else {
- $$.max_vertices = $3;
- if (!state->is_version(150, 0)) {
- _mesa_glsl_error(& @3, state,
- "#version 150 max_vertices qualifier "
- "specified", $3);
- }
+ "#version 150 max_vertices qualifier "
+ "specified", $3);
}
}
@@ -1521,15 +1510,8 @@ layout_qualifier_id:
if (match_layout_qualifier("stream", $1, state) == 0 &&
state->check_explicit_attrib_stream_allowed(& @3)) {
$$.flags.q.stream = 1;
-
- if ($3 < 0) {
- _mesa_glsl_error(& @3, state,
- "invalid stream %d specified", $3);
- YYERROR;
- } else {
- $$.flags.q.explicit_stream = 1;
- $$.stream = $3;
- }
+ $$.flags.q.explicit_stream = 1;
+ $$.stream = $3;
}
}
@@ -1541,12 +1523,7 @@ layout_qualifier_id:
for (int i = 0; i < 3; i++) {
if (match_layout_qualifier(local_size_qualifiers[i], $1,
state) == 0) {
- if ($3 <= 0) {
- _mesa_glsl_error(& @3, state,
- "invalid %s of %d specified",
- local_size_qualifiers[i], $3);
- YYERROR;
- } else if (!state->has_compute_shader()) {
+ if (!state->has_compute_shader()) {
_mesa_glsl_error(& @3, state,
"%s qualifier requires GLSL 4.30 or "
"GLSL ES 3.10 or ARB_compute_shader",
@@ -1554,7 +1531,7 @@ layout_qualifier_id:
YYERROR;
} else {
$$.flags.q.local_size |= (1 << i);
- $$.local_size[i] = $3;
+ $$.local_size[i] = new(ctx) ast_layout_expression(@1, $3);
}
break;
}
@@ -1562,48 +1539,24 @@ layout_qualifier_id:
if (match_layout_qualifier("invocations", $1, state) == 0) {
$$.flags.q.invocations = 1;
-
- if ($3 <= 0) {
+ $$.invocations = new(ctx) ast_layout_expression(@1, $3);
+ if (!state->is_version(400, 0) &&
+ !state->ARB_gpu_shader5_enable) {
_mesa_glsl_error(& @3, state,
- "invalid invocations %d specified", $3);
- YYERROR;
- } else if ($3 > MAX_GEOMETRY_SHADER_INVOCATIONS) {
- _mesa_glsl_error(& @3, state,
- "invocations (%d) exceeds "
- "GL_MAX_GEOMETRY_SHADER_INVOCATIONS", $3);
- YYERROR;
- } else {
- $$.invocations = $3;
- if (!state->is_version(400, 0) &&
- !state->ARB_gpu_shader5_enable) {
- _mesa_glsl_error(& @3, state,
- "GL_ARB_gpu_shader5 invocations "
- "qualifier specified", $3);
- }
+ "GL_ARB_gpu_shader5 invocations "
+ "qualifier specified", $3);
}
}
/* Layout qualifiers for tessellation control shaders. */
if (match_layout_qualifier("vertices", $1, state) == 0) {
$$.flags.q.vertices = 1;
-
- if ($3 <= 0) {
- _mesa_glsl_error(& @3, state,
- "invalid vertices (%d) specified", $3);
- YYERROR;
- } else if ($3 > (int)state->Const.MaxPatchVertices) {
- _mesa_glsl_error(& @3, state,
- "vertices (%d) exceeds "
- "GL_MAX_PATCH_VERTICES", $3);
- YYERROR;
- } else {
- $$.vertices = $3;
- if (!state->ARB_tessellation_shader_enable &&
- !state->is_version(400, 0)) {
- _mesa_glsl_error(& @1, state,
- "vertices qualifier requires GLSL 4.00 or "
- "ARB_tessellation_shader");
- }
+ $$.vertices = new(ctx) ast_layout_expression(@1, $3);
+ if (!state->ARB_tessellation_shader_enable &&
+ !state->is_version(400, 0)) {
+ _mesa_glsl_error(& @1, state,
+ "vertices qualifier requires GLSL 4.00 or "
+ "ARB_tessellation_shader");
}
}
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index 02584c62a4d..b41b64af2c1 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -104,6 +104,8 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
this->Const.MaxDrawBuffers = ctx->Const.MaxDrawBuffers;
+ this->Const.MaxDualSourceDrawBuffers = ctx->Const.MaxDualSourceDrawBuffers;
+
/* 1.50 constants */
this->Const.MaxVertexOutputComponents = ctx->Const.Program[MESA_SHADER_VERTEX].MaxOutputComponents;
this->Const.MaxGeometryInputComponents = ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxInputComponents;
@@ -646,9 +648,11 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
EXT(AMD_shader_trinary_minmax, true, false, dummy_true),
EXT(AMD_vertex_shader_layer, true, false, AMD_vertex_shader_layer),
EXT(AMD_vertex_shader_viewport_index, true, false, AMD_vertex_shader_viewport_index),
+ EXT(EXT_blend_func_extended, false, true, ARB_blend_func_extended),
EXT(EXT_draw_buffers, false, true, dummy_true),
EXT(EXT_separate_shader_objects, false, true, dummy_true),
EXT(EXT_shader_integer_mix, true, true, EXT_shader_integer_mix),
+ EXT(EXT_shader_samples_identical, true, true, EXT_shader_samples_identical),
EXT(EXT_texture_array, true, false, EXT_texture_array),
};
@@ -1646,8 +1650,20 @@ set_shader_inout_layout(struct gl_shader *shader,
switch (shader->Stage) {
case MESA_SHADER_TESS_CTRL:
shader->TessCtrl.VerticesOut = 0;
- if (state->tcs_output_vertices_specified)
- shader->TessCtrl.VerticesOut = state->out_qualifier->vertices;
+ if (state->tcs_output_vertices_specified) {
+ unsigned vertices;
+ if (state->out_qualifier->vertices->
+ process_qualifier_constant(state, "vertices", &vertices,
+ false)) {
+
+ YYLTYPE loc = state->out_qualifier->vertices->get_location();
+ if (vertices > state->Const.MaxPatchVertices) {
+ _mesa_glsl_error(&loc, state, "vertices (%d) exceeds "
+ "GL_MAX_PATCH_VERTICES", vertices);
+ }
+ shader->TessCtrl.VerticesOut = vertices;
+ }
+ }
break;
case MESA_SHADER_TESS_EVAL:
shader->TessEval.PrimitiveMode = PRIM_UNKNOWN;
@@ -1668,8 +1684,14 @@ set_shader_inout_layout(struct gl_shader *shader,
break;
case MESA_SHADER_GEOMETRY:
shader->Geom.VerticesOut = 0;
- if (state->out_qualifier->flags.q.max_vertices)
- shader->Geom.VerticesOut = state->out_qualifier->max_vertices;
+ if (state->out_qualifier->flags.q.max_vertices) {
+ unsigned qual_max_vertices;
+ if (state->out_qualifier->max_vertices->
+ process_qualifier_constant(state, "max_vertices",
+ &qual_max_vertices, true)) {
+ shader->Geom.VerticesOut = qual_max_vertices;
+ }
+ }
if (state->gs_input_prim_type_specified) {
shader->Geom.InputType = state->in_qualifier->prim_type;
@@ -1684,8 +1706,22 @@ set_shader_inout_layout(struct gl_shader *shader,
}
shader->Geom.Invocations = 0;
- if (state->in_qualifier->flags.q.invocations)
- shader->Geom.Invocations = state->in_qualifier->invocations;
+ if (state->in_qualifier->flags.q.invocations) {
+ unsigned invocations;
+ if (state->in_qualifier->invocations->
+ process_qualifier_constant(state, "invocations",
+ &invocations, false)) {
+
+ YYLTYPE loc = state->in_qualifier->invocations->get_location();
+ if (invocations > MAX_GEOMETRY_SHADER_INVOCATIONS) {
+ _mesa_glsl_error(&loc, state,
+ "invocations (%d) exceeds "
+ "GL_MAX_GEOMETRY_SHADER_INVOCATIONS",
+ invocations);
+ }
+ shader->Geom.Invocations = invocations;
+ }
+ }
break;
case MESA_SHADER_COMPUTE:
@@ -1797,6 +1833,9 @@ _mesa_glsl_compile_shader(struct gl_context *ctx, struct gl_shader *shader,
if (shader->InfoLog)
ralloc_free(shader->InfoLog);
+ if (!state->error)
+ set_shader_inout_layout(shader, state);
+
shader->symbols = new(shader->ir) glsl_symbol_table;
shader->CompileStatus = !state->error;
shader->InfoLog = state->info_log;
@@ -1804,9 +1843,6 @@ _mesa_glsl_compile_shader(struct gl_context *ctx, struct gl_shader *shader,
shader->IsES = state->es_shader;
shader->uses_builtin_functions = state->uses_builtin_functions;
- if (!state->error)
- set_shader_inout_layout(shader, state);
-
/* Retain any live IR, but trash the rest. */
reparent_ir(shader->ir, shader->ir);
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index 1d8c1b8799f..17ff0b5af79 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -380,6 +380,9 @@ struct _mesa_glsl_parse_state {
/* ARB_draw_buffers */
unsigned MaxDrawBuffers;
+ /* ARB_blend_func_extended */
+ unsigned MaxDualSourceDrawBuffers;
+
/* 3.00 ES */
int MinProgramTexelOffset;
int MaxProgramTexelOffset;
@@ -595,12 +598,16 @@ struct _mesa_glsl_parse_state {
bool AMD_vertex_shader_layer_warn;
bool AMD_vertex_shader_viewport_index_enable;
bool AMD_vertex_shader_viewport_index_warn;
+ bool EXT_blend_func_extended_enable;
+ bool EXT_blend_func_extended_warn;
bool EXT_draw_buffers_enable;
bool EXT_draw_buffers_warn;
bool EXT_separate_shader_objects_enable;
bool EXT_separate_shader_objects_warn;
bool EXT_shader_integer_mix_enable;
bool EXT_shader_integer_mix_warn;
+ bool EXT_shader_samples_identical_enable;
+ bool EXT_shader_samples_identical_warn;
bool EXT_texture_array_enable;
bool EXT_texture_array_warn;
/*@}*/
diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
index 8933b230177..ca520f547a1 100644
--- a/src/glsl/ir.cpp
+++ b/src/glsl/ir.cpp
@@ -1421,12 +1421,11 @@ ir_dereference::is_lvalue() const
}
-static const char * const tex_opcode_strs[] = { "tex", "txb", "txl", "txd", "txf", "txf_ms", "txs", "lod", "tg4", "query_levels", "texture_samples" };
+static const char * const tex_opcode_strs[] = { "tex", "txb", "txl", "txd", "txf", "txf_ms", "txs", "lod", "tg4", "query_levels", "texture_samples", "samples_identical" };
const char *ir_texture::opcode_string()
{
- assert((unsigned int) op <=
- sizeof(tex_opcode_strs) / sizeof(tex_opcode_strs[0]));
+ assert((unsigned int) op < ARRAY_SIZE(tex_opcode_strs));
return tex_opcode_strs[op];
}
@@ -1456,6 +1455,10 @@ ir_texture::set_sampler(ir_dereference *sampler, const glsl_type *type)
} else if (this->op == ir_lod) {
assert(type->vector_elements == 2);
assert(type->base_type == GLSL_TYPE_FLOAT);
+ } else if (this->op == ir_samples_identical) {
+ assert(type == glsl_type::bool_type);
+ assert(sampler->type->base_type == GLSL_TYPE_SAMPLER);
+ assert(sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_MS);
} else {
assert(sampler->type->sampler_type == (int) type->base_type);
if (sampler->type->sampler_shadow)
@@ -1676,6 +1679,7 @@ ir_variable::ir_variable(const struct glsl_type *type, const char *name,
this->data.interpolation = INTERP_QUALIFIER_NONE;
this->data.max_array_access = 0;
this->data.atomic.offset = 0;
+ this->data.precision = GLSL_PRECISION_NONE;
this->data.image_read_only = false;
this->data.image_write_only = false;
this->data.image_coherent = false;
@@ -1842,6 +1846,7 @@ ir_function_signature::replace_parameters(exec_list *new_params)
ir_function::ir_function(const char *name)
: ir_instruction(ir_type_function)
{
+ this->subroutine_index = -1;
this->name = ralloc_strdup(this, name);
}
diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index d59dee1e369..e1109eec1d3 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -1171,6 +1171,8 @@ public:
*/
int num_subroutine_types;
const struct glsl_type **subroutine_types;
+
+ int subroutine_index;
};
inline const char *ir_function_signature::function_name() const
@@ -1965,6 +1967,7 @@ enum ir_texture_opcode {
ir_tg4, /**< Texture gather */
ir_query_levels, /**< Texture levels query */
ir_texture_samples, /**< Texture samples query */
+ ir_samples_identical, /**< Query whether all samples are definitely identical. */
};
@@ -1991,6 +1994,7 @@ enum ir_texture_opcode {
* (lod <type> <sampler> <coordinate>)
* (tg4 <type> <sampler> <coordinate> <offset> <component>)
* (query_levels <type> <sampler>)
+ * (samples_identical <sampler> <coordinate>)
*/
class ir_texture : public ir_rvalue {
public:
diff --git a/src/glsl/ir_clone.cpp b/src/glsl/ir_clone.cpp
index d6b06eeec87..2aef4fcb4ac 100644
--- a/src/glsl/ir_clone.cpp
+++ b/src/glsl/ir_clone.cpp
@@ -223,6 +223,7 @@ ir_texture::clone(void *mem_ctx, struct hash_table *ht) const
case ir_lod:
case ir_query_levels:
case ir_texture_samples:
+ case ir_samples_identical:
break;
case ir_txb:
new_tex->lod_info.bias = this->lod_info.bias->clone(mem_ctx, ht);
@@ -269,6 +270,7 @@ ir_function::clone(void *mem_ctx, struct hash_table *ht) const
ir_function *copy = new(mem_ctx) ir_function(this->name);
copy->is_subroutine = this->is_subroutine;
+ copy->subroutine_index = this->subroutine_index;
copy->num_subroutine_types = this->num_subroutine_types;
copy->subroutine_types = ralloc_array(mem_ctx, const struct glsl_type *, copy->num_subroutine_types);
for (int i = 0; i < copy->num_subroutine_types; i++)
diff --git a/src/glsl/ir_equals.cpp b/src/glsl/ir_equals.cpp
index 5f0785e0ece..b86f4ea16bb 100644
--- a/src/glsl/ir_equals.cpp
+++ b/src/glsl/ir_equals.cpp
@@ -58,8 +58,13 @@ ir_constant::equals(const ir_instruction *ir, enum ir_node_type) const
return false;
for (unsigned i = 0; i < type->components(); i++) {
- if (value.u[i] != other->value.u[i])
- return false;
+ if (type->base_type == GLSL_TYPE_DOUBLE) {
+ if (value.d[i] != other->value.d[i])
+ return false;
+ } else {
+ if (value.u[i] != other->value.u[i])
+ return false;
+ }
}
return true;
@@ -152,6 +157,7 @@ ir_texture::equals(const ir_instruction *ir, enum ir_node_type ignore) const
case ir_lod:
case ir_query_levels:
case ir_texture_samples:
+ case ir_samples_identical:
break;
case ir_txb:
if (!lod_info.bias->equals(other->lod_info.bias, ignore))
diff --git a/src/glsl/ir_hv_accept.cpp b/src/glsl/ir_hv_accept.cpp
index 6495cc4581d..213992af28c 100644
--- a/src/glsl/ir_hv_accept.cpp
+++ b/src/glsl/ir_hv_accept.cpp
@@ -195,6 +195,7 @@ ir_texture::accept(ir_hierarchical_visitor *v)
case ir_lod:
case ir_query_levels:
case ir_texture_samples:
+ case ir_samples_identical:
break;
case ir_txb:
s = this->lod_info.bias->accept(v);
diff --git a/src/glsl/ir_print_visitor.cpp b/src/glsl/ir_print_visitor.cpp
index 42b03fdea52..fd7bc2eea98 100644
--- a/src/glsl/ir_print_visitor.cpp
+++ b/src/glsl/ir_print_visitor.cpp
@@ -268,6 +268,14 @@ void ir_print_visitor::visit(ir_texture *ir)
{
fprintf(f, "(%s ", ir->opcode_string());
+ if (ir->op == ir_samples_identical) {
+ ir->sampler->accept(this);
+ fprintf(f, " ");
+ ir->coordinate->accept(this);
+ fprintf(f, ")");
+ return;
+ }
+
print_type(f, ir->type);
fprintf(f, " ");
@@ -334,6 +342,8 @@ void ir_print_visitor::visit(ir_texture *ir)
case ir_tg4:
ir->lod_info.component->accept(this);
break;
+ case ir_samples_identical:
+ unreachable(!"ir_samples_identical was already handled");
};
fprintf(f, ")");
}
diff --git a/src/glsl/ir_rvalue_visitor.cpp b/src/glsl/ir_rvalue_visitor.cpp
index a6966f546bc..6486838b8b8 100644
--- a/src/glsl/ir_rvalue_visitor.cpp
+++ b/src/glsl/ir_rvalue_visitor.cpp
@@ -59,6 +59,7 @@ ir_rvalue_base_visitor::rvalue_visit(ir_texture *ir)
case ir_lod:
case ir_query_levels:
case ir_texture_samples:
+ case ir_samples_identical:
break;
case ir_txb:
handle_rvalue(&ir->lod_info.bias);
diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 7e77a675db1..c0b4b3e820c 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -766,7 +766,7 @@ public:
gl_shader_stage consumer_stage);
~varying_matches();
void record(ir_variable *producer_var, ir_variable *consumer_var);
- unsigned assign_locations();
+ unsigned assign_locations(uint64_t reserved_slots);
void store_locations() const;
private:
@@ -986,7 +986,7 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
* passed to varying_matches::record().
*/
unsigned
-varying_matches::assign_locations()
+varying_matches::assign_locations(uint64_t reserved_slots)
{
/* Sort varying matches into an order that makes them easy to pack. */
qsort(this->matches, this->num_matches, sizeof(*this->matches),
@@ -1013,6 +1013,10 @@ varying_matches::assign_locations()
!= this->matches[i].packing_class) {
*location = ALIGN(*location, 4);
}
+ while ((*location < MAX_VARYING * 4u) &&
+ (reserved_slots & (UINT64_C(1) << *location / 4u))) {
+ *location = ALIGN(*location + 1, 4);
+ }
this->matches[i].generic_location = *location;
@@ -1376,6 +1380,38 @@ canonicalize_shader_io(exec_list *ir, enum ir_variable_mode io_mode)
}
/**
+ * Generate a bitfield map of the explicit locations for shader varyings.
+ *
+ * In theory a 32-bit value would be enough, but a 64-bit value is future-proof.
+ */
+uint64_t
+reserved_varying_slot(struct gl_shader *stage, ir_variable_mode io_mode)
+{
+ assert(io_mode == ir_var_shader_in || io_mode == ir_var_shader_out);
+ assert(MAX_VARYING <= 64); /* avoid an overflow of the returned value */
+
+ uint64_t slots = 0;
+ int var_slot;
+
+ if (!stage)
+ return slots;
+
+ foreach_in_list(ir_instruction, node, stage->ir) {
+ ir_variable *const var = node->as_variable();
+
+ if (var == NULL || var->data.mode != io_mode || !var->data.explicit_location)
+ continue;
+
+ var_slot = var->data.location - VARYING_SLOT_VAR0;
+ if (var_slot >= 0 && var_slot < MAX_VARYING)
+ slots |= UINT64_C(1) << var_slot;
+ }
+
+ return slots;
+}
+
+
+/**
* Assign locations for all variables that are produced in one pipeline stage
* (the "producer") and consumed in the next stage (the "consumer").
*
@@ -1550,7 +1586,11 @@ assign_varying_locations(struct gl_context *ctx,
matches.record(matched_candidate->toplevel_var, NULL);
}
- const unsigned slots_used = matches.assign_locations();
+ const uint64_t reserved_slots =
+ reserved_varying_slot(producer, ir_var_shader_out) |
+ reserved_varying_slot(consumer, ir_var_shader_in);
+
+ const unsigned slots_used = matches.assign_locations(reserved_slots);
matches.store_locations();
for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index db00f8febc6..331d9a28007 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3864,10 +3864,43 @@ link_assign_subroutine_types(struct gl_shader_program *prog)
sh->SubroutineFunctions[sh->NumSubroutineFunctions].types =
ralloc_array(sh, const struct glsl_type *,
fn->num_subroutine_types);
+
+ /* From Section 4.4.4(Subroutine Function Layout Qualifiers) of the
+ * GLSL 4.5 spec:
+ *
+ * "Each subroutine with an index qualifier in the shader must be
+ * given a unique index, otherwise a compile or link error will be
+ * generated."
+ */
+ for (unsigned j = 0; j < sh->NumSubroutineFunctions; j++) {
+ if (sh->SubroutineFunctions[j].index != -1 &&
+ sh->SubroutineFunctions[j].index == fn->subroutine_index) {
+ linker_error(prog, "each subroutine index qualifier in the "
+ "shader must be unique\n");
+ return;
+ }
+ }
+ sh->SubroutineFunctions[sh->NumSubroutineFunctions].index =
+ fn->subroutine_index;
+
for (int j = 0; j < fn->num_subroutine_types; j++)
sh->SubroutineFunctions[sh->NumSubroutineFunctions].types[j] = fn->subroutine_types[j];
sh->NumSubroutineFunctions++;
}
+
+ /* Assign an index to subroutines without an explicit index. */
+ int index = 0;
+ for (unsigned j = 0; j < sh->NumSubroutineFunctions; j++) {
+ while (sh->SubroutineFunctions[j].index == -1) {
+ for (unsigned k = 0; k < sh->NumSubroutineFunctions; k++) {
+ if (sh->SubroutineFunctions[k].index == index)
+ break;
+ else if (k == sh->NumSubroutineFunctions - 1)
+ sh->SubroutineFunctions[j].index = index;
+ }
+ index++;
+ }
+ }
}
}
diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp
index d8df3544f10..a26300d1d26 100644
--- a/src/glsl/nir/glsl_to_nir.cpp
+++ b/src/glsl/nir/glsl_to_nir.cpp
@@ -31,6 +31,7 @@
#include "ir_visitor.h"
#include "ir_hierarchical_visitor.h"
#include "ir.h"
+#include "main/imports.h"
/*
* pass to lower GLSL IR to NIR
@@ -147,16 +148,10 @@ glsl_to_nir(const struct gl_shader_program *shader_prog,
nir_lower_outputs_to_temporaries(shader);
- /* TODO: Use _mesa_fls instead */
- unsigned num_textures = 0;
- for (unsigned i = 0; i < 8 * sizeof(sh->Program->SamplersUsed); i++)
- if (sh->Program->SamplersUsed & (1 << i))
- num_textures = i;
-
shader->info.name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name);
if (shader_prog->Label)
shader->info.label = ralloc_strdup(shader, shader_prog->Label);
- shader->info.num_textures = num_textures;
+ shader->info.num_textures = _mesa_fls(sh->Program->SamplersUsed);
shader->info.num_ubos = sh->NumUniformBlocks;
shader->info.num_abos = shader_prog->NumAtomicBuffers;
shader->info.num_ssbos = sh->NumShaderStorageBlocks;
@@ -174,6 +169,10 @@ glsl_to_nir(const struct gl_shader_program *shader_prog,
shader_prog->TransformFeedback.NumVarying > 0;
switch (stage) {
+ case MESA_SHADER_TESS_CTRL:
+ shader->info.tcs.vertices_out = shader_prog->TessCtrl.VerticesOut;
+ break;
+
case MESA_SHADER_GEOMETRY:
shader->info.gs.vertices_in = shader_prog->Geom.VerticesIn;
shader->info.gs.output_primitive = sh->Geom.OutputType;
@@ -244,6 +243,8 @@ constant_copy(ir_constant *ir, void *mem_ctx)
unsigned total_elems = ir->type->components();
unsigned i;
+
+ ret->num_elements = 0;
switch (ir->type->base_type) {
case GLSL_TYPE_UINT:
for (i = 0; i < total_elems; i++)
@@ -268,6 +269,8 @@ constant_copy(ir_constant *ir, void *mem_ctx)
case GLSL_TYPE_STRUCT:
ret->elements = ralloc_array(mem_ctx, nir_constant *,
ir->type->length);
+ ret->num_elements = ir->type->length;
+
i = 0;
foreach_in_list(ir_constant, field, &ir->components) {
ret->elements[i] = constant_copy(field, mem_ctx);
@@ -278,6 +281,7 @@ constant_copy(ir_constant *ir, void *mem_ctx)
case GLSL_TYPE_ARRAY:
ret->elements = ralloc_array(mem_ctx, nir_constant *,
ir->type->length);
+ ret->num_elements = ir->type->length;
for (i = 0; i < ir->type->length; i++)
ret->elements[i] = constant_copy(ir->array_elements[i], mem_ctx);
@@ -297,15 +301,6 @@ nir_visitor::visit(ir_variable *ir)
var->type = ir->type;
var->name = ralloc_strdup(var, ir->name);
- if (ir->is_interface_instance() && ir->get_max_ifc_array_access() != NULL) {
- unsigned size = ir->get_interface_type()->length;
- var->max_ifc_array_access = ralloc_array(var, unsigned, size);
- memcpy(var->max_ifc_array_access, ir->get_max_ifc_array_access(),
- size * sizeof(unsigned));
- } else {
- var->max_ifc_array_access = NULL;
- }
-
var->data.read_only = ir->data.read_only;
var->data.centroid = ir->data.centroid;
var->data.sample = ir->data.sample;
@@ -1543,9 +1538,9 @@ nir_visitor::visit(ir_expression *ir)
result = supports_ints ? nir_ior(&b, srcs[0], srcs[1])
: nir_for(&b, srcs[0], srcs[1]);
break;
- case ir_binop_logic_xor: result = nir_ixor(&b, srcs[0], srcs[1]); break;
- result = supports_ints ? nir_ior(&b, srcs[0], srcs[1])
- : nir_for(&b, srcs[0], srcs[1]);
+ case ir_binop_logic_xor:
+ result = supports_ints ? nir_ixor(&b, srcs[0], srcs[1])
+ : nir_fxor(&b, srcs[0], srcs[1]);
break;
case ir_binop_lshift: result = nir_ishl(&b, srcs[0], srcs[1]); break;
case ir_binop_rshift:
@@ -1808,6 +1803,11 @@ nir_visitor::visit(ir_texture *ir)
num_srcs = 0;
break;
+ case ir_samples_identical:
+ op = nir_texop_samples_identical;
+ num_srcs = 1; /* coordinate */
+ break;
+
default:
unreachable("not reached");
}
@@ -1835,8 +1835,9 @@ nir_visitor::visit(ir_texture *ir)
case GLSL_TYPE_INT:
instr->dest_type = nir_type_int;
break;
+ case GLSL_TYPE_BOOL:
case GLSL_TYPE_UINT:
- instr->dest_type = nir_type_unsigned;
+ instr->dest_type = nir_type_uint;
break;
default:
unreachable("not reached");
diff --git a/src/glsl/nir/glsl_types.cpp b/src/glsl/nir/glsl_types.cpp
index 3e9d38f7707..64b5c0cb106 100644
--- a/src/glsl/nir/glsl_types.cpp
+++ b/src/glsl/nir/glsl_types.cpp
@@ -130,6 +130,7 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
this->fields.structure[i].image_coherent = fields[i].image_coherent;
this->fields.structure[i].image_volatile = fields[i].image_volatile;
this->fields.structure[i].image_restrict = fields[i].image_restrict;
+ this->fields.structure[i].precision = fields[i].precision;
}
mtx_unlock(&glsl_type::mutex);
diff --git a/src/glsl/nir/glsl_types.h b/src/glsl/nir/glsl_types.h
index 14c2aa49f85..1aafa5cd547 100644
--- a/src/glsl/nir/glsl_types.h
+++ b/src/glsl/nir/glsl_types.h
@@ -858,7 +858,7 @@ struct glsl_struct_field {
/**
* Precision qualifier
*/
- unsigned precision;
+ unsigned precision:2;
/**
* Image qualifiers, applicable to buffer variables defined in shader
@@ -873,7 +873,8 @@ struct glsl_struct_field {
#ifdef __cplusplus
glsl_struct_field(const struct glsl_type *_type, const char *_name)
: type(_type), name(_name), location(-1), interpolation(0), centroid(0),
- sample(0), matrix_layout(GLSL_MATRIX_LAYOUT_INHERITED), patch(0)
+ sample(0), matrix_layout(GLSL_MATRIX_LAYOUT_INHERITED), patch(0),
+ precision(GLSL_PRECISION_NONE)
{
/* empty */
}
diff --git a/src/glsl/nir/nir.c b/src/glsl/nir/nir.c
index 3157ff82d99..79df6d3df94 100644
--- a/src/glsl/nir/nir.c
+++ b/src/glsl/nir/nir.c
@@ -107,6 +107,10 @@ void
nir_shader_add_variable(nir_shader *shader, nir_variable *var)
{
switch (var->data.mode) {
+ case nir_var_all:
+ assert(!"invalid mode");
+ break;
+
case nir_var_local:
assert(!"nir_shader_add_variable cannot be used for local variables");
break;
@@ -312,6 +316,14 @@ nir_block_create(nir_shader *shader)
block->predecessors = _mesa_set_create(block, _mesa_hash_pointer,
_mesa_key_pointer_equal);
block->imm_dom = NULL;
+ /* XXX maybe it would be worth it to defer allocation? This
+ * way it doesn't get allocated for shader ref's that never run
+ * nir_calc_dominance? For example, state-tracker creates an
+ * initial IR, clones that, runs appropriate lowering pass, passes
+ * to driver which does common lowering/opt, and then stores ref
+ * which is later used to do state-specific lowering and further
+ * opt. Do any of the references not need dominance metadata?
+ */
block->dom_frontier = _mesa_set_create(block, _mesa_hash_pointer,
_mesa_key_pointer_equal);
@@ -1306,21 +1318,62 @@ nir_ssa_def_rewrite_uses(nir_ssa_def *def, nir_src new_src)
{
assert(!new_src.is_ssa || def != new_src.ssa);
- nir_foreach_use_safe(def, use_src) {
- nir_instr *src_parent_instr = use_src->parent_instr;
- list_del(&use_src->use_link);
- nir_src_copy(use_src, &new_src, src_parent_instr);
- src_add_all_uses(use_src, src_parent_instr, NULL);
- }
+ nir_foreach_use_safe(def, use_src)
+ nir_instr_rewrite_src(use_src->parent_instr, use_src, new_src);
+
+ nir_foreach_if_use_safe(def, use_src)
+ nir_if_rewrite_condition(use_src->parent_if, new_src);
+}
+
+static bool
+is_instr_between(nir_instr *start, nir_instr *end, nir_instr *between)
+{
+ assert(start->block == end->block);
+
+ if (between->block != start->block)
+ return false;
+
+ /* Search backwards looking for "between" */
+ while (start != end) {
+ if (between == end)
+ return true;
- nir_foreach_if_use_safe(def, use_src) {
- nir_if *src_parent_if = use_src->parent_if;
- list_del(&use_src->use_link);
- nir_src_copy(use_src, &new_src, src_parent_if);
- src_add_all_uses(use_src, NULL, src_parent_if);
+ end = nir_instr_prev(end);
+ assert(end);
}
+
+ return false;
}
+/* Replaces all uses of the given SSA def with the given source but only if
+ * the use comes after the after_me instruction. This can be useful if you
+ * are emitting code to fix up the result of some instruction: you can freely
+ * use the result in that code and then call rewrite_uses_after and pass the
+ * last fixup instruction as after_me and it will replace all of the uses you
+ * want without touching the fixup code.
+ *
+ * This function assumes that after_me is in the same block as
+ * def->parent_instr and that after_me comes after def->parent_instr.
+ */
+void
+nir_ssa_def_rewrite_uses_after(nir_ssa_def *def, nir_src new_src,
+ nir_instr *after_me)
+{
+ assert(!new_src.is_ssa || def != new_src.ssa);
+
+ nir_foreach_use_safe(def, use_src) {
+ assert(use_src->parent_instr != def->parent_instr);
+ /* Since def already dominates all of its uses, the only way a use can
+ * not be dominated by after_me is if it is between def and after_me in
+ * the instruction list.
+ */
+ if (!is_instr_between(def->parent_instr, after_me, use_src->parent_instr))
+ nir_instr_rewrite_src(use_src->parent_instr, use_src, new_src);
+ }
+
+ nir_foreach_if_use_safe(def, use_src)
+ nir_if_rewrite_condition(use_src->parent_if, new_src);
+}
static bool foreach_cf_node(nir_cf_node *node, nir_foreach_block_cb cb,
bool reverse, void *state);
@@ -1571,6 +1624,8 @@ nir_intrinsic_from_system_value(gl_system_value val)
return nir_intrinsic_load_tess_level_inner;
case SYSTEM_VALUE_VERTICES_IN:
return nir_intrinsic_load_patch_vertices_in;
+ case SYSTEM_VALUE_HELPER_INVOCATION:
+ return nir_intrinsic_load_helper_invocation;
default:
unreachable("system value does not directly correspond to intrinsic");
}
@@ -1614,6 +1669,8 @@ nir_system_value_from_intrinsic(nir_intrinsic_op intrin)
return SYSTEM_VALUE_TESS_LEVEL_INNER;
case nir_intrinsic_load_patch_vertices_in:
return SYSTEM_VALUE_VERTICES_IN;
+ case nir_intrinsic_load_helper_invocation:
+ return SYSTEM_VALUE_HELPER_INVOCATION;
default:
unreachable("intrinsic doesn't produce a system value");
}
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index df0e6f1f54a..b7374e17407 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -82,6 +82,7 @@ typedef struct {
} nir_state_slot;
typedef enum {
+ nir_var_all = -1,
nir_var_shader_in,
nir_var_shader_out,
nir_var_global,
@@ -111,6 +112,11 @@ typedef struct nir_constant {
*/
union nir_constant_data value;
+ /* we could get this from the var->type but makes clone *much* easier to
+ * not have to care about the type.
+ */
+ unsigned num_elements;
+
/* Array elements / Structure Fields */
struct nir_constant **elements;
} nir_constant;
@@ -147,19 +153,6 @@ typedef struct {
*/
char *name;
- /**
- * For variables which satisfy the is_interface_instance() predicate, this
- * points to an array of integers such that if the ith member of the
- * interface block is an array, max_ifc_array_access[i] is the maximum
- * array element of that member that has been accessed. If the ith member
- * of the interface block is not an array, max_ifc_array_access[i] is
- * unused.
- *
- * For variables whose type is not an interface block, this pointer is
- * NULL.
- */
- unsigned *max_ifc_array_access;
-
struct nir_variable_data {
/**
@@ -654,7 +647,7 @@ typedef enum {
nir_type_invalid = 0, /* Not a valid type */
nir_type_float,
nir_type_int,
- nir_type_unsigned,
+ nir_type_uint,
nir_type_bool
} nir_alu_type;
@@ -977,6 +970,9 @@ typedef enum {
nir_texop_tg4, /**< Texture gather */
nir_texop_query_levels, /**< Texture levels query */
nir_texop_texture_samples, /**< Texture samples query */
+ nir_texop_samples_identical, /**< Query whether all samples are definitely
+ * identical.
+ */
} nir_texop;
typedef struct {
@@ -1069,6 +1065,7 @@ nir_tex_instr_dest_size(nir_tex_instr *instr)
case nir_texop_texture_samples:
case nir_texop_query_levels:
+ case nir_texop_samples_identical:
return 1;
default:
@@ -1079,6 +1076,31 @@ nir_tex_instr_dest_size(nir_tex_instr *instr)
}
}
+/* Returns true if this texture operation queries something about the texture
+ * rather than actually sampling it.
+ */
+static inline bool
+nir_tex_instr_is_query(nir_tex_instr *instr)
+{
+ switch (instr->op) {
+ case nir_texop_txs:
+ case nir_texop_lod:
+ case nir_texop_texture_samples:
+ case nir_texop_query_levels:
+ return true;
+ case nir_texop_tex:
+ case nir_texop_txb:
+ case nir_texop_txl:
+ case nir_texop_txd:
+ case nir_texop_txf:
+ case nir_texop_txf_ms:
+ case nir_texop_tg4:
+ case nir_texop_samples_identical:
+ return false;
+ default:
+ unreachable("Invalid texture opcode");
+ }
+}
+
static inline unsigned
nir_tex_instr_src_size(nir_tex_instr *instr, unsigned src)
{
@@ -1353,6 +1375,7 @@ typedef enum {
nir_metadata_block_index = 0x1,
nir_metadata_dominance = 0x2,
nir_metadata_live_ssa_defs = 0x4,
+ nir_metadata_not_properly_reset = 0x8,
} nir_metadata;
typedef struct {
@@ -1578,6 +1601,11 @@ typedef struct nir_shader_info {
struct {
unsigned local_size[3];
} cs;
+
+ struct {
+ /** The number of vertices in the TCS output patch. */
+ unsigned vertices_out;
+ } tcs;
};
} nir_shader_info;
@@ -1910,6 +1938,8 @@ void nir_ssa_dest_init(nir_instr *instr, nir_dest *dest,
void nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
unsigned num_components, const char *name);
void nir_ssa_def_rewrite_uses(nir_ssa_def *def, nir_src new_src);
+void nir_ssa_def_rewrite_uses_after(nir_ssa_def *def, nir_src new_src,
+ nir_instr *after_me);
/* visits basic blocks in source-code order */
typedef bool (*nir_foreach_block_cb)(nir_block *block, void *state);
@@ -1937,10 +1967,16 @@ void nir_index_blocks(nir_function_impl *impl);
void nir_print_shader(nir_shader *shader, FILE *fp);
void nir_print_instr(const nir_instr *instr, FILE *fp);
+nir_shader * nir_shader_clone(void *mem_ctx, const nir_shader *s);
+
#ifdef DEBUG
void nir_validate_shader(nir_shader *shader);
+void nir_metadata_set_validation_flag(nir_shader *shader);
+void nir_metadata_check_validation_flag(nir_shader *shader);
#else
static inline void nir_validate_shader(nir_shader *shader) { (void) shader; }
+static inline void nir_metadata_set_validation_flag(nir_shader *shader) { (void) shader; }
+static inline void nir_metadata_check_validation_flag(nir_shader *shader) { (void) shader; }
#endif /* DEBUG */
void nir_calc_dominance_impl(nir_function_impl *impl);
@@ -2032,9 +2068,22 @@ typedef struct nir_lower_tex_options {
unsigned saturate_s;
unsigned saturate_t;
unsigned saturate_r;
+
+ /* Bitmask of samplers that need swizzling.
+ *
+ * If (swizzle_result & (1 << sampler_index)), then the swizzle in
+ * swizzles[sampler_index] is applied to the result of the texturing
+ * operation.
+ */
+ unsigned swizzle_result;
+
+ /* A swizzle for each sampler. Values 0-3 represent x, y, z, or w swizzles
+ * while 4 and 5 represent 0 and 1 respectively.
+ */
+ uint8_t swizzles[32][4];
} nir_lower_tex_options;
-void nir_lower_tex(nir_shader *shader,
+bool nir_lower_tex(nir_shader *shader,
const nir_lower_tex_options *options);
void nir_lower_idiv(nir_shader *shader);
diff --git a/src/glsl/nir/nir_builder.h b/src/glsl/nir/nir_builder.h
index 205aa067b0b..fe41c74b608 100644
--- a/src/glsl/nir/nir_builder.h
+++ b/src/glsl/nir/nir_builder.h
@@ -256,7 +256,7 @@ nir_swizzle(nir_builder *build, nir_ssa_def *src, unsigned swiz[4],
{
nir_alu_src alu_src = { NIR_SRC_INIT };
alu_src.src = nir_src_for_ssa(src);
- for (int i = 0; i < 4; i++)
+ for (unsigned i = 0; i < num_components; i++)
alu_src.swizzle[i] = swiz[i];
return use_fmov ? nir_fmov_alu(build, alu_src, num_components) :
@@ -290,6 +290,8 @@ nir_channel(nir_builder *b, nir_ssa_def *def, unsigned c)
/**
* Turns a nir_src into a nir_ssa_def * so it can be passed to
* nir_build_alu()-based builder calls.
+ *
+ * See nir_ssa_for_alu_src() for alu instructions.
*/
static inline nir_ssa_def *
nir_ssa_for_src(nir_builder *build, nir_src src, int num_components)
@@ -305,6 +307,25 @@ nir_ssa_for_src(nir_builder *build, nir_src src, int num_components)
return nir_imov_alu(build, alu, num_components);
}
+/**
+ * Similar to nir_ssa_for_src(), but for alu src's, respecting the
+ * nir_alu_src's swizzle.
+ */
+static inline nir_ssa_def *
+nir_ssa_for_alu_src(nir_builder *build, nir_alu_instr *instr, unsigned srcn)
+{
+ static uint8_t trivial_swizzle[4] = { 0, 1, 2, 3 };
+ nir_alu_src *src = &instr->src[srcn];
+ unsigned num_components = nir_ssa_alu_instr_src_components(instr, srcn);
+
+ if (src->src.is_ssa && (src->src.ssa->num_components == num_components) &&
+ !src->abs && !src->negate &&
+ (memcmp(src->swizzle, trivial_swizzle, num_components) == 0))
+ return src->src.ssa;
+
+ return nir_imov_alu(build, *src, num_components);
+}
+
static inline nir_ssa_def *
nir_load_var(nir_builder *build, nir_variable *var)
{
diff --git a/src/glsl/nir/nir_clone.c b/src/glsl/nir/nir_clone.c
new file mode 100644
index 00000000000..68b72ef5381
--- /dev/null
+++ b/src/glsl/nir/nir_clone.c
@@ -0,0 +1,674 @@
+/*
+ * Copyright © 2015 Red Hat
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_control_flow_private.h"
+
+/* Secret Decoder Ring:
+ * clone_foo():
+ * Allocate and clone a foo.
+ * __clone_foo():
+ * Clone body of foo (ie. parent class, embedded struct, etc)
+ */
+
+typedef struct {
+ /* maps orig ptr -> cloned ptr: */
+ struct hash_table *ptr_table;
+
+ /* List of phi sources. */
+ struct list_head phi_srcs;
+
+ /* new shader object, used as memctx for just about everything else: */
+ nir_shader *ns;
+} clone_state;
+
+static void
+init_clone_state(clone_state *state)
+{
+ state->ptr_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
+ _mesa_key_pointer_equal);
+ list_inithead(&state->phi_srcs);
+}
+
+static void
+free_clone_state(clone_state *state)
+{
+ _mesa_hash_table_destroy(state->ptr_table, NULL);
+}
+
+static void *
+lookup_ptr(clone_state *state, const void *ptr)
+{
+ struct hash_entry *entry;
+
+ if (!ptr)
+ return NULL;
+
+ entry = _mesa_hash_table_search(state->ptr_table, ptr);
+ assert(entry && "Failed to find pointer!");
+ if (!entry)
+ return NULL;
+
+ return entry->data;
+}
+
+static void
+store_ptr(clone_state *state, void *nptr, const void *ptr)
+{
+ _mesa_hash_table_insert(state->ptr_table, ptr, nptr);
+}
+
+static nir_constant *
+clone_constant(clone_state *state, const nir_constant *c, nir_variable *nvar)
+{
+ nir_constant *nc = ralloc(nvar, nir_constant);
+
+ nc->value = c->value;
+ nc->num_elements = c->num_elements;
+ nc->elements = ralloc_array(nvar, nir_constant *, c->num_elements);
+ for (unsigned i = 0; i < c->num_elements; i++) {
+ nc->elements[i] = clone_constant(state, c->elements[i], nvar);
+ }
+
+ return nc;
+}
+
+/* NOTE: for cloning nir_variable's, bypass nir_variable_create to avoid
+ * having to deal with locals and globals separately:
+ */
+static nir_variable *
+clone_variable(clone_state *state, const nir_variable *var)
+{
+ nir_variable *nvar = rzalloc(state->ns, nir_variable);
+ store_ptr(state, nvar, var);
+
+ nvar->type = var->type;
+ nvar->name = ralloc_strdup(nvar, var->name);
+ nvar->data = var->data;
+ nvar->num_state_slots = var->num_state_slots;
+ nvar->state_slots = ralloc_array(nvar, nir_state_slot, var->num_state_slots);
+ memcpy(nvar->state_slots, var->state_slots,
+ var->num_state_slots * sizeof(nir_state_slot));
+ if (var->constant_initializer) {
+ nvar->constant_initializer =
+ clone_constant(state, var->constant_initializer, nvar);
+ }
+ nvar->interface_type = var->interface_type;
+
+ return nvar;
+}
+
+/* clone list of nir_variable: */
+static void
+clone_var_list(clone_state *state, struct exec_list *dst,
+ const struct exec_list *list)
+{
+ exec_list_make_empty(dst);
+ foreach_list_typed(nir_variable, var, node, list) {
+ nir_variable *nvar = clone_variable(state, var);
+ exec_list_push_tail(dst, &nvar->node);
+ }
+}
+
+/* NOTE: for cloning nir_register's, bypass nir_global/local_reg_create()
+ * to avoid having to deal with locals and globals separately:
+ */
+static nir_register *
+clone_register(clone_state *state, const nir_register *reg)
+{
+ nir_register *nreg = rzalloc(state->ns, nir_register);
+ store_ptr(state, nreg, reg);
+
+ nreg->num_components = reg->num_components;
+ nreg->num_array_elems = reg->num_array_elems;
+ nreg->index = reg->index;
+ nreg->name = ralloc_strdup(nreg, reg->name);
+ nreg->is_global = reg->is_global;
+ nreg->is_packed = reg->is_packed;
+
+ /* reconstructing uses/defs/if_uses handled by nir_instr_insert() */
+ list_inithead(&nreg->uses);
+ list_inithead(&nreg->defs);
+ list_inithead(&nreg->if_uses);
+
+ return nreg;
+}
+
+/* clone list of nir_register: */
+static void
+clone_reg_list(clone_state *state, struct exec_list *dst,
+ const struct exec_list *list)
+{
+ exec_list_make_empty(dst);
+ foreach_list_typed(nir_register, reg, node, list) {
+ nir_register *nreg = clone_register(state, reg);
+ exec_list_push_tail(dst, &nreg->node);
+ }
+}
+
+static void
+__clone_src(clone_state *state, void *ninstr_or_if,
+ nir_src *nsrc, const nir_src *src)
+{
+ nsrc->is_ssa = src->is_ssa;
+ if (src->is_ssa) {
+ nsrc->ssa = lookup_ptr(state, src->ssa);
+ } else {
+ nsrc->reg.reg = lookup_ptr(state, src->reg.reg);
+ if (src->reg.indirect) {
+ nsrc->reg.indirect = ralloc(ninstr_or_if, nir_src);
+ __clone_src(state, ninstr_or_if, nsrc->reg.indirect, src->reg.indirect);
+ }
+ nsrc->reg.base_offset = src->reg.base_offset;
+ }
+}
+
+static void
+__clone_dst(clone_state *state, nir_instr *ninstr,
+ nir_dest *ndst, const nir_dest *dst)
+{
+ ndst->is_ssa = dst->is_ssa;
+ if (dst->is_ssa) {
+ nir_ssa_dest_init(ninstr, ndst, dst->ssa.num_components, dst->ssa.name);
+ store_ptr(state, &ndst->ssa, &dst->ssa);
+ } else {
+ ndst->reg.reg = lookup_ptr(state, dst->reg.reg);
+ if (dst->reg.indirect) {
+ ndst->reg.indirect = ralloc(ninstr, nir_src);
+ __clone_src(state, ninstr, ndst->reg.indirect, dst->reg.indirect);
+ }
+ ndst->reg.base_offset = dst->reg.base_offset;
+ }
+}
+
+static nir_deref *clone_deref(clone_state *state, const nir_deref *deref,
+ nir_instr *ninstr, nir_deref *parent);
+
+static nir_deref_var *
+clone_deref_var(clone_state *state, const nir_deref_var *dvar,
+ nir_instr *ninstr)
+{
+ nir_variable *nvar = lookup_ptr(state, dvar->var);
+ nir_deref_var *ndvar = nir_deref_var_create(ninstr, nvar);
+
+ if (dvar->deref.child)
+ ndvar->deref.child = clone_deref(state, dvar->deref.child,
+ ninstr, &ndvar->deref);
+
+ return ndvar;
+}
+
+static nir_deref_array *
+clone_deref_array(clone_state *state, const nir_deref_array *darr,
+ nir_instr *ninstr, nir_deref *parent)
+{
+ nir_deref_array *ndarr = nir_deref_array_create(parent);
+
+ ndarr->deref.type = darr->deref.type;
+ if (darr->deref.child)
+ ndarr->deref.child = clone_deref(state, darr->deref.child,
+ ninstr, &ndarr->deref);
+
+ ndarr->deref_array_type = darr->deref_array_type;
+ ndarr->base_offset = darr->base_offset;
+ if (ndarr->deref_array_type == nir_deref_array_type_indirect)
+ __clone_src(state, ninstr, &ndarr->indirect, &darr->indirect);
+
+ return ndarr;
+}
+
+static nir_deref_struct *
+clone_deref_struct(clone_state *state, const nir_deref_struct *dstr,
+ nir_instr *ninstr, nir_deref *parent)
+{
+ nir_deref_struct *ndstr = nir_deref_struct_create(parent, dstr->index);
+
+ ndstr->deref.type = dstr->deref.type;
+ if (dstr->deref.child)
+ ndstr->deref.child = clone_deref(state, dstr->deref.child,
+ ninstr, &ndstr->deref);
+
+ return ndstr;
+}
+
+static nir_deref *
+clone_deref(clone_state *state, const nir_deref *dref,
+ nir_instr *ninstr, nir_deref *parent)
+{
+ switch (dref->deref_type) {
+ case nir_deref_type_array:
+ return &clone_deref_array(state, nir_deref_as_array(dref),
+ ninstr, parent)->deref;
+ case nir_deref_type_struct:
+ return &clone_deref_struct(state, nir_deref_as_struct(dref),
+ ninstr, parent)->deref;
+ default:
+ unreachable("bad deref type");
+ return NULL;
+ }
+}
+
+static nir_alu_instr *
+clone_alu(clone_state *state, const nir_alu_instr *alu)
+{
+ nir_alu_instr *nalu = nir_alu_instr_create(state->ns, alu->op);
+
+ __clone_dst(state, &nalu->instr, &nalu->dest.dest, &alu->dest.dest);
+ nalu->dest.saturate = alu->dest.saturate;
+ nalu->dest.write_mask = alu->dest.write_mask;
+
+ for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
+ __clone_src(state, &nalu->instr, &nalu->src[i].src, &alu->src[i].src);
+ nalu->src[i].negate = alu->src[i].negate;
+ nalu->src[i].abs = alu->src[i].abs;
+ memcpy(nalu->src[i].swizzle, alu->src[i].swizzle,
+ sizeof(nalu->src[i].swizzle));
+ }
+
+ return nalu;
+}
+
+static nir_intrinsic_instr *
+clone_intrinsic(clone_state *state, const nir_intrinsic_instr *itr)
+{
+ nir_intrinsic_instr *nitr =
+ nir_intrinsic_instr_create(state->ns, itr->intrinsic);
+
+ unsigned num_variables = nir_intrinsic_infos[itr->intrinsic].num_variables;
+ unsigned num_srcs = nir_intrinsic_infos[itr->intrinsic].num_srcs;
+
+ if (nir_intrinsic_infos[itr->intrinsic].has_dest)
+ __clone_dst(state, &nitr->instr, &nitr->dest, &itr->dest);
+
+ nitr->num_components = itr->num_components;
+ memcpy(nitr->const_index, itr->const_index, sizeof(nitr->const_index));
+
+ for (unsigned i = 0; i < num_variables; i++) {
+ nitr->variables[i] = clone_deref_var(state, itr->variables[i],
+ &nitr->instr);
+ }
+
+ for (unsigned i = 0; i < num_srcs; i++)
+ __clone_src(state, &nitr->instr, &nitr->src[i], &itr->src[i]);
+
+ return nitr;
+}
+
+static nir_load_const_instr *
+clone_load_const(clone_state *state, const nir_load_const_instr *lc)
+{
+ nir_load_const_instr *nlc =
+ nir_load_const_instr_create(state->ns, lc->def.num_components);
+
+ memcpy(&nlc->value, &lc->value, sizeof(nlc->value));
+
+ store_ptr(state, &nlc->def, &lc->def);
+
+ return nlc;
+}
+
+static nir_ssa_undef_instr *
+clone_ssa_undef(clone_state *state, const nir_ssa_undef_instr *sa)
+{
+ nir_ssa_undef_instr *nsa =
+ nir_ssa_undef_instr_create(state->ns, sa->def.num_components);
+
+ store_ptr(state, &nsa->def, &sa->def);
+
+ return nsa;
+}
+
+static nir_tex_instr *
+clone_tex(clone_state *state, const nir_tex_instr *tex)
+{
+ nir_tex_instr *ntex = nir_tex_instr_create(state->ns, tex->num_srcs);
+
+ ntex->sampler_dim = tex->sampler_dim;
+ ntex->dest_type = tex->dest_type;
+ ntex->op = tex->op;
+ __clone_dst(state, &ntex->instr, &ntex->dest, &tex->dest);
+ for (unsigned i = 0; i < ntex->num_srcs; i++) {
+ ntex->src[i].src_type = tex->src[i].src_type;
+ __clone_src(state, &ntex->instr, &ntex->src[i].src, &tex->src[i].src);
+ }
+ ntex->coord_components = tex->coord_components;
+ ntex->is_array = tex->is_array;
+ ntex->is_shadow = tex->is_shadow;
+ ntex->is_new_style_shadow = tex->is_new_style_shadow;
+ memcpy(ntex->const_offset, tex->const_offset, sizeof(ntex->const_offset));
+ ntex->component = tex->component;
+ ntex->texture_index = tex->texture_index;
+ ntex->texture_array_size = tex->texture_array_size;
+ if (tex->texture)
+ ntex->texture = clone_deref_var(state, tex->texture, &ntex->instr);
+ ntex->sampler_index = tex->sampler_index;
+ if (tex->sampler)
+ ntex->sampler = clone_deref_var(state, tex->sampler, &ntex->instr);
+
+ return ntex;
+}
+
+static nir_phi_instr *
+clone_phi(clone_state *state, const nir_phi_instr *phi, nir_block *nblk)
+{
+ nir_phi_instr *nphi = nir_phi_instr_create(state->ns);
+
+ __clone_dst(state, &nphi->instr, &nphi->dest, &phi->dest);
+
+ /* Cloning a phi node is a bit different from other instructions. The
+ * sources of phi instructions are the only time where we can use an SSA
+ * def before it is defined. In order to handle this, we just copy over
+ * the sources from the old phi instruction directly and then fix them up
+ * in a second pass once all the instructions in the function have been
+ * properly cloned.
+ *
+ * In order to ensure that the copied sources (which are the same as the
+ * old phi instruction's sources for now) don't get inserted into the old
+ * shader's use-def lists, we have to add the phi instruction *before* we
+ * set up its sources.
+ */
+ nir_instr_insert_after_block(nblk, &nphi->instr);
+
+ foreach_list_typed(nir_phi_src, src, node, &phi->srcs) {
+ nir_phi_src *nsrc = ralloc(nphi, nir_phi_src);
+
+ /* Just copy the old source for now. */
+ memcpy(nsrc, src, sizeof(*src));
+
+ /* Since we're not letting nir_insert_instr handle use/def stuff for us,
+ * we have to set the parent_instr manually. It doesn't really matter
+ * when we do it, so we might as well do it here.
+ */
+ nsrc->src.parent_instr = &nphi->instr;
+
+ /* Stash it in the list of phi sources. We'll walk this list and fix up
+ * sources at the very end of clone_function_impl.
+ */
+ list_add(&nsrc->src.use_link, &state->phi_srcs);
+
+ exec_list_push_tail(&nphi->srcs, &nsrc->node);
+ }
+
+ return nphi;
+}
+
+/* Clone a jump instruction.  Jumps carry no sources or dests, so only
+ * the jump type needs to be copied.
+ */
+static nir_jump_instr *
+clone_jump(clone_state *state, const nir_jump_instr *jmp)
+{
+ nir_jump_instr *njmp = nir_jump_instr_create(state->ns, jmp->type);
+
+ return njmp;
+}
+
+/* Clone a call instruction.  The callee overload must already be in the
+ * remap table (overloads are cloned before any function_impls — see
+ * nir_shader_clone), so we look it up rather than cloning it here.
+ */
+static nir_call_instr *
+clone_call(clone_state *state, const nir_call_instr *call)
+{
+ nir_function_overload *ncallee = lookup_ptr(state, call->callee);
+ nir_call_instr *ncall = nir_call_instr_create(state->ns, ncallee);
+
+ for (unsigned i = 0; i < ncall->num_params; i++)
+ ncall->params[i] = clone_deref_var(state, call->params[i], &ncall->instr);
+
+ ncall->return_deref = clone_deref_var(state, call->return_deref,
+ &ncall->instr);
+
+ return ncall;
+}
+
+/* Dispatch cloning by instruction type.  Phis are handled separately by
+ * clone_phi (they need deferred source fixup), and parallel copies only
+ * exist transiently during out-of-SSA, so both are unreachable here.
+ */
+static nir_instr *
+clone_instr(clone_state *state, const nir_instr *instr)
+{
+ switch (instr->type) {
+ case nir_instr_type_alu:
+ return &clone_alu(state, nir_instr_as_alu(instr))->instr;
+ case nir_instr_type_intrinsic:
+ return &clone_intrinsic(state, nir_instr_as_intrinsic(instr))->instr;
+ case nir_instr_type_load_const:
+ return &clone_load_const(state, nir_instr_as_load_const(instr))->instr;
+ case nir_instr_type_ssa_undef:
+ return &clone_ssa_undef(state, nir_instr_as_ssa_undef(instr))->instr;
+ case nir_instr_type_tex:
+ return &clone_tex(state, nir_instr_as_tex(instr))->instr;
+ case nir_instr_type_phi:
+ unreachable("Cannot clone phis with clone_instr");
+ case nir_instr_type_jump:
+ return &clone_jump(state, nir_instr_as_jump(instr))->instr;
+ case nir_instr_type_call:
+ return &clone_call(state, nir_instr_as_call(instr))->instr;
+ case nir_instr_type_parallel_copy:
+ unreachable("Cannot clone parallel copies");
+ default:
+ unreachable("bad instr type");
+ return NULL;
+ }
+}
+
+/* Clone all instructions of blk into the (already existing, empty) block
+ * at the tail of cf_list, and register the block mapping for later phi
+ * predecessor fixup.
+ */
+static nir_block *
+clone_block(clone_state *state, struct exec_list *cf_list, const nir_block *blk)
+{
+ /* Don't actually create a new block. Just use the one from the tail of
+ * the list. NIR guarantees that the tail of the list is a block and that
+ * no two blocks are side-by-side in the IR; It should be empty.
+ */
+ nir_block *nblk =
+ exec_node_data(nir_block, exec_list_get_tail(cf_list), cf_node.node);
+ assert(nblk->cf_node.type == nir_cf_node_block);
+ assert(exec_list_is_empty(&nblk->instr_list));
+
+ /* We need this for phi sources */
+ store_ptr(state, nblk, blk);
+
+ nir_foreach_instr(blk, instr) {
+ if (instr->type == nir_instr_type_phi) {
+ /* Phi instructions are a bit of a special case when cloning because
+ * we don't want inserting the instruction to automatically handle
+ * use/defs for us. Instead, we need to wait until all the
+ * blocks/instructions are in so that we can set their sources up.
+ */
+ clone_phi(state, nir_instr_as_phi(instr), nblk);
+ } else {
+ nir_instr *ninstr = clone_instr(state, instr);
+ nir_instr_insert_after_block(nblk, ninstr);
+ }
+ }
+
+ return nblk;
+}
+
+/* Forward declaration: clone_if/clone_loop and clone_cf_list are
+ * mutually recursive over the CF tree.
+ */
+static void
+clone_cf_list(clone_state *state, struct exec_list *dst,
+ const struct exec_list *list);
+
+/* Clone an if-statement: condition source, then both branches
+ * (recursively via clone_cf_list), inserted at the end of cf_list.
+ */
+static nir_if *
+clone_if(clone_state *state, struct exec_list *cf_list, const nir_if *i)
+{
+ nir_if *ni = nir_if_create(state->ns);
+
+ __clone_src(state, ni, &ni->condition, &i->condition);
+
+ nir_cf_node_insert_end(cf_list, &ni->cf_node);
+
+ clone_cf_list(state, &ni->then_list, &i->then_list);
+ clone_cf_list(state, &ni->else_list, &i->else_list);
+
+ return ni;
+}
+
+/* Clone a loop node and, recursively, its body; inserted at the end of
+ * cf_list.
+ */
+static nir_loop *
+clone_loop(clone_state *state, struct exec_list *cf_list, const nir_loop *loop)
+{
+ nir_loop *nloop = nir_loop_create(state->ns);
+
+ nir_cf_node_insert_end(cf_list, &nloop->cf_node);
+
+ clone_cf_list(state, &nloop->body, &loop->body);
+
+ return nloop;
+}
+
+/* Clone a list of nir_cf_node (the body of a function, if-branch, or
+ * loop), dispatching on node type.
+ */
+static void
+clone_cf_list(clone_state *state, struct exec_list *dst,
+ const struct exec_list *list)
+{
+ foreach_list_typed(nir_cf_node, cf, node, list) {
+ switch (cf->type) {
+ case nir_cf_node_block:
+ clone_block(state, dst, nir_cf_node_as_block(cf));
+ break;
+ case nir_cf_node_if:
+ clone_if(state, dst, nir_cf_node_as_if(cf));
+ break;
+ case nir_cf_node_loop:
+ clone_loop(state, dst, nir_cf_node_as_loop(cf));
+ break;
+ default:
+ unreachable("bad cf type");
+ }
+ }
+}
+
+/* Clone a function implementation: locals, registers, params, body, then
+ * the deferred phi-source fixup pass.  All previously-cloned pointers
+ * (params, return var, blocks, SSA defs) are resolved via the remap
+ * table.
+ */
+static nir_function_impl *
+clone_function_impl(clone_state *state, const nir_function_impl *fi,
+ nir_function_overload *nfo)
+{
+ nir_function_impl *nfi = nir_function_impl_create(nfo);
+
+ clone_var_list(state, &nfi->locals, &fi->locals);
+ clone_reg_list(state, &nfi->registers, &fi->registers);
+ nfi->reg_alloc = fi->reg_alloc;
+
+ nfi->num_params = fi->num_params;
+ nfi->params = ralloc_array(state->ns, nir_variable *, fi->num_params);
+ for (unsigned i = 0; i < fi->num_params; i++) {
+ nfi->params[i] = lookup_ptr(state, fi->params[i]);
+ }
+ nfi->return_var = lookup_ptr(state, fi->return_var);
+
+ assert(list_empty(&state->phi_srcs));
+
+ clone_cf_list(state, &nfi->body, &fi->body);
+
+ /* After we've cloned almost everything, we have to walk the list of phi
+ * sources and fix them up. Thanks to loops, the block and SSA value for a
+ * phi source may not be defined when we first encounter it. Instead, we
+ * add it to the phi_srcs list and we fix it up here.
+ */
+ list_for_each_entry_safe(nir_phi_src, src, &state->phi_srcs, src.use_link) {
+ src->pred = lookup_ptr(state, src->pred);
+ assert(src->src.is_ssa);
+ src->src.ssa = lookup_ptr(state, src->src.ssa);
+
+ /* Remove from this list and place in the uses of the SSA def */
+ list_del(&src->src.use_link);
+ list_addtail(&src->src.use_link, &src->src.ssa->uses);
+ }
+ assert(list_empty(&state->phi_srcs));
+
+ /* All metadata is invalidated in the cloning process */
+ nfi->valid_metadata = 0;
+
+ return nfi;
+}
+
+/* Clone a function overload's signature only (params, return type) and
+ * register it in the remap table; the impl is cloned in a second pass.
+ */
+static nir_function_overload *
+clone_function_overload(clone_state *state, const nir_function_overload *fo,
+ nir_function *nfxn)
+{
+ nir_function_overload *nfo = nir_function_overload_create(nfxn);
+
+ /* Needed for call instructions */
+ store_ptr(state, nfo, fo);
+
+ nfo->num_params = fo->num_params;
+ nfo->params = ralloc_array(state->ns, nir_parameter, fo->num_params);
+ memcpy(nfo->params, fo->params, sizeof(nir_parameter) * fo->num_params);
+
+ nfo->return_type = fo->return_type;
+
+ /* At first glance, it looks like we should clone the function_impl here.
+ * However, call instructions need to be able to reference at least the
+ * overload and those will get processed as we clone the function_impl's.
+ * We stop here and do function_impls as a second pass.
+ */
+
+ return nfo;
+}
+
+/* Clone a function shell: its name and all of its overloads (signatures
+ * only — impls are cloned later by nir_shader_clone).
+ */
+static nir_function *
+clone_function(clone_state *state, const nir_function *fxn, nir_shader *ns)
+{
+ assert(ns == state->ns);
+ nir_function *nfxn = nir_function_create(ns, fxn->name);
+
+ foreach_list_typed(nir_function_overload, fo, node, &fxn->overload_list)
+ clone_function_overload(state, fo, nfxn);
+
+ return nfxn;
+}
+
+/* Deep-copy an entire nir_shader into a new ralloc context.  Cloning is
+ * done in dependency order: variables first, then function shells and
+ * overloads, then function impls (so calls can resolve their callees),
+ * then global registers and shader info.
+ */
+nir_shader *
+nir_shader_clone(void *mem_ctx, const nir_shader *s)
+{
+ clone_state state;
+ init_clone_state(&state);
+
+ nir_shader *ns = nir_shader_create(mem_ctx, s->stage, s->options);
+ state.ns = ns;
+
+ clone_var_list(&state, &ns->uniforms, &s->uniforms);
+ clone_var_list(&state, &ns->inputs, &s->inputs);
+ clone_var_list(&state, &ns->outputs, &s->outputs);
+ clone_var_list(&state, &ns->globals, &s->globals);
+ clone_var_list(&state, &ns->system_values, &s->system_values);
+
+ /* Go through and clone functions and overloads */
+ foreach_list_typed(nir_function, fxn, node, &s->functions)
+ clone_function(&state, fxn, ns);
+
+ /* Only after all overloads are cloned can we clone the actual function
+ * implementations. This is because nir_call_instr's need to reference the
+ * overloads of other functions and we don't know what order the functions
+ * will have in the list.
+ */
+ nir_foreach_overload(s, fo) {
+ nir_function_overload *nfo = lookup_ptr(&state, fo);
+ clone_function_impl(&state, fo->impl, nfo);
+ }
+
+ clone_reg_list(&state, &ns->registers, &s->registers);
+ ns->reg_alloc = s->reg_alloc;
+
+ /* info is copied by value; the name/label strings must be re-duplicated
+ * onto the new shader's ralloc context so they outlive the old shader.
+ */
+ ns->info = s->info;
+ ns->info.name = ralloc_strdup(ns, ns->info.name);
+ if (ns->info.label)
+ ns->info.label = ralloc_strdup(ns, ns->info.label);
+
+ ns->num_inputs = s->num_inputs;
+ ns->num_uniforms = s->num_uniforms;
+ ns->num_outputs = s->num_outputs;
+
+ free_clone_state(&state);
+
+ return ns;
+}
diff --git a/src/glsl/nir/nir_constant_expressions.py b/src/glsl/nir/nir_constant_expressions.py
index 2ba8554645d..b16ef503c92 100644
--- a/src/glsl/nir/nir_constant_expressions.py
+++ b/src/glsl/nir/nir_constant_expressions.py
@@ -213,7 +213,7 @@ unpack_half_1x16(uint16_t u)
}
/* Some typed vector structures to make things like src0.y work */
-% for type in ["float", "int", "unsigned", "bool"]:
+% for type in ["float", "int", "uint", "bool"]:
struct ${type}_vec {
${type} x;
${type} y;
diff --git a/src/glsl/nir/nir_intrinsics.h b/src/glsl/nir/nir_intrinsics.h
index 0a134aff211..de30db61eea 100644
--- a/src/glsl/nir/nir_intrinsics.h
+++ b/src/glsl/nir/nir_intrinsics.h
@@ -244,6 +244,7 @@ SYSTEM_VALUE(local_invocation_id, 3, 0)
SYSTEM_VALUE(work_group_id, 3, 0)
SYSTEM_VALUE(user_clip_plane, 4, 1) /* const_index[0] is user_clip_plane[idx] */
SYSTEM_VALUE(num_work_groups, 3, 0)
+SYSTEM_VALUE(helper_invocation, 1, 0)
/*
* The format of the indices depends on the type of the load. For uniforms,
diff --git a/src/glsl/nir/nir_lower_clip.c b/src/glsl/nir/nir_lower_clip.c
index 31ccfb2c02b..c58c7785b3f 100644
--- a/src/glsl/nir/nir_lower_clip.c
+++ b/src/glsl/nir/nir_lower_clip.c
@@ -55,9 +55,11 @@ create_clipdist_var(nir_shader *shader, unsigned drvloc,
if (output) {
exec_list_push_tail(&shader->outputs, &var->node);
+ shader->num_outputs++; /* TODO use type_size() */
}
else {
exec_list_push_tail(&shader->inputs, &var->node);
+ shader->num_inputs++; /* TODO use type_size() */
}
return var;
}
diff --git a/src/glsl/nir/nir_lower_idiv.c b/src/glsl/nir/nir_lower_idiv.c
index c961178c53a..f64b3eac8a0 100644
--- a/src/glsl/nir/nir_lower_idiv.c
+++ b/src/glsl/nir/nir_lower_idiv.c
@@ -52,10 +52,8 @@ convert_instr(nir_builder *bld, nir_alu_instr *alu)
bld->cursor = nir_before_instr(&alu->instr);
- numer = nir_ssa_for_src(bld, alu->src[0].src,
- nir_ssa_alu_instr_src_components(alu, 0));
- denom = nir_ssa_for_src(bld, alu->src[1].src,
- nir_ssa_alu_instr_src_components(alu, 1));
+ numer = nir_ssa_for_alu_src(bld, alu, 0);
+ denom = nir_ssa_for_alu_src(bld, alu, 1);
if (is_signed) {
af = nir_i2f(bld, numer);
@@ -96,7 +94,7 @@ convert_instr(nir_builder *bld, nir_alu_instr *alu)
r = nir_imul(bld, q, b);
r = nir_isub(bld, a, r);
- r = nir_ige(bld, r, b);
+ r = nir_uge(bld, r, b);
r = nir_b2i(bld, r);
q = nir_iadd(bld, q, r);
diff --git a/src/glsl/nir/nir_lower_io.c b/src/glsl/nir/nir_lower_io.c
index 00a31458310..5683e69d865 100644
--- a/src/glsl/nir/nir_lower_io.c
+++ b/src/glsl/nir/nir_lower_io.c
@@ -197,7 +197,7 @@ nir_lower_io_block(nir_block *block, void *void_state)
nir_variable_mode mode = intrin->variables[0]->var->data.mode;
- if (state->mode != -1 && state->mode != mode)
+ if (state->mode != nir_var_all && state->mode != mode)
continue;
if (mode != nir_var_shader_in &&
diff --git a/src/glsl/nir/nir_lower_tex.c b/src/glsl/nir/nir_lower_tex.c
index 8aaa48ab568..93ebf8e78a9 100644
--- a/src/glsl/nir/nir_lower_tex.c
+++ b/src/glsl/nir/nir_lower_tex.c
@@ -41,6 +41,7 @@
typedef struct {
nir_builder b;
const nir_lower_tex_options *options;
+ bool progress;
} lower_tex_state;
static void
@@ -133,6 +134,7 @@ get_texture_size(nir_builder *b, nir_tex_instr *tex)
txs->op = nir_texop_txs;
txs->sampler_dim = GLSL_SAMPLER_DIM_RECT;
txs->sampler_index = tex->sampler_index;
+ txs->dest_type = nir_type_int;
/* only single src, the lod: */
txs->src[0].src = nir_src_for_ssa(nir_imm_int(b, 0));
@@ -213,6 +215,66 @@ saturate_src(nir_builder *b, nir_tex_instr *tex, unsigned sat_mask)
}
}
+static nir_ssa_def *
+get_zero_or_one(nir_builder *b, nir_alu_type type, uint8_t swizzle_val)
+{
+ nir_const_value v;
+
+ memset(&v, 0, sizeof(v));
+
+ if (swizzle_val == 4) {
+ v.u[0] = v.u[1] = v.u[2] = v.u[3] = 0;
+ } else {
+ assert(swizzle_val == 5);
+ if (type == nir_type_float)
+ v.f[0] = v.f[1] = v.f[2] = v.f[3] = 1.0;
+ else
+ v.u[0] = v.u[1] = v.u[2] = v.u[3] = 1;
+ }
+
+ return nir_build_imm(b, 4, v);
+}
+
+static void
+swizzle_result(nir_builder *b, nir_tex_instr *tex, const uint8_t swizzle[4])
+{
+ assert(tex->dest.is_ssa);
+
+ b->cursor = nir_after_instr(&tex->instr);
+
+ nir_ssa_def *swizzled;
+ if (tex->op == nir_texop_tg4) {
+ if (swizzle[tex->component] < 4) {
+ /* This one's easy */
+ tex->component = swizzle[tex->component];
+ return;
+ } else {
+ swizzled = get_zero_or_one(b, tex->dest_type, swizzle[tex->component]);
+ }
+ } else {
+ assert(nir_tex_instr_dest_size(tex) == 4);
+ if (swizzle[0] < 4 && swizzle[1] < 4 &&
+ swizzle[2] < 4 && swizzle[3] < 4) {
+ unsigned swiz[4] = { swizzle[0], swizzle[1], swizzle[2], swizzle[3] };
+ /* We have no 0's or 1's, just emit a swizzling MOV */
+ swizzled = nir_swizzle(b, &tex->dest.ssa, swiz, 4, false);
+ } else {
+ nir_ssa_def *srcs[4];
+ for (unsigned i = 0; i < 4; i++) {
+ if (swizzle[i] < 4) {
+ srcs[i] = nir_channel(b, &tex->dest.ssa, swizzle[i]);
+ } else {
+ srcs[i] = get_zero_or_one(b, tex->dest_type, swizzle[i]);
+ }
+ }
+ swizzled = nir_vec(b, srcs, 4);
+ }
+ }
+
+ nir_ssa_def_rewrite_uses_after(&tex->dest.ssa, nir_src_for_ssa(swizzled),
+ swizzled->parent_instr);
+}
+
static bool
nir_lower_tex_block(nir_block *block, void *void_state)
{
@@ -239,15 +301,28 @@ nir_lower_tex_block(nir_block *block, void *void_state)
/* If we are clamping any coords, we must lower projector first
* as clamping happens *after* projection:
*/
- if (lower_txp || sat_mask)
+ if (lower_txp || sat_mask) {
project_src(b, tex);
+ state->progress = true;
+ }
if ((tex->sampler_dim == GLSL_SAMPLER_DIM_RECT) &&
- state->options->lower_rect)
+ state->options->lower_rect) {
lower_rect(b, tex);
+ state->progress = true;
+ }
- if (sat_mask)
+ if (sat_mask) {
saturate_src(b, tex, sat_mask);
+ state->progress = true;
+ }
+
+ if (((1 << tex->sampler_index) & state->options->swizzle_result) &&
+ !nir_tex_instr_is_query(tex) &&
+ !(tex->is_shadow && tex->is_new_style_shadow)) {
+ swizzle_result(b, tex, state->options->swizzles[tex->sampler_index]);
+ state->progress = true;
+ }
}
return true;
@@ -264,13 +339,17 @@ nir_lower_tex_impl(nir_function_impl *impl, lower_tex_state *state)
nir_metadata_dominance);
}
-void
+bool
nir_lower_tex(nir_shader *shader, const nir_lower_tex_options *options)
{
lower_tex_state state;
state.options = options;
+ state.progress = false;
+
nir_foreach_overload(shader, overload) {
if (overload->impl)
nir_lower_tex_impl(overload->impl, &state);
}
+
+ return state.progress;
}
diff --git a/src/glsl/nir/nir_lower_two_sided_color.c b/src/glsl/nir/nir_lower_two_sided_color.c
index db519bf513b..6995b9d6bc1 100644
--- a/src/glsl/nir/nir_lower_two_sided_color.c
+++ b/src/glsl/nir/nir_lower_two_sided_color.c
@@ -60,6 +60,8 @@ create_input(nir_shader *shader, unsigned drvloc, gl_varying_slot slot)
exec_list_push_tail(&shader->inputs, &var->node);
+ shader->num_inputs++; /* TODO use type_size() */
+
return var;
}
diff --git a/src/glsl/nir/nir_metadata.c b/src/glsl/nir/nir_metadata.c
index 6de981f430f..d5324b35a78 100644
--- a/src/glsl/nir/nir_metadata.c
+++ b/src/glsl/nir/nir_metadata.c
@@ -52,3 +52,39 @@ nir_metadata_preserve(nir_function_impl *impl, nir_metadata preserved)
{
impl->valid_metadata &= preserved;
}
+
+#ifdef DEBUG
+/**
+ * Make sure passes properly invalidate metadata (part 1).
+ *
+ * Call this before running a pass to set a bogus metadata flag, which will
+ * only be preserved if the pass forgets to call nir_metadata_preserve().
+ */
+void
+nir_metadata_set_validation_flag(nir_shader *shader)
+{
+ nir_foreach_overload(shader, overload) {
+ if (overload->impl) {
+ overload->impl->valid_metadata |= nir_metadata_not_properly_reset;
+ }
+ }
+}
+
+/**
+ * Make sure passes properly invalidate metadata (part 2).
+ *
+ * Call this after a pass makes progress to verify that the bogus metadata set by
+ * the earlier function was properly thrown away. Note that passes may not call
+ * nir_metadata_preserve() if they don't actually make any changes at all.
+ */
+void
+nir_metadata_check_validation_flag(nir_shader *shader)
+{
+ nir_foreach_overload(shader, overload) {
+ if (overload->impl) {
+ assert(!(overload->impl->valid_metadata &
+ nir_metadata_not_properly_reset));
+ }
+ }
+}
+#endif
diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py
index 3c0f1da94af..37d3dfc4588 100644
--- a/src/glsl/nir/nir_opcodes.py
+++ b/src/glsl/nir/nir_opcodes.py
@@ -91,7 +91,7 @@ class Opcode(object):
tfloat = "float"
tint = "int"
tbool = "bool"
-tunsigned = "unsigned"
+tuint = "uint"
commutative = "commutative "
associative = "associative "
@@ -156,7 +156,7 @@ unop("fsqrt", tfloat, "sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
unop_convert("f2i", tfloat, tint, "src0") # Float-to-integer conversion.
-unop_convert("f2u", tfloat, tunsigned, "src0") # Float-to-unsigned conversion
+unop_convert("f2u", tfloat, tuint, "src0") # Float-to-unsigned conversion
unop_convert("i2f", tint, tfloat, "src0") # Integer-to-float conversion.
# Float-to-boolean conversion
unop_convert("f2b", tfloat, tbool, "src0 != 0.0f")
@@ -165,7 +165,7 @@ unop_convert("b2f", tbool, tfloat, "src0 ? 1.0f : 0.0f")
# Int-to-boolean conversion
unop_convert("i2b", tint, tbool, "src0 != 0")
unop_convert("b2i", tbool, tint, "src0 ? 1 : 0") # Boolean-to-int conversion
-unop_convert("u2f", tunsigned, tfloat, "src0") #Unsigned-to-float conversion.
+unop_convert("u2f", tuint, tfloat, "src0") # Unsigned-to-float conversion.
unop_reduce("bany", 1, tbool, tbool, "{src}", "{src0} || {src1}", "{src}")
unop_reduce("ball", 1, tbool, tbool, "{src}", "{src0} && {src1}", "{src}")
@@ -205,13 +205,13 @@ unop("fddy_coarse", tfloat, "0.0f")
# Floating point pack and unpack operations.
def pack_2x16(fmt):
- unop_horiz("pack_" + fmt + "_2x16", 1, tunsigned, 2, tfloat, """
+ unop_horiz("pack_" + fmt + "_2x16", 1, tuint, 2, tfloat, """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
""".replace("fmt", fmt))
def pack_4x8(fmt):
- unop_horiz("pack_" + fmt + "_4x8", 1, tunsigned, 4, tfloat, """
+ unop_horiz("pack_" + fmt + "_4x8", 1, tuint, 4, tfloat, """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
@@ -219,13 +219,13 @@ dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
""".replace("fmt", fmt))
def unpack_2x16(fmt):
- unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat, 1, tunsigned, """
+ unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat, 1, tuint, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
""".replace("fmt", fmt))
def unpack_4x8(fmt):
- unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat, 1, tunsigned, """
+ unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat, 1, tuint, """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
@@ -248,22 +248,22 @@ unpack_2x16("half")
# Lowered floating point unpacking operations.
-unop_horiz("unpack_half_2x16_split_x", 1, tfloat, 1, tunsigned,
+unop_horiz("unpack_half_2x16_split_x", 1, tfloat, 1, tuint,
"unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
-unop_horiz("unpack_half_2x16_split_y", 1, tfloat, 1, tunsigned,
+unop_horiz("unpack_half_2x16_split_y", 1, tfloat, 1, tuint,
"unpack_half_1x16((uint16_t)(src0.x >> 16))")
# Bit operations, part of ARB_gpu_shader5.
-unop("bitfield_reverse", tunsigned, """
+unop("bitfield_reverse", tuint, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
-unop("bit_count", tunsigned, """
+unop("bit_count", tuint, """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
if ((src0 >> bit) & 1)
@@ -271,7 +271,7 @@ for (unsigned bit = 0; bit < 32; bit++) {
}
""")
-unop_convert("ufind_msb", tunsigned, tint, """
+unop_convert("ufind_msb", tuint, tint, """
dst = -1;
for (int bit = 31; bit > 0; bit--) {
if ((src0 >> bit) & 1) {
@@ -358,25 +358,25 @@ binop("imul", tint, commutative + associative, "src0 * src1")
binop("imul_high", tint, commutative,
"(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
-binop("umul_high", tunsigned, commutative,
+binop("umul_high", tuint, commutative,
"(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src0 / src1")
-binop("udiv", tunsigned, "", "src0 / src1")
+binop("udiv", tuint, "", "src0 / src1")
# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.
-binop_convert("uadd_carry", tbool, tunsigned, commutative, "src0 + src1 < src0")
+binop_convert("uadd_carry", tbool, tuint, commutative, "src0 + src1 < src0")
# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.
-binop_convert("usub_borrow", tbool, tunsigned, "", "src1 < src0")
+binop_convert("usub_borrow", tbool, tuint, "", "src1 < src0")
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
-binop("umod", tunsigned, "", "src1 == 0 ? 0 : src0 % src1")
+binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")
#
# Comparisons
@@ -393,8 +393,8 @@ binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
-binop_compare("ult", tunsigned, "", "src0 < src1")
-binop_compare("uge", tunsigned, "", "src0 >= src1")
+binop_compare("ult", tuint, "", "src0 < src1")
+binop_compare("uge", tuint, "", "src0 >= src1")
# integer-aware GLSL-style comparisons that compare floats and ints
@@ -425,7 +425,7 @@ binop("sne", tfloat, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not E
binop("ishl", tint, "", "src0 << src1")
binop("ishr", tint, "", "src0 >> src1")
-binop("ushr", tunsigned, "", "src0 >> src1")
+binop("ushr", tuint, "", "src0 >> src1")
# bitwise logic operators
#
@@ -433,9 +433,9 @@ binop("ushr", tunsigned, "", "src0 >> src1")
# integers.
-binop("iand", tunsigned, commutative + associative, "src0 & src1")
-binop("ior", tunsigned, commutative + associative, "src0 | src1")
-binop("ixor", tunsigned, commutative + associative, "src0 ^ src1")
+binop("iand", tuint, commutative + associative, "src0 & src1")
+binop("ior", tuint, commutative + associative, "src0 | src1")
+binop("ixor", tuint, commutative + associative, "src0 ^ src1")
# floating point logic operators
@@ -463,10 +463,10 @@ opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
-binop("umin", tunsigned, commutative + associative, "src1 > src0 ? src0 : src1")
+binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
-binop("umax", tunsigned, commutative + associative, "src1 > src0 ? src1 : src0")
+binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint, commutative + associative, """
@@ -515,10 +515,10 @@ for (int i = 0; i < 32; i += 8) {
binop("fpow", tfloat, "", "powf(src0, src1)")
-binop_horiz("pack_half_2x16_split", 1, tunsigned, 1, tfloat, 1, tfloat,
+binop_horiz("pack_half_2x16_split", 1, tuint, 1, tfloat, 1, tfloat,
"pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
-binop_convert("bfm", tunsigned, tint, "", """
+binop_convert("bfm", tuint, tint, "", """
int offset = src0, bits = src1;
if (offset < 0 || bits < 0 || offset + bits > 32)
dst = 0; /* undefined per the spec */
@@ -535,7 +535,7 @@ if (!isnormal(dst))
# Combines the first component of each input to make a 2-component vector.
-binop_horiz("vec2", 2, tunsigned, 1, tunsigned, 1, tunsigned, """
+binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")
@@ -543,9 +543,9 @@ dst.y = src1.x;
def triop(name, ty, const_expr):
opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
- opcode(name, output_size, tunsigned,
+ opcode(name, output_size, tuint,
[src1_size, src2_size, src3_size],
- [tunsigned, tunsigned, tunsigned], "", const_expr)
+ [tuint, tuint, tuint], "", const_expr)
triop("ffma", tfloat, "src0 * src1 + src2")
@@ -559,11 +559,11 @@ triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")
triop("fcsel", tfloat, "(src0 != 0.0f) ? src1 : src2")
-opcode("bcsel", 0, tunsigned, [0, 0, 0],
- [tbool, tunsigned, tunsigned], "", "src0 ? src1 : src2")
+opcode("bcsel", 0, tuint, [0, 0, 0],
+ [tbool, tuint, tuint], "", "src0 ? src1 : src2")
-triop("bfi", tunsigned, """
-unsigned mask = src0, insert = src1 & mask, base = src2;
+triop("bfi", tuint, """
+unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
dst = base;
} else {
@@ -572,12 +572,12 @@ if (mask == 0) {
tmp >>= 1;
insert <<= 1;
}
- dst = (base & ~mask) | insert;
+ dst = (base & ~mask) | (insert & mask);
}
""")
-opcode("ubitfield_extract", 0, tunsigned,
- [0, 1, 1], [tunsigned, tint, tint], "", """
+opcode("ubitfield_extract", 0, tuint,
+ [0, 1, 1], [tuint, tint, tint], "", """
unsigned base = src0;
int offset = src1.x, bits = src2.x;
if (bits == 0) {
@@ -611,13 +611,13 @@ dst.z = src2.x;
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
src4_size, const_expr):
- opcode(name, output_size, tunsigned,
+ opcode(name, output_size, tuint,
[src1_size, src2_size, src3_size, src4_size],
- [tunsigned, tunsigned, tunsigned, tunsigned],
+ [tuint, tuint, tuint, tuint],
"", const_expr)
-opcode("bitfield_insert", 0, tunsigned, [0, 0, 1, 1],
- [tunsigned, tunsigned, tint, tint], "", """
+opcode("bitfield_insert", 0, tuint, [0, 0, 1, 1],
+ [tuint, tuint, tint, tint], "", """
unsigned base = src0, insert = src1;
int offset = src2.x, bits = src3.x;
if (bits == 0) {
diff --git a/src/glsl/nir/nir_opt_copy_propagate.c b/src/glsl/nir/nir_opt_copy_propagate.c
index 7d8bdd7f2ca..cfc8e331128 100644
--- a/src/glsl/nir/nir_opt_copy_propagate.c
+++ b/src/glsl/nir/nir_opt_copy_propagate.c
@@ -55,10 +55,15 @@ static bool is_move(nir_alu_instr *instr)
static bool is_vec(nir_alu_instr *instr)
{
- for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
+ for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
if (!instr->src[i].src.is_ssa)
return false;
+ /* we handle modifiers in a separate pass */
+ if (instr->src[i].abs || instr->src[i].negate)
+ return false;
+ }
+
return instr->op == nir_op_vec2 ||
instr->op == nir_op_vec3 ||
instr->op == nir_op_vec4;
diff --git a/src/glsl/nir/nir_print.c b/src/glsl/nir/nir_print.c
index 2db209d434d..76bfc47c2a0 100644
--- a/src/glsl/nir/nir_print.c
+++ b/src/glsl/nir/nir_print.c
@@ -512,7 +512,9 @@ print_tex_instr(nir_tex_instr *instr, print_state *state)
case nir_texop_texture_samples:
fprintf(fp, "texture_samples ");
break;
-
+ case nir_texop_samples_identical:
+ fprintf(fp, "samples_identical ");
+ break;
default:
unreachable("Invalid texture operation");
break;
@@ -985,6 +987,16 @@ nir_print_shader(nir_shader *shader, FILE *fp)
fprintf(fp, "shader: %s\n", gl_shader_stage_name(shader->stage));
+ if (shader->info.name)
+ fprintf(fp, "name: %s\n", shader->info.name);
+
+ if (shader->info.label)
+ fprintf(fp, "label: %s\n", shader->info.label);
+
+ fprintf(fp, "inputs: %u\n", shader->num_inputs);
+ fprintf(fp, "outputs: %u\n", shader->num_outputs);
+ fprintf(fp, "uniforms: %u\n", shader->num_uniforms);
+
nir_foreach_variable(var, &shader->uniforms) {
print_var_decl(var, &state);
}
diff --git a/src/glsl/nir/nir_search.c b/src/glsl/nir/nir_search.c
index bb154407914..56d7e8162f3 100644
--- a/src/glsl/nir/nir_search.c
+++ b/src/glsl/nir/nir_search.c
@@ -166,7 +166,7 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src,
}
return true;
case nir_type_int:
- case nir_type_unsigned:
+ case nir_type_uint:
case nir_type_bool:
for (unsigned i = 0; i < num_components; ++i) {
if (load->value.i[new_swizzle[i]] != const_val->data.i)
@@ -310,7 +310,7 @@ construct_value(const nir_search_value *value, nir_alu_type type,
load->def.name = ralloc_asprintf(mem_ctx, "%d", c->data.i);
load->value.i[0] = c->data.i;
break;
- case nir_type_unsigned:
+ case nir_type_uint:
case nir_type_bool:
load->value.u[0] = c->data.u;
break;
diff --git a/src/glsl/nir/nir_validate.c b/src/glsl/nir/nir_validate.c
index ed374b921fa..06879d64ee2 100644
--- a/src/glsl/nir/nir_validate.c
+++ b/src/glsl/nir/nir_validate.c
@@ -290,11 +290,11 @@ validate_alu_instr(nir_alu_instr *instr, validate_state *state)
{
assert(instr->op < nir_num_opcodes);
- validate_alu_dest(&instr->dest, state);
-
for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
validate_alu_src(instr, i, state);
}
+
+ validate_alu_dest(&instr->dest, state);
}
static void
@@ -375,6 +375,11 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
validate_src(&instr->src[i], state);
}
+ unsigned num_vars = nir_intrinsic_infos[instr->intrinsic].num_variables;
+ for (unsigned i = 0; i < num_vars; i++) {
+ validate_deref_var(instr, instr->variables[i], state);
+ }
+
if (nir_intrinsic_infos[instr->intrinsic].has_dest) {
unsigned components_written =
nir_intrinsic_infos[instr->intrinsic].dest_components;
@@ -392,11 +397,6 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
validate_dest(&instr->dest, state);
}
- unsigned num_vars = nir_intrinsic_infos[instr->intrinsic].num_variables;
- for (unsigned i = 0; i < num_vars; i++) {
- validate_deref_var(instr, instr->variables[i], state);
- }
-
switch (instr->intrinsic) {
case nir_intrinsic_load_var: {
const struct glsl_type *type =
@@ -434,8 +434,6 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
static void
validate_tex_instr(nir_tex_instr *instr, validate_state *state)
{
- validate_dest(&instr->dest, state);
-
bool src_type_seen[nir_num_tex_src_types];
for (unsigned i = 0; i < nir_num_tex_src_types; i++)
src_type_seen[i] = false;
@@ -448,6 +446,8 @@ validate_tex_instr(nir_tex_instr *instr, validate_state *state)
if (instr->sampler != NULL)
validate_deref_var(instr, instr->sampler, state);
+
+ validate_dest(&instr->dest, state);
}
static void
diff --git a/src/glsl/nir/spirv_to_nir.c b/src/glsl/nir/spirv_to_nir.c
index 70610ca0f66..86282d25e0a 100644
--- a/src/glsl/nir/spirv_to_nir.c
+++ b/src/glsl/nir/spirv_to_nir.c
@@ -2026,7 +2026,7 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode,
switch (glsl_get_sampler_result_type(sampler_type)) {
case GLSL_TYPE_FLOAT: instr->dest_type = nir_type_float; break;
case GLSL_TYPE_INT: instr->dest_type = nir_type_int; break;
- case GLSL_TYPE_UINT: instr->dest_type = nir_type_unsigned; break;
+ case GLSL_TYPE_UINT: instr->dest_type = nir_type_uint; break;
case GLSL_TYPE_BOOL: instr->dest_type = nir_type_bool; break;
default:
unreachable("Invalid base type for sampler result");
diff --git a/src/glsl/opt_tree_grafting.cpp b/src/glsl/opt_tree_grafting.cpp
index e38a0e93058..cd58213c019 100644
--- a/src/glsl/opt_tree_grafting.cpp
+++ b/src/glsl/opt_tree_grafting.cpp
@@ -275,6 +275,7 @@ ir_tree_grafting_visitor::visit_enter(ir_texture *ir)
case ir_lod:
case ir_query_levels:
case ir_texture_samples:
+ case ir_samples_identical:
break;
case ir_txb:
if (do_graft(&ir->lod_info.bias))
diff --git a/src/glx/Makefile.am b/src/glx/Makefile.am
index e64955e3b3e..00925455b07 100644
--- a/src/glx/Makefile.am
+++ b/src/glx/Makefile.am
@@ -133,6 +133,8 @@ if HAVE_DRI3
libglx_la_SOURCES += \
dri3_glx.c \
dri3_priv.h
+
+libglx_la_LIBADD += $(top_builddir)/src/loader/libloader_dri3_helper.la
endif
if HAVE_APPLEDRI
diff --git a/src/glx/dri3_glx.c b/src/glx/dri3_glx.c
index 96f13e6a07b..ee243126731 100644
--- a/src/glx/dri3_glx.c
+++ b/src/glx/dri3_glx.c
@@ -78,40 +78,109 @@
#include "loader.h"
#include "dri2.h"
-static const struct glx_context_vtable dri3_context_vtable;
+static struct dri3_drawable *
+loader_drawable_to_dri3_drawable(struct loader_dri3_drawable *draw) {
+ size_t offset = offsetof(struct dri3_drawable, loader_drawable);
+ return (struct dri3_drawable *)(((void*) draw) - offset);
+}
-static inline void
-dri3_fence_reset(xcb_connection_t *c, struct dri3_buffer *buffer)
+static int
+glx_dri3_get_swap_interval(struct loader_dri3_drawable *draw)
{
- xshmfence_reset(buffer->shm_fence);
+ struct dri3_drawable *priv = loader_drawable_to_dri3_drawable(draw);
+
+ return priv->swap_interval;
}
-static inline void
-dri3_fence_set(struct dri3_buffer *buffer)
+static int
+glx_dri3_clamp_swap_interval(struct loader_dri3_drawable *draw, int interval)
{
- xshmfence_trigger(buffer->shm_fence);
+ return interval;
}
-static inline void
-dri3_fence_trigger(xcb_connection_t *c, struct dri3_buffer *buffer)
+static void
+glx_dri3_set_swap_interval(struct loader_dri3_drawable *draw, int interval)
{
- xcb_sync_trigger_fence(c, buffer->sync_fence);
+ struct dri3_drawable *priv = loader_drawable_to_dri3_drawable(draw);
+
+ priv->swap_interval = interval;
}
-static inline void
-dri3_fence_await(xcb_connection_t *c, struct dri3_buffer *buffer)
+static void
+glx_dri3_set_drawable_size(struct loader_dri3_drawable *draw,
+ int width, int height)
{
- xcb_flush(c);
- xshmfence_await(buffer->shm_fence);
+ /* Nothing to do */
}
-static inline Bool
-dri3_fence_triggered(struct dri3_buffer *buffer)
+static bool
+glx_dri3_in_current_context(struct loader_dri3_drawable *draw)
{
- return xshmfence_query(buffer->shm_fence);
+ struct dri3_drawable *priv = loader_drawable_to_dri3_drawable(draw);
+ struct dri3_context *pcp = (struct dri3_context *) __glXGetCurrentContext();
+ struct dri3_screen *psc = (struct dri3_screen *) priv->base.psc;
+
+ return (&pcp->base != &dummyContext) && pcp->base.psc == &psc->base;
+}
+
+static __DRIcontext *
+glx_dri3_get_dri_context(struct loader_dri3_drawable *draw)
+{
+ struct glx_context *gc = __glXGetCurrentContext();
+
+ if (gc) {
+ struct dri3_context *dri3Ctx = (struct dri3_context *) gc;
+ return dri3Ctx->driContext;
+ }
+
+ return NULL;
}
static void
+glx_dri3_flush_drawable(struct loader_dri3_drawable *draw, unsigned flags)
+{
+ loader_dri3_flush(draw, flags, __DRI2_THROTTLE_SWAPBUFFER);
+}
+
+static void
+glx_dri3_show_fps(struct loader_dri3_drawable *draw, uint64_t current_ust)
+{
+ struct dri3_drawable *priv = loader_drawable_to_dri3_drawable(draw);
+ const uint64_t interval =
+ ((struct dri3_screen *) priv->base.psc)->show_fps_interval;
+
+ if (!interval)
+ return;
+
+ priv->frames++;
+
+ /* DRI3+Present together uses microseconds for UST. */
+ if (priv->previous_ust + interval * 1000000 <= current_ust) {
+ if (priv->previous_ust) {
+ fprintf(stderr, "libGL: FPS = %.1f\n",
+ ((uint64_t) priv->frames * 1000000) /
+ (double)(current_ust - priv->previous_ust));
+ }
+ priv->frames = 0;
+ priv->previous_ust = current_ust;
+ }
+}
+
+static struct loader_dri3_vtable glx_dri3_vtable = {
+ .get_swap_interval = glx_dri3_get_swap_interval,
+ .clamp_swap_interval = glx_dri3_clamp_swap_interval,
+ .set_swap_interval = glx_dri3_set_swap_interval,
+ .set_drawable_size = glx_dri3_set_drawable_size,
+ .in_current_context = glx_dri3_in_current_context,
+ .get_dri_context = glx_dri3_get_dri_context,
+ .flush_drawable = glx_dri3_flush_drawable,
+ .show_fps = glx_dri3_show_fps,
+};
+
+
+static const struct glx_context_vtable dri3_context_vtable;
+
+static void
dri3_destroy_context(struct glx_context *context)
{
struct dri3_context *pcp = (struct dri3_context *) context;
@@ -143,7 +212,8 @@ dri3_bind_context(struct glx_context *context, struct glx_context *old,
return GLXBadDrawable;
if (!(*psc->core->bindContext) (pcp->driContext,
- pdraw->driDrawable, pread->driDrawable))
+ pdraw->loader_drawable.dri_drawable,
+ pread->loader_drawable.dri_drawable))
return GLXBadContext;
return Success;
@@ -265,38 +335,12 @@ dri3_create_context(struct glx_screen *base,
}
static void
-dri3_free_render_buffer(struct dri3_drawable *pdraw, struct dri3_buffer *buffer);
-
-static void
-dri3_update_num_back(struct dri3_drawable *priv)
-{
- priv->num_back = 1;
- if (priv->flipping) {
- if (!priv->is_pixmap && !(priv->present_capabilities & XCB_PRESENT_CAPABILITY_ASYNC))
- priv->num_back++;
- priv->num_back++;
- }
- if (priv->swap_interval == 0)
- priv->num_back++;
-}
-
-static void
dri3_destroy_drawable(__GLXDRIdrawable *base)
{
- struct dri3_screen *psc = (struct dri3_screen *) base->psc;
struct dri3_drawable *pdraw = (struct dri3_drawable *) base;
- xcb_connection_t *c = XGetXCBConnection(pdraw->base.psc->dpy);
- int i;
-
- (*psc->core->destroyDrawable) (pdraw->driDrawable);
- for (i = 0; i < DRI3_NUM_BUFFERS; i++) {
- if (pdraw->buffers[i])
- dri3_free_render_buffer(pdraw, pdraw->buffers[i]);
- }
+ loader_dri3_drawable_fini(&pdraw->loader_drawable);
- if (pdraw->special_event)
- xcb_unregister_for_special_event(c, pdraw->special_event);
free(pdraw);
}
@@ -307,7 +351,6 @@ dri3_create_drawable(struct glx_screen *base, XID xDrawable,
struct dri3_drawable *pdraw;
struct dri3_screen *psc = (struct dri3_screen *) base;
__GLXDRIconfigPrivate *config = (__GLXDRIconfigPrivate *) config_base;
- GLint vblank_mode = DRI_CONF_VBLANK_DEF_INTERVAL_1;
pdraw = calloc(1, sizeof(*pdraw));
if (!pdraw)
@@ -317,158 +360,21 @@ dri3_create_drawable(struct glx_screen *base, XID xDrawable,
pdraw->base.xDrawable = xDrawable;
pdraw->base.drawable = drawable;
pdraw->base.psc = &psc->base;
- pdraw->swap_interval = 1; /* default may be overridden below */
- pdraw->have_back = 0;
- pdraw->have_fake_front = 0;
-
- if (psc->config)
- psc->config->configQueryi(psc->driScreen,
- "vblank_mode", &vblank_mode);
-
- switch (vblank_mode) {
- case DRI_CONF_VBLANK_NEVER:
- case DRI_CONF_VBLANK_DEF_INTERVAL_0:
- pdraw->swap_interval = 0;
- break;
- case DRI_CONF_VBLANK_DEF_INTERVAL_1:
- case DRI_CONF_VBLANK_ALWAYS_SYNC:
- default:
- pdraw->swap_interval = 1;
- break;
- }
-
- dri3_update_num_back(pdraw);
(void) __glXInitialize(psc->base.dpy);
- /* Create a new drawable */
- pdraw->driDrawable =
- (*psc->image_driver->createNewDrawable) (psc->driScreen,
- config->driConfig, pdraw);
-
- if (!pdraw->driDrawable) {
+ if (loader_dri3_drawable_init(XGetXCBConnection(base->dpy),
+ xDrawable, psc->driScreen,
+ psc->is_different_gpu, config->driConfig,
+ &psc->loader_dri3_ext, &glx_dri3_vtable,
+ &pdraw->loader_drawable)) {
free(pdraw);
return NULL;
}
- /*
- * Make sure server has the same swap interval we do for the new
- * drawable.
- */
- if (psc->vtable.setSwapInterval)
- psc->vtable.setSwapInterval(&pdraw->base, pdraw->swap_interval);
-
return &pdraw->base;
}
-static void
-show_fps(struct dri3_drawable *draw, uint64_t current_ust)
-{
- const uint64_t interval =
- ((struct dri3_screen *) draw->base.psc)->show_fps_interval;
-
- draw->frames++;
-
- /* DRI3+Present together uses microseconds for UST. */
- if (draw->previous_ust + interval * 1000000 <= current_ust) {
- if (draw->previous_ust) {
- fprintf(stderr, "libGL: FPS = %.1f\n",
- ((uint64_t) draw->frames * 1000000) /
- (double)(current_ust - draw->previous_ust));
- }
- draw->frames = 0;
- draw->previous_ust = current_ust;
- }
-}
-
-/*
- * Process one Present event
- */
-static void
-dri3_handle_present_event(struct dri3_drawable *priv, xcb_present_generic_event_t *ge)
-{
- struct dri3_screen *psc = (struct dri3_screen *) priv->base.psc;
-
- switch (ge->evtype) {
- case XCB_PRESENT_CONFIGURE_NOTIFY: {
- xcb_present_configure_notify_event_t *ce = (void *) ge;
-
- priv->width = ce->width;
- priv->height = ce->height;
- break;
- }
- case XCB_PRESENT_COMPLETE_NOTIFY: {
- xcb_present_complete_notify_event_t *ce = (void *) ge;
-
- /* Compute the processed SBC number from the received 32-bit serial number merged
- * with the upper 32-bits of the sent 64-bit serial number while checking for
- * wrap
- */
- if (ce->kind == XCB_PRESENT_COMPLETE_KIND_PIXMAP) {
- priv->recv_sbc = (priv->send_sbc & 0xffffffff00000000LL) | ce->serial;
- if (priv->recv_sbc > priv->send_sbc)
- priv->recv_sbc -= 0x100000000;
- switch (ce->mode) {
- case XCB_PRESENT_COMPLETE_MODE_FLIP:
- priv->flipping = true;
- break;
- case XCB_PRESENT_COMPLETE_MODE_COPY:
- priv->flipping = false;
- break;
- }
- dri3_update_num_back(priv);
-
- if (psc->show_fps_interval)
- show_fps(priv, ce->ust);
-
- priv->ust = ce->ust;
- priv->msc = ce->msc;
- } else {
- priv->recv_msc_serial = ce->serial;
- priv->notify_ust = ce->ust;
- priv->notify_msc = ce->msc;
- }
- break;
- }
- case XCB_PRESENT_EVENT_IDLE_NOTIFY: {
- xcb_present_idle_notify_event_t *ie = (void *) ge;
- int b;
-
- for (b = 0; b < sizeof (priv->buffers) / sizeof (priv->buffers[0]); b++) {
- struct dri3_buffer *buf = priv->buffers[b];
-
- if (buf && buf->pixmap == ie->pixmap) {
- buf->busy = 0;
- if (priv->num_back <= b && b < DRI3_MAX_BACK) {
- dri3_free_render_buffer(priv, buf);
- priv->buffers[b] = NULL;
- }
- break;
- }
- }
- break;
- }
- }
- free(ge);
-}
-
-static bool
-dri3_wait_for_event(__GLXDRIdrawable *pdraw)
-{
- xcb_connection_t *c = XGetXCBConnection(pdraw->psc->dpy);
- struct dri3_drawable *priv = (struct dri3_drawable *) pdraw;
- xcb_generic_event_t *ev;
- xcb_present_generic_event_t *ge;
-
- xcb_flush(c);
- ev = xcb_wait_for_special_event(c, priv->special_event);
- if (!ev)
- return false;
- ge = (void *) ev;
- dri3_handle_present_event(priv, ge);
- return true;
-}
-
/** dri3_wait_for_msc
*
* Get the X server to send an event when the target msc/divisor/remainder is
@@ -478,32 +384,10 @@ static int
dri3_wait_for_msc(__GLXDRIdrawable *pdraw, int64_t target_msc, int64_t divisor,
int64_t remainder, int64_t *ust, int64_t *msc, int64_t *sbc)
{
- xcb_connection_t *c = XGetXCBConnection(pdraw->psc->dpy);
struct dri3_drawable *priv = (struct dri3_drawable *) pdraw;
- uint32_t msc_serial;
-
- /* Ask for the an event for the target MSC */
- msc_serial = ++priv->send_msc_serial;
- xcb_present_notify_msc(c,
- priv->base.xDrawable,
- msc_serial,
- target_msc,
- divisor,
- remainder);
-
- xcb_flush(c);
-
- /* Wait for the event */
- if (priv->special_event) {
- while ((int32_t) (msc_serial - priv->recv_msc_serial) > 0) {
- if (!dri3_wait_for_event(pdraw))
- return 0;
- }
- }
- *ust = priv->notify_ust;
- *msc = priv->notify_msc;
- *sbc = priv->recv_sbc;
+ loader_dri3_wait_for_msc(&priv->loader_drawable, target_msc, divisor,
+ remainder, ust, msc, sbc);
return 1;
}
@@ -532,101 +416,8 @@ dri3_wait_for_sbc(__GLXDRIdrawable *pdraw, int64_t target_sbc, int64_t *ust,
{
struct dri3_drawable *priv = (struct dri3_drawable *) pdraw;
- /* From the GLX_OML_sync_control spec:
- *
- * "If <target_sbc> = 0, the function will block until all previous
- * swaps requested with glXSwapBuffersMscOML for that window have
- * completed."
- */
- if (!target_sbc)
- target_sbc = priv->send_sbc;
-
- while (priv->recv_sbc < target_sbc) {
- if (!dri3_wait_for_event(pdraw))
- return 0;
- }
-
- *ust = priv->ust;
- *msc = priv->msc;
- *sbc = priv->recv_sbc;
- return 1;
-}
-
-/**
- * Asks the driver to flush any queued work necessary for serializing with the
- * X command stream, and optionally the slightly more strict requirement of
- * glFlush() equivalence (which would require flushing even if nothing had
- * been drawn to a window system framebuffer, for example).
- */
-static void
-dri3_flush(struct dri3_screen *psc,
- struct dri3_drawable *draw,
- unsigned flags,
- enum __DRI2throttleReason throttle_reason)
-{
- struct glx_context *gc = __glXGetCurrentContext();
-
- if (gc) {
- struct dri3_context *dri3Ctx = (struct dri3_context *)gc;
-
- (*psc->f->flush_with_flags)(dri3Ctx->driContext, draw->driDrawable, flags, throttle_reason);
- }
-}
-
-static xcb_gcontext_t
-dri3_drawable_gc(struct dri3_drawable *priv)
-{
- if (!priv->gc) {
- uint32_t v;
- xcb_connection_t *c = XGetXCBConnection(priv->base.psc->dpy);
-
- v = 0;
- xcb_create_gc(c,
- (priv->gc = xcb_generate_id(c)),
- priv->base.xDrawable,
- XCB_GC_GRAPHICS_EXPOSURES,
- &v);
- }
- return priv->gc;
-}
-
-static struct dri3_buffer *
-dri3_back_buffer(struct dri3_drawable *priv)
-{
- return priv->buffers[DRI3_BACK_ID(priv->cur_back)];
-}
-
-static struct dri3_buffer *
-dri3_fake_front_buffer(struct dri3_drawable *priv)
-{
- return priv->buffers[DRI3_FRONT_ID];
-}
-
-static void
-dri3_copy_area (xcb_connection_t *c /**< */,
- xcb_drawable_t src_drawable /**< */,
- xcb_drawable_t dst_drawable /**< */,
- xcb_gcontext_t gc /**< */,
- int16_t src_x /**< */,
- int16_t src_y /**< */,
- int16_t dst_x /**< */,
- int16_t dst_y /**< */,
- uint16_t width /**< */,
- uint16_t height /**< */)
-{
- xcb_void_cookie_t cookie;
-
- cookie = xcb_copy_area_checked(c,
- src_drawable,
- dst_drawable,
- gc,
- src_x,
- src_y,
- dst_x,
- dst_y,
- width,
- height);
- xcb_discard_reply(c, cookie.sequence);
+ return loader_dri3_wait_for_sbc(&priv->loader_drawable, target_sbc,
+ ust, msc, sbc);
}
static void
@@ -635,144 +426,27 @@ dri3_copy_sub_buffer(__GLXDRIdrawable *pdraw, int x, int y,
Bool flush)
{
struct dri3_drawable *priv = (struct dri3_drawable *) pdraw;
- struct dri3_screen *psc = (struct dri3_screen *) pdraw->psc;
- struct dri3_context *pcp = (struct dri3_context *) __glXGetCurrentContext();
- xcb_connection_t *c = XGetXCBConnection(priv->base.psc->dpy);
- struct dri3_buffer *back;
-
- unsigned flags = __DRI2_FLUSH_DRAWABLE;
-
- /* Check we have the right attachments */
- if (!priv->have_back || priv->is_pixmap)
- return;
-
- if (flush)
- flags |= __DRI2_FLUSH_CONTEXT;
- dri3_flush(psc, priv, flags, __DRI2_THROTTLE_SWAPBUFFER);
-
- back = dri3_back_buffer(priv);
- y = priv->height - y - height;
-
- if (psc->is_different_gpu && (&pcp->base != &dummyContext) && pcp->base.psc == &psc->base) {
- /* Update the linear buffer part of the back buffer
- * for the dri3_copy_area operation
- */
- psc->image->blitImage(pcp->driContext,
- back->linear_buffer,
- back->image,
- 0, 0, back->width,
- back->height,
- 0, 0, back->width,
- back->height, __BLIT_FLAG_FLUSH);
- /* We use blitImage to update our fake front,
- */
- if (priv->have_fake_front)
- psc->image->blitImage(pcp->driContext,
- dri3_fake_front_buffer(priv)->image,
- back->image,
- x, y, width, height,
- x, y, width, height, __BLIT_FLAG_FLUSH);
- }
-
- dri3_fence_reset(c, back);
- dri3_copy_area(c,
- dri3_back_buffer(priv)->pixmap,
- priv->base.xDrawable,
- dri3_drawable_gc(priv),
- x, y, x, y, width, height);
- dri3_fence_trigger(c, back);
- /* Refresh the fake front (if present) after we just damaged the real
- * front.
- */
- if (priv->have_fake_front && !psc->is_different_gpu) {
- dri3_fence_reset(c, dri3_fake_front_buffer(priv));
- dri3_copy_area(c,
- dri3_back_buffer(priv)->pixmap,
- dri3_fake_front_buffer(priv)->pixmap,
- dri3_drawable_gc(priv),
- x, y, x, y, width, height);
- dri3_fence_trigger(c, dri3_fake_front_buffer(priv));
- dri3_fence_await(c, dri3_fake_front_buffer(priv));
- }
- dri3_fence_await(c, back);
-}
-
-static void
-dri3_copy_drawable(struct dri3_drawable *priv, Drawable dest, Drawable src)
-{
- struct dri3_screen *psc = (struct dri3_screen *) priv->base.psc;
- xcb_connection_t *c = XGetXCBConnection(priv->base.psc->dpy);
- dri3_flush(psc, priv, __DRI2_FLUSH_DRAWABLE, 0);
-
- dri3_fence_reset(c, dri3_fake_front_buffer(priv));
- dri3_copy_area(c,
- src, dest,
- dri3_drawable_gc(priv),
- 0, 0, 0, 0, priv->width, priv->height);
- dri3_fence_trigger(c, dri3_fake_front_buffer(priv));
- dri3_fence_await(c, dri3_fake_front_buffer(priv));
+ loader_dri3_copy_sub_buffer(&priv->loader_drawable, x, y,
+ width, height, flush);
}
static void
dri3_wait_x(struct glx_context *gc)
{
- struct dri3_context *pcp = (struct dri3_context *) gc;
struct dri3_drawable *priv = (struct dri3_drawable *)
GetGLXDRIDrawable(gc->currentDpy, gc->currentDrawable);
- struct dri3_screen *psc;
- struct dri3_buffer *front;
- if (priv == NULL || !priv->have_fake_front)
- return;
-
- psc = (struct dri3_screen *) priv->base.psc;
- front = dri3_fake_front_buffer(priv);
-
- dri3_copy_drawable(priv, front->pixmap, priv->base.xDrawable);
-
- /* In the psc->is_different_gpu case, the linear buffer has been updated,
- * but not yet the tiled buffer.
- * Copy back to the tiled buffer we use for rendering.
- * Note that we don't need flushing.
- */
- if (psc->is_different_gpu && (&pcp->base != &dummyContext) && pcp->base.psc == &psc->base)
- psc->image->blitImage(pcp->driContext,
- front->image,
- front->linear_buffer,
- 0, 0, front->width,
- front->height,
- 0, 0, front->width,
- front->height, 0);
+ loader_dri3_wait_x(&priv->loader_drawable);
}
static void
dri3_wait_gl(struct glx_context *gc)
{
- struct dri3_context *pcp = (struct dri3_context *) gc;
struct dri3_drawable *priv = (struct dri3_drawable *)
GetGLXDRIDrawable(gc->currentDpy, gc->currentDrawable);
- struct dri3_screen *psc;
- struct dri3_buffer *front;
-
- if (priv == NULL || !priv->have_fake_front)
- return;
-
- psc = (struct dri3_screen *) priv->base.psc;
- front = dri3_fake_front_buffer(priv);
- /* In the psc->is_different_gpu case, we update the linear_buffer
- * before updating the real front.
- */
- if (psc->is_different_gpu && (&pcp->base != &dummyContext) && pcp->base.psc == &psc->base)
- psc->image->blitImage(pcp->driContext,
- front->linear_buffer,
- front->image,
- 0, 0, front->width,
- front->height,
- 0, 0, front->width,
- front->height, __BLIT_FLAG_FLUSH);
- dri3_copy_drawable(priv, priv->base.xDrawable, front->pixmap);
+ loader_dri3_wait_gl(&priv->loader_drawable);
}
/**
@@ -782,8 +456,8 @@ dri3_wait_gl(struct glx_context *gc)
static void
dri3_flush_front_buffer(__DRIdrawable *driDrawable, void *loaderPrivate)
{
- struct glx_context *gc;
- struct dri3_drawable *pdraw = loaderPrivate;
+ struct loader_dri3_drawable *draw = loaderPrivate;
+ struct dri3_drawable *pdraw = loader_drawable_to_dri3_drawable(draw);
struct dri3_screen *psc;
if (!pdraw)
@@ -796,699 +470,9 @@ dri3_flush_front_buffer(__DRIdrawable *driDrawable, void *loaderPrivate)
(void) __glXInitialize(psc->base.dpy);
- gc = __glXGetCurrentContext();
+ loader_dri3_flush(draw, __DRI2_FLUSH_DRAWABLE, __DRI2_THROTTLE_FLUSHFRONT);
- dri3_flush(psc, pdraw, __DRI2_FLUSH_DRAWABLE, __DRI2_THROTTLE_FLUSHFRONT);
-
- dri3_wait_gl(gc);
-}
-
-static uint32_t
-dri3_cpp_for_format(uint32_t format) {
- switch (format) {
- case __DRI_IMAGE_FORMAT_R8:
- return 1;
- case __DRI_IMAGE_FORMAT_RGB565:
- case __DRI_IMAGE_FORMAT_GR88:
- return 2;
- case __DRI_IMAGE_FORMAT_XRGB8888:
- case __DRI_IMAGE_FORMAT_ARGB8888:
- case __DRI_IMAGE_FORMAT_ABGR8888:
- case __DRI_IMAGE_FORMAT_XBGR8888:
- case __DRI_IMAGE_FORMAT_XRGB2101010:
- case __DRI_IMAGE_FORMAT_ARGB2101010:
- case __DRI_IMAGE_FORMAT_SARGB8:
- return 4;
- case __DRI_IMAGE_FORMAT_NONE:
- default:
- return 0;
- }
-}
-
-
-/** dri3_alloc_render_buffer
- *
- * Use the driver createImage function to construct a __DRIimage, then
- * get a file descriptor for that and create an X pixmap from that
- *
- * Allocate an xshmfence for synchronization
- */
-static struct dri3_buffer *
-dri3_alloc_render_buffer(struct glx_screen *glx_screen, Drawable draw,
- unsigned int format, int width, int height, int depth)
-{
- struct dri3_screen *psc = (struct dri3_screen *) glx_screen;
- Display *dpy = glx_screen->dpy;
- struct dri3_buffer *buffer;
- __DRIimage *pixmap_buffer;
- xcb_connection_t *c = XGetXCBConnection(dpy);
- xcb_pixmap_t pixmap;
- xcb_sync_fence_t sync_fence;
- struct xshmfence *shm_fence;
- int buffer_fd, fence_fd;
- int stride;
-
- /* Create an xshmfence object and
- * prepare to send that to the X server
- */
-
- fence_fd = xshmfence_alloc_shm();
- if (fence_fd < 0) {
- ErrorMessageF("DRI3 Fence object allocation failure %s\n", strerror(errno));
- return NULL;
- }
- shm_fence = xshmfence_map_shm(fence_fd);
- if (shm_fence == NULL) {
- ErrorMessageF("DRI3 Fence object map failure %s\n", strerror(errno));
- goto no_shm_fence;
- }
-
- /* Allocate the image from the driver
- */
- buffer = calloc(1, sizeof (struct dri3_buffer));
- if (!buffer)
- goto no_buffer;
-
- buffer->cpp = dri3_cpp_for_format(format);
- if (!buffer->cpp) {
- ErrorMessageF("DRI3 buffer format %d invalid\n", format);
- goto no_image;
- }
-
- if (!psc->is_different_gpu) {
- buffer->image = (*psc->image->createImage) (psc->driScreen,
- width, height,
- format,
- __DRI_IMAGE_USE_SHARE |
- __DRI_IMAGE_USE_SCANOUT,
- buffer);
- pixmap_buffer = buffer->image;
-
- if (!buffer->image) {
- ErrorMessageF("DRI3 gpu image creation failure\n");
- goto no_image;
- }
- } else {
- buffer->image = (*psc->image->createImage) (psc->driScreen,
- width, height,
- format,
- 0,
- buffer);
-
- if (!buffer->image) {
- ErrorMessageF("DRI3 other gpu image creation failure\n");
- goto no_image;
- }
-
- buffer->linear_buffer = (*psc->image->createImage) (psc->driScreen,
- width, height,
- format,
- __DRI_IMAGE_USE_SHARE |
- __DRI_IMAGE_USE_LINEAR,
- buffer);
- pixmap_buffer = buffer->linear_buffer;
-
- if (!buffer->linear_buffer) {
- ErrorMessageF("DRI3 gpu linear image creation failure\n");
- goto no_linear_buffer;
- }
- }
-
- /* X wants the stride, so ask the image for it
- */
- if (!(*psc->image->queryImage)(pixmap_buffer, __DRI_IMAGE_ATTRIB_STRIDE, &stride)) {
- ErrorMessageF("DRI3 get image stride failed\n");
- goto no_buffer_attrib;
- }
-
- buffer->pitch = stride;
-
- if (!(*psc->image->queryImage)(pixmap_buffer, __DRI_IMAGE_ATTRIB_FD, &buffer_fd)) {
- ErrorMessageF("DRI3 get image FD failed\n");
- goto no_buffer_attrib;
- }
-
- xcb_dri3_pixmap_from_buffer(c,
- (pixmap = xcb_generate_id(c)),
- draw,
- buffer->size,
- width, height, buffer->pitch,
- depth, buffer->cpp * 8,
- buffer_fd);
-
- xcb_dri3_fence_from_fd(c,
- pixmap,
- (sync_fence = xcb_generate_id(c)),
- false,
- fence_fd);
-
- buffer->pixmap = pixmap;
- buffer->own_pixmap = true;
- buffer->sync_fence = sync_fence;
- buffer->shm_fence = shm_fence;
- buffer->width = width;
- buffer->height = height;
-
- /* Mark the buffer as idle
- */
- dri3_fence_set(buffer);
-
- return buffer;
-
-no_buffer_attrib:
- (*psc->image->destroyImage)(pixmap_buffer);
-no_linear_buffer:
- if (psc->is_different_gpu)
- (*psc->image->destroyImage)(buffer->image);
-no_image:
- free(buffer);
-no_buffer:
- xshmfence_unmap_shm(shm_fence);
-no_shm_fence:
- close(fence_fd);
- ErrorMessageF("DRI3 alloc_render_buffer failed\n");
- return NULL;
-}
-
-/** dri3_free_render_buffer
- *
- * Free everything associated with one render buffer including pixmap, fence
- * stuff and the driver image
- */
-static void
-dri3_free_render_buffer(struct dri3_drawable *pdraw, struct dri3_buffer *buffer)
-{
- struct dri3_screen *psc = (struct dri3_screen *) pdraw->base.psc;
- xcb_connection_t *c = XGetXCBConnection(pdraw->base.psc->dpy);
-
- if (buffer->own_pixmap)
- xcb_free_pixmap(c, buffer->pixmap);
- xcb_sync_destroy_fence(c, buffer->sync_fence);
- xshmfence_unmap_shm(buffer->shm_fence);
- (*psc->image->destroyImage)(buffer->image);
- if (buffer->linear_buffer)
- (*psc->image->destroyImage)(buffer->linear_buffer);
- free(buffer);
-}
-
-
-/** dri3_flush_present_events
- *
- * Process any present events that have been received from the X server
- */
-static void
-dri3_flush_present_events(struct dri3_drawable *priv)
-{
- xcb_connection_t *c = XGetXCBConnection(priv->base.psc->dpy);
-
- /* Check to see if any configuration changes have occurred
- * since we were last invoked
- */
- if (priv->special_event) {
- xcb_generic_event_t *ev;
-
- while ((ev = xcb_poll_for_special_event(c, priv->special_event)) != NULL) {
- xcb_present_generic_event_t *ge = (void *) ev;
- dri3_handle_present_event(priv, ge);
- }
- }
-}
-
-/** dri3_update_drawable
- *
- * Called the first time we use the drawable and then
- * after we receive present configure notify events to
- * track the geometry of the drawable
- */
-static int
-dri3_update_drawable(__DRIdrawable *driDrawable, void *loaderPrivate)
-{
- struct dri3_drawable *priv = loaderPrivate;
- xcb_connection_t *c = XGetXCBConnection(priv->base.psc->dpy);
-
- /* First time through, go get the current drawable geometry
- */
- if (priv->width == 0 || priv->height == 0 || priv->depth == 0) {
- xcb_get_geometry_cookie_t geom_cookie;
- xcb_get_geometry_reply_t *geom_reply;
- xcb_void_cookie_t cookie;
- xcb_generic_error_t *error;
- xcb_present_query_capabilities_cookie_t present_capabilities_cookie;
- xcb_present_query_capabilities_reply_t *present_capabilities_reply;
-
-
- /* Try to select for input on the window.
- *
- * If the drawable is a window, this will get our events
- * delivered.
- *
- * Otherwise, we'll get a BadWindow error back from this request which
- * will let us know that the drawable is a pixmap instead.
- */
-
-
- cookie = xcb_present_select_input_checked(c,
- (priv->eid = xcb_generate_id(c)),
- priv->base.xDrawable,
- XCB_PRESENT_EVENT_MASK_CONFIGURE_NOTIFY|
- XCB_PRESENT_EVENT_MASK_COMPLETE_NOTIFY|
- XCB_PRESENT_EVENT_MASK_IDLE_NOTIFY);
-
- present_capabilities_cookie = xcb_present_query_capabilities(c, priv->base.xDrawable);
-
- /* Create an XCB event queue to hold present events outside of the usual
- * application event queue
- */
- priv->special_event = xcb_register_for_special_xge(c,
- &xcb_present_id,
- priv->eid,
- priv->stamp);
-
- geom_cookie = xcb_get_geometry(c, priv->base.xDrawable);
-
- geom_reply = xcb_get_geometry_reply(c, geom_cookie, NULL);
-
- if (!geom_reply)
- return false;
-
- priv->width = geom_reply->width;
- priv->height = geom_reply->height;
- priv->depth = geom_reply->depth;
- priv->is_pixmap = false;
-
- free(geom_reply);
-
- /* Check to see if our select input call failed. If it failed with a
- * BadWindow error, then assume the drawable is a pixmap. Destroy the
- * special event queue created above and mark the drawable as a pixmap
- */
-
- error = xcb_request_check(c, cookie);
-
- present_capabilities_reply = xcb_present_query_capabilities_reply(c,
- present_capabilities_cookie,
- NULL);
-
- if (present_capabilities_reply) {
- priv->present_capabilities = present_capabilities_reply->capabilities;
- free(present_capabilities_reply);
- } else
- priv->present_capabilities = 0;
-
- if (error) {
- if (error->error_code != BadWindow) {
- free(error);
- return false;
- }
- priv->is_pixmap = true;
- xcb_unregister_for_special_event(c, priv->special_event);
- priv->special_event = NULL;
- }
- }
- dri3_flush_present_events(priv);
- return true;
-}
-
-/* the DRIimage createImage function takes __DRI_IMAGE_FORMAT codes, while
- * the createImageFromFds call takes __DRI_IMAGE_FOURCC codes. To avoid
- * complete confusion, just deal in __DRI_IMAGE_FORMAT codes for now and
- * translate to __DRI_IMAGE_FOURCC codes in the call to createImageFromFds
- */
-static int
-image_format_to_fourcc(int format)
-{
-
- /* Convert from __DRI_IMAGE_FORMAT to __DRI_IMAGE_FOURCC (sigh) */
- switch (format) {
- case __DRI_IMAGE_FORMAT_SARGB8: return __DRI_IMAGE_FOURCC_SARGB8888;
- case __DRI_IMAGE_FORMAT_RGB565: return __DRI_IMAGE_FOURCC_RGB565;
- case __DRI_IMAGE_FORMAT_XRGB8888: return __DRI_IMAGE_FOURCC_XRGB8888;
- case __DRI_IMAGE_FORMAT_ARGB8888: return __DRI_IMAGE_FOURCC_ARGB8888;
- case __DRI_IMAGE_FORMAT_ABGR8888: return __DRI_IMAGE_FOURCC_ABGR8888;
- case __DRI_IMAGE_FORMAT_XBGR8888: return __DRI_IMAGE_FOURCC_XBGR8888;
- }
- return 0;
-}
-
-/** dri3_get_pixmap_buffer
- *
- * Get the DRM object for a pixmap from the X server and
- * wrap that with a __DRIimage structure using createImageFromFds
- */
-static struct dri3_buffer *
-dri3_get_pixmap_buffer(__DRIdrawable *driDrawable,
- unsigned int format,
- enum dri3_buffer_type buffer_type,
- void *loaderPrivate)
-{
- struct dri3_drawable *pdraw = loaderPrivate;
- int buf_id = dri3_pixmap_buf_id(buffer_type);
- struct dri3_buffer *buffer = pdraw->buffers[buf_id];
- Pixmap pixmap;
- xcb_dri3_buffer_from_pixmap_cookie_t bp_cookie;
- xcb_dri3_buffer_from_pixmap_reply_t *bp_reply;
- int *fds;
- Display *dpy;
- struct dri3_screen *psc;
- xcb_connection_t *c;
- xcb_sync_fence_t sync_fence;
- struct xshmfence *shm_fence;
- int fence_fd;
- __DRIimage *image_planar;
- int stride, offset;
-
- if (buffer)
- return buffer;
-
- pixmap = pdraw->base.xDrawable;
- psc = (struct dri3_screen *) pdraw->base.psc;
- dpy = psc->base.dpy;
- c = XGetXCBConnection(dpy);
-
- buffer = calloc(1, sizeof (struct dri3_buffer));
- if (!buffer)
- goto no_buffer;
-
- fence_fd = xshmfence_alloc_shm();
- if (fence_fd < 0)
- goto no_fence;
- shm_fence = xshmfence_map_shm(fence_fd);
- if (shm_fence == NULL) {
- close (fence_fd);
- goto no_fence;
- }
-
- xcb_dri3_fence_from_fd(c,
- pixmap,
- (sync_fence = xcb_generate_id(c)),
- false,
- fence_fd);
-
- /* Get an FD for the pixmap object
- */
- bp_cookie = xcb_dri3_buffer_from_pixmap(c, pixmap);
- bp_reply = xcb_dri3_buffer_from_pixmap_reply(c, bp_cookie, NULL);
- if (!bp_reply)
- goto no_image;
- fds = xcb_dri3_buffer_from_pixmap_reply_fds(c, bp_reply);
-
- stride = bp_reply->stride;
- offset = 0;
-
- /* createImageFromFds creates a wrapper __DRIimage structure which
- * can deal with multiple planes for things like Yuv images. So, once
- * we've gotten the planar wrapper, pull the single plane out of it and
- * discard the wrapper.
- */
- image_planar = (*psc->image->createImageFromFds) (psc->driScreen,
- bp_reply->width,
- bp_reply->height,
- image_format_to_fourcc(format),
- fds, 1,
- &stride, &offset, buffer);
- close(fds[0]);
- if (!image_planar)
- goto no_image;
-
- buffer->image = (*psc->image->fromPlanar)(image_planar, 0, buffer);
-
- (*psc->image->destroyImage)(image_planar);
-
- if (!buffer->image)
- goto no_image;
-
- buffer->pixmap = pixmap;
- buffer->own_pixmap = false;
- buffer->width = bp_reply->width;
- buffer->height = bp_reply->height;
- buffer->buffer_type = buffer_type;
- buffer->shm_fence = shm_fence;
- buffer->sync_fence = sync_fence;
-
- pdraw->buffers[buf_id] = buffer;
- return buffer;
-
-no_image:
- xcb_sync_destroy_fence(c, sync_fence);
- xshmfence_unmap_shm(shm_fence);
-no_fence:
- free(buffer);
-no_buffer:
- return NULL;
-}
-
-/** dri3_find_back
- *
- * Find an idle back buffer. If there isn't one, then
- * wait for a present idle notify event from the X server
- */
-static int
-dri3_find_back(xcb_connection_t *c, struct dri3_drawable *priv)
-{
- int b;
- xcb_generic_event_t *ev;
- xcb_present_generic_event_t *ge;
-
- for (;;) {
- for (b = 0; b < priv->num_back; b++) {
- int id = DRI3_BACK_ID((b + priv->cur_back) % priv->num_back);
- struct dri3_buffer *buffer = priv->buffers[id];
-
- if (!buffer || !buffer->busy) {
- priv->cur_back = id;
- return id;
- }
- }
- xcb_flush(c);
- ev = xcb_wait_for_special_event(c, priv->special_event);
- if (!ev)
- return -1;
- ge = (void *) ev;
- dri3_handle_present_event(priv, ge);
- }
-}
-
-/** dri3_get_buffer
- *
- * Find a front or back buffer, allocating new ones as necessary
- */
-static struct dri3_buffer *
-dri3_get_buffer(__DRIdrawable *driDrawable,
- unsigned int format,
- enum dri3_buffer_type buffer_type,
- void *loaderPrivate)
-{
- struct dri3_context *pcp = (struct dri3_context *) __glXGetCurrentContext();
- struct dri3_drawable *priv = loaderPrivate;
- struct dri3_screen *psc = (struct dri3_screen *) priv->base.psc;
- xcb_connection_t *c = XGetXCBConnection(priv->base.psc->dpy);
- struct dri3_buffer *buffer;
- int buf_id;
-
- if (buffer_type == dri3_buffer_back) {
- buf_id = dri3_find_back(c, priv);
-
- if (buf_id < 0)
- return NULL;
- } else {
- buf_id = DRI3_FRONT_ID;
- }
-
- buffer = priv->buffers[buf_id];
-
- /* Allocate a new buffer if there isn't an old one, or if that
- * old one is the wrong size
- */
- if (!buffer || buffer->width != priv->width || buffer->height != priv->height) {
- struct dri3_buffer *new_buffer;
-
- /* Allocate the new buffers
- */
- new_buffer = dri3_alloc_render_buffer(priv->base.psc,
- priv->base.xDrawable,
- format, priv->width, priv->height, priv->depth);
- if (!new_buffer)
- return NULL;
-
- /* When resizing, copy the contents of the old buffer, waiting for that
- * copy to complete using our fences before proceeding
- */
- switch (buffer_type) {
- case dri3_buffer_back:
- if (buffer) {
- if (!buffer->linear_buffer) {
- dri3_fence_reset(c, new_buffer);
- dri3_fence_await(c, buffer);
- dri3_copy_area(c,
- buffer->pixmap,
- new_buffer->pixmap,
- dri3_drawable_gc(priv),
- 0, 0, 0, 0, priv->width, priv->height);
- dri3_fence_trigger(c, new_buffer);
- } else if ((&pcp->base != &dummyContext) && pcp->base.psc == &psc->base) {
- psc->image->blitImage(pcp->driContext,
- new_buffer->image,
- buffer->image,
- 0, 0, priv->width,
- priv->height,
- 0, 0, priv->width,
- priv->height, 0);
- }
- dri3_free_render_buffer(priv, buffer);
- }
- break;
- case dri3_buffer_front:
- dri3_fence_reset(c, new_buffer);
- dri3_copy_area(c,
- priv->base.xDrawable,
- new_buffer->pixmap,
- dri3_drawable_gc(priv),
- 0, 0, 0, 0, priv->width, priv->height);
- dri3_fence_trigger(c, new_buffer);
-
- if (new_buffer->linear_buffer && (&pcp->base != &dummyContext) && pcp->base.psc == &psc->base) {
- dri3_fence_await(c, new_buffer);
- psc->image->blitImage(pcp->driContext,
- new_buffer->image,
- new_buffer->linear_buffer,
- 0, 0, priv->width,
- priv->height,
- 0, 0, priv->width,
- priv->height, 0);
- }
- break;
- }
- buffer = new_buffer;
- buffer->buffer_type = buffer_type;
- priv->buffers[buf_id] = buffer;
- }
- dri3_fence_await(c, buffer);
-
- /* Return the requested buffer */
- return buffer;
-}
-
-/** dri3_free_buffers
- *
- * Free the front bufffer or all of the back buffers. Used
- * when the application changes which buffers it needs
- */
-static void
-dri3_free_buffers(__DRIdrawable *driDrawable,
- enum dri3_buffer_type buffer_type,
- void *loaderPrivate)
-{
- struct dri3_drawable *priv = loaderPrivate;
- struct dri3_buffer *buffer;
- int first_id;
- int n_id;
- int buf_id;
-
- switch (buffer_type) {
- case dri3_buffer_back:
- first_id = DRI3_BACK_ID(0);
- n_id = DRI3_MAX_BACK;
- break;
- case dri3_buffer_front:
- first_id = DRI3_FRONT_ID;
- n_id = 1;
- }
-
- for (buf_id = first_id; buf_id < first_id + n_id; buf_id++) {
- buffer = priv->buffers[buf_id];
- if (buffer) {
- dri3_free_render_buffer(priv, buffer);
- priv->buffers[buf_id] = NULL;
- }
- }
-}
-
-/** dri3_get_buffers
- *
- * The published buffer allocation API.
- * Returns all of the necessary buffers, allocating
- * as needed.
- */
-static int
-dri3_get_buffers(__DRIdrawable *driDrawable,
- unsigned int format,
- uint32_t *stamp,
- void *loaderPrivate,
- uint32_t buffer_mask,
- struct __DRIimageList *buffers)
-{
- struct dri3_drawable *priv = loaderPrivate;
- struct dri3_screen *psc = (struct dri3_screen *) priv->base.psc;
- struct dri3_buffer *front, *back;
-
- buffers->image_mask = 0;
- buffers->front = NULL;
- buffers->back = NULL;
-
- front = NULL;
- back = NULL;
-
- if (!dri3_update_drawable(driDrawable, loaderPrivate))
- return false;
-
- /* pixmaps always have front buffers */
- if (priv->is_pixmap)
- buffer_mask |= __DRI_IMAGE_BUFFER_FRONT;
-
- if (buffer_mask & __DRI_IMAGE_BUFFER_FRONT) {
- /* All pixmaps are owned by the server gpu.
- * When we use a different gpu, we can't use the pixmap
- * as buffer since it is potentially tiled a way
- * our device can't understand. In this case, use
- * a fake front buffer. Hopefully the pixmap
- * content will get synced with the fake front
- * buffer.
- */
- if (priv->is_pixmap && !psc->is_different_gpu)
- front = dri3_get_pixmap_buffer(driDrawable,
- format,
- dri3_buffer_front,
- loaderPrivate);
- else
- front = dri3_get_buffer(driDrawable,
- format,
- dri3_buffer_front,
- loaderPrivate);
-
- if (!front)
- return false;
- } else {
- dri3_free_buffers(driDrawable, dri3_buffer_front, loaderPrivate);
- priv->have_fake_front = 0;
- }
-
- if (buffer_mask & __DRI_IMAGE_BUFFER_BACK) {
- back = dri3_get_buffer(driDrawable,
- format,
- dri3_buffer_back,
- loaderPrivate);
- if (!back)
- return false;
- priv->have_back = 1;
- } else {
- dri3_free_buffers(driDrawable, dri3_buffer_back, loaderPrivate);
- priv->have_back = 0;
- }
-
- if (front) {
- buffers->image_mask |= __DRI_IMAGE_BUFFER_FRONT;
- buffers->front = front->image;
- priv->have_fake_front = psc->is_different_gpu || !priv->is_pixmap;
- }
-
- if (back) {
- buffers->image_mask |= __DRI_IMAGE_BUFFER_BACK;
- buffers->back = back->image;
- }
-
- priv->stamp = stamp;
-
- return true;
+ loader_dri3_wait_gl(draw);
}
/* The image loader extension record for DRI3
@@ -1496,7 +480,7 @@ dri3_get_buffers(__DRIdrawable *driDrawable,
static const __DRIimageLoaderExtension imageLoaderExtension = {
.base = { __DRI_IMAGE_LOADER, 1 },
- .getBuffers = dri3_get_buffers,
+ .getBuffers = loader_dri3_get_buffers,
.flushFrontBuffer = dri3_flush_front_buffer,
};
@@ -1519,172 +503,25 @@ static int64_t
dri3_swap_buffers(__GLXDRIdrawable *pdraw, int64_t target_msc, int64_t divisor,
int64_t remainder, Bool flush)
{
- struct dri3_context *pcp = (struct dri3_context *) __glXGetCurrentContext();
struct dri3_drawable *priv = (struct dri3_drawable *) pdraw;
- struct dri3_screen *psc = (struct dri3_screen *) priv->base.psc;
- Display *dpy = priv->base.psc->dpy;
- xcb_connection_t *c = XGetXCBConnection(dpy);
- struct dri3_buffer *back;
- int64_t ret = 0;
- uint32_t options = XCB_PRESENT_OPTION_NONE;
-
unsigned flags = __DRI2_FLUSH_DRAWABLE;
+
if (flush)
flags |= __DRI2_FLUSH_CONTEXT;
- dri3_flush(psc, priv, flags, __DRI2_THROTTLE_SWAPBUFFER);
-
- back = priv->buffers[DRI3_BACK_ID(priv->cur_back)];
- if (psc->is_different_gpu && back) {
- /* Update the linear buffer before presenting the pixmap */
- psc->image->blitImage(pcp->driContext,
- back->linear_buffer,
- back->image,
- 0, 0, back->width,
- back->height,
- 0, 0, back->width,
- back->height, __BLIT_FLAG_FLUSH);
- /* Update the fake front */
- if (priv->have_fake_front)
- psc->image->blitImage(pcp->driContext,
- priv->buffers[DRI3_FRONT_ID]->image,
- back->image,
- 0, 0, priv->width,
- priv->height,
- 0, 0, priv->width,
- priv->height, __BLIT_FLAG_FLUSH);
- }
-
- dri3_flush_present_events(priv);
-
- if (back && !priv->is_pixmap) {
- dri3_fence_reset(c, back);
-
- /* Compute when we want the frame shown by taking the last known successful
- * MSC and adding in a swap interval for each outstanding swap request.
- * target_msc=divisor=remainder=0 means "Use glXSwapBuffers() semantic"
- */
- ++priv->send_sbc;
- if (target_msc == 0 && divisor == 0 && remainder == 0)
- target_msc = priv->msc + priv->swap_interval * (priv->send_sbc - priv->recv_sbc);
- else if (divisor == 0 && remainder > 0) {
- /* From the GLX_OML_sync_control spec:
- *
- * "If <divisor> = 0, the swap will occur when MSC becomes
- * greater than or equal to <target_msc>."
- *
- * Note that there's no mention of the remainder. The Present extension
- * throws BadValue for remainder != 0 with divisor == 0, so just drop
- * the passed in value.
- */
- remainder = 0;
- }
-
- /* From the GLX_EXT_swap_control spec:
- *
- * "If <interval> is set to a value of 0, buffer swaps are not
- * synchronized to a video frame."
- *
- * Implementation note: It is possible to enable triple buffering behaviour
- * by not using XCB_PRESENT_OPTION_ASYNC, but this should not be the default.
- */
- if (priv->swap_interval == 0)
- options |= XCB_PRESENT_OPTION_ASYNC;
-
- back->busy = 1;
- back->last_swap = priv->send_sbc;
- xcb_present_pixmap(c,
- priv->base.xDrawable,
- back->pixmap,
- (uint32_t) priv->send_sbc,
- 0, /* valid */
- 0, /* update */
- 0, /* x_off */
- 0, /* y_off */
- None, /* target_crtc */
- None,
- back->sync_fence,
- options,
- target_msc,
- divisor,
- remainder, 0, NULL);
- ret = (int64_t) priv->send_sbc;
-
- /* If there's a fake front, then copy the source back buffer
- * to the fake front to keep it up to date. This needs
- * to reset the fence and make future users block until
- * the X server is done copying the bits
- */
- if (priv->have_fake_front && !psc->is_different_gpu) {
- dri3_fence_reset(c, priv->buffers[DRI3_FRONT_ID]);
- dri3_copy_area(c,
- back->pixmap,
- priv->buffers[DRI3_FRONT_ID]->pixmap,
- dri3_drawable_gc(priv),
- 0, 0, 0, 0, priv->width, priv->height);
- dri3_fence_trigger(c, priv->buffers[DRI3_FRONT_ID]);
- }
- xcb_flush(c);
- if (priv->stamp)
- ++(*priv->stamp);
- }
- (*psc->f->invalidate)(priv->driDrawable);
-
- return ret;
+ return loader_dri3_swap_buffers_msc(&priv->loader_drawable,
+ target_msc, divisor, remainder,
+ flags, false);
}
static int
dri3_get_buffer_age(__GLXDRIdrawable *pdraw)
{
- xcb_connection_t *c = XGetXCBConnection(pdraw->psc->dpy);
- struct dri3_drawable *priv = (struct dri3_drawable *) pdraw;
- int back_id = DRI3_BACK_ID(dri3_find_back(c, priv));
-
- if (back_id < 0 || !priv->buffers[back_id])
- return 0;
-
- if (priv->buffers[back_id]->last_swap != 0)
- return priv->send_sbc - priv->buffers[back_id]->last_swap + 1;
- else
- return 0;
-}
-
-/** dri3_open
- *
- * Wrapper around xcb_dri3_open
- */
-static int
-dri3_open(Display *dpy,
- Window root,
- CARD32 provider)
-{
- xcb_dri3_open_cookie_t cookie;
- xcb_dri3_open_reply_t *reply;
- xcb_connection_t *c = XGetXCBConnection(dpy);
- int fd;
-
- cookie = xcb_dri3_open(c,
- root,
- provider);
-
- reply = xcb_dri3_open_reply(c, cookie, NULL);
- if (!reply)
- return -1;
-
- if (reply->nfd != 1) {
- free(reply);
- return -1;
- }
+ struct dri3_drawable *priv = (struct dri3_drawable *)pdraw;
- fd = xcb_dri3_open_reply_fds(c, reply)[0];
- fcntl(fd, F_SETFD, FD_CLOEXEC);
-
- free(reply);
-
- return fd;
+ return loader_dri3_query_buffer_age(&priv->loader_drawable);
}
-
/** dri3_destroy_screen
*/
static void
@@ -1727,8 +564,7 @@ dri3_set_swap_interval(__GLXDRIdrawable *pdraw, int interval)
break;
}
- priv->swap_interval = interval;
- dri3_update_num_back(priv);
+ loader_dri3_set_swap_interval(&priv->loader_drawable, interval);
return 0;
}
@@ -1759,14 +595,14 @@ dri3_bind_tex_image(Display * dpy,
if (pdraw != NULL) {
psc = (struct dri3_screen *) base->psc;
- (*psc->f->invalidate)(pdraw->driDrawable);
+ (*psc->f->invalidate)(pdraw->loader_drawable.dri_drawable);
XSync(dpy, false);
(*psc->texBuffer->setTexBuffer2) (pcp->driContext,
pdraw->base.textureTarget,
pdraw->base.textureFormat,
- pdraw->driDrawable);
+ pdraw->loader_drawable.dri_drawable);
}
}
@@ -1786,7 +622,7 @@ dri3_release_tex_image(Display * dpy, GLXDrawable drawable, int buffer)
psc->texBuffer->releaseTexBuffer != NULL)
(*psc->texBuffer->releaseTexBuffer) (pcp->driContext,
pdraw->base.textureTarget,
- pdraw->driDrawable);
+ pdraw->loader_drawable.dri_drawable);
}
}
@@ -1908,7 +744,7 @@ dri3_create_screen(int screen, struct glx_display * priv)
return NULL;
}
- psc->fd = dri3_open(priv->dpy, RootWindow(priv->dpy, screen), None);
+ psc->fd = loader_dri3_open(c, RootWindow(priv->dpy, screen), None);
if (psc->fd < 0) {
int conn_error = xcb_connection_has_error(c);
@@ -2000,6 +836,13 @@ dri3_create_screen(int screen, struct glx_display * priv)
goto handle_error;
}
+ psc->loader_dri3_ext.core = psc->core;
+ psc->loader_dri3_ext.image_driver = psc->image_driver;
+ psc->loader_dri3_ext.flush = psc->f;
+ psc->loader_dri3_ext.tex_buffer = psc->texBuffer;
+ psc->loader_dri3_ext.image = psc->image;
+ psc->loader_dri3_ext.config = psc->config;
+
configs = driConvertConfigs(psc->core, psc->base.configs, driver_configs);
visuals = driConvertConfigs(psc->core, psc->base.visuals, driver_configs);
diff --git a/src/glx/dri3_priv.h b/src/glx/dri3_priv.h
index 160444907e6..56a63309f36 100644
--- a/src/glx/dri3_priv.h
+++ b/src/glx/dri3_priv.h
@@ -59,50 +59,14 @@
#include <xcb/present.h>
#include <xcb/sync.h>
+#include "loader_dri3_helper.h"
+
/* From xmlpool/options.h, user exposed so should be stable */
#define DRI_CONF_VBLANK_NEVER 0
#define DRI_CONF_VBLANK_DEF_INTERVAL_0 1
#define DRI_CONF_VBLANK_DEF_INTERVAL_1 2
#define DRI_CONF_VBLANK_ALWAYS_SYNC 3
-enum dri3_buffer_type {
- dri3_buffer_back = 0,
- dri3_buffer_front = 1
-};
-
-struct dri3_buffer {
- __DRIimage *image;
- __DRIimage *linear_buffer;
- uint32_t pixmap;
-
- /* Synchronization between the client and X server is done using an
- * xshmfence that is mapped into an X server SyncFence. This lets the
- * client check whether the X server is done using a buffer with a simple
- * xshmfence call, rather than going to read X events from the wire.
- *
- * However, we can only wait for one xshmfence to be triggered at a time,
- * so we need to know *which* buffer is going to be idle next. We do that
- * by waiting for a PresentIdleNotify event. When that event arrives, the
- * 'busy' flag gets cleared and the client knows that the fence has been
- * triggered, and that the wait call will not block.
- */
-
- uint32_t sync_fence; /* XID of X SyncFence object */
- struct xshmfence *shm_fence; /* pointer to xshmfence object */
- GLboolean busy; /* Set on swap, cleared on IdleNotify */
- GLboolean own_pixmap; /* We allocated the pixmap ID, free on destroy */
- void *driverPrivate;
-
- uint32_t size;
- uint32_t pitch;
- uint32_t cpp;
- uint32_t flags;
- uint32_t width, height;
- uint64_t last_swap;
-
- enum dri3_buffer_type buffer_type;
-};
-
struct dri3_display
{
__GLXDRIdisplay base;
@@ -139,6 +103,8 @@ struct dri3_screen {
int is_different_gpu;
int show_fps_interval;
+
+ struct loader_dri3_extensions loader_dri3_ext;
};
struct dri3_context
@@ -147,60 +113,10 @@ struct dri3_context
__DRIcontext *driContext;
};
-#define DRI3_MAX_BACK 4
-#define DRI3_BACK_ID(i) (i)
-#define DRI3_FRONT_ID (DRI3_MAX_BACK)
-
-static inline int
-dri3_pixmap_buf_id(enum dri3_buffer_type buffer_type)
-{
- if (buffer_type == dri3_buffer_back)
- return DRI3_BACK_ID(0);
- else
- return DRI3_FRONT_ID;
-}
-
-#define DRI3_NUM_BUFFERS (1 + DRI3_MAX_BACK)
-
struct dri3_drawable {
__GLXDRIdrawable base;
- __DRIdrawable *driDrawable;
- int width, height, depth;
+ struct loader_dri3_drawable loader_drawable;
int swap_interval;
- uint8_t have_back;
- uint8_t have_fake_front;
- uint8_t is_pixmap;
- uint8_t flipping;
-
- /* Present extension capabilities
- */
- uint32_t present_capabilities;
-
- /* SBC numbers are tracked by using the serial numbers
- * in the present request and complete events
- */
- uint64_t send_sbc;
- uint64_t recv_sbc;
-
- /* Last received UST/MSC values for pixmap present complete */
- uint64_t ust, msc;
-
- /* Last received UST/MSC values from present notify msc event */
- uint64_t notify_ust, notify_msc;
-
- /* Serial numbers for tracking wait_for_msc events */
- uint32_t send_msc_serial;
- uint32_t recv_msc_serial;
-
- struct dri3_buffer *buffers[DRI3_NUM_BUFFERS];
- int cur_back;
- int num_back;
-
- uint32_t *stamp;
-
- xcb_present_event_t eid;
- xcb_gcontext_t gc;
- xcb_special_event_t *special_event;
/* LIBGL_SHOW_FPS support */
uint64_t previous_ust;
diff --git a/src/loader/Makefile.am b/src/loader/Makefile.am
index 5190f7f8a46..9ca17540d54 100644
--- a/src/loader/Makefile.am
+++ b/src/loader/Makefile.am
@@ -25,13 +25,16 @@ EXTRA_DIST = SConscript
noinst_LTLIBRARIES = libloader.la
-libloader_la_CPPFLAGS = \
+AM_CPPFLAGS = \
$(DEFINES) \
-I$(top_srcdir)/include \
-I$(top_srcdir)/src \
$(VISIBILITY_CFLAGS) \
+ $(XCB_DRI3_CFLAGS) \
+ $(LIBDRM_CFLAGS) \
$(LIBUDEV_CFLAGS)
+libloader_la_CPPFLAGS = $(AM_CPPFLAGS)
libloader_la_SOURCES = $(LOADER_C_FILES)
libloader_la_LIBADD =
@@ -49,9 +52,15 @@ libloader_la_CPPFLAGS += \
endif
if HAVE_LIBDRM
-libloader_la_CPPFLAGS += \
- $(LIBDRM_CFLAGS)
-
libloader_la_LIBADD += \
$(LIBDRM_LIBS)
endif
+
+if HAVE_DRI3
+noinst_LTLIBRARIES += libloader_dri3_helper.la
+
+libloader_dri3_helper_la_SOURCES = \
+ loader_dri3_helper.c \
+ loader_dri3_helper.h
+libloader_dri3_helper_la_LIBADD = $(XCB_DRI3_LIBS)
+endif
diff --git a/src/loader/loader_dri3_helper.c b/src/loader/loader_dri3_helper.c
new file mode 100644
index 00000000000..62bfe845c08
--- /dev/null
+++ b/src/loader/loader_dri3_helper.c
@@ -0,0 +1,1396 @@
+/*
+ * Copyright © 2013 Keith Packard
+ * Copyright © 2015 Boyan Ding
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. The copyright holders make no representations
+ * about the suitability of this software for any purpose. It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <X11/xshmfence.h>
+#include <xcb/xcb.h>
+#include <xcb/dri3.h>
+#include <xcb/present.h>
+
+#include <X11/Xlib-xcb.h>
+
+#include "loader_dri3_helper.h"
+
+/* From xmlpool/options.h, user exposed so should be stable */
+#define DRI_CONF_VBLANK_NEVER 0
+#define DRI_CONF_VBLANK_DEF_INTERVAL_0 1
+#define DRI_CONF_VBLANK_DEF_INTERVAL_1 2
+#define DRI_CONF_VBLANK_ALWAYS_SYNC 3
+
+static inline void
+dri3_fence_reset(xcb_connection_t *c, struct loader_dri3_buffer *buffer)
+{
+ xshmfence_reset(buffer->shm_fence);
+}
+
+static inline void
+dri3_fence_set(struct loader_dri3_buffer *buffer)
+{
+ xshmfence_trigger(buffer->shm_fence);
+}
+
+static inline void
+dri3_fence_trigger(xcb_connection_t *c, struct loader_dri3_buffer *buffer)
+{
+ xcb_sync_trigger_fence(c, buffer->sync_fence);
+}
+
+static inline void
+dri3_fence_await(xcb_connection_t *c, struct loader_dri3_buffer *buffer)
+{
+ xcb_flush(c);
+ xshmfence_await(buffer->shm_fence);
+}
+
+static void
+dri3_update_num_back(struct loader_dri3_drawable *draw)
+{
+ draw->num_back = 1;
+ if (draw->flipping) {
+ if (!draw->is_pixmap &&
+ !(draw->present_capabilities & XCB_PRESENT_CAPABILITY_ASYNC))
+ draw->num_back++;
+ draw->num_back++;
+ }
+ if (draw->vtable->get_swap_interval(draw) == 0)
+ draw->num_back++;
+}
+
+void
+loader_dri3_set_swap_interval(struct loader_dri3_drawable *draw, int interval)
+{
+ interval = draw->vtable->clamp_swap_interval(draw, interval);
+ draw->vtable->set_swap_interval(draw, interval);
+ dri3_update_num_back(draw);
+}
+
+/** dri3_free_render_buffer
+ *
+ * Free everything associated with one render buffer including pixmap, fence
+ * stuff and the driver image
+ */
+static void
+dri3_free_render_buffer(struct loader_dri3_drawable *draw,
+ struct loader_dri3_buffer *buffer)
+{
+ if (buffer->own_pixmap)
+ xcb_free_pixmap(draw->conn, buffer->pixmap);
+ xcb_sync_destroy_fence(draw->conn, buffer->sync_fence);
+ xshmfence_unmap_shm(buffer->shm_fence);
+ (draw->ext->image->destroyImage)(buffer->image);
+ if (buffer->linear_buffer)
+ (draw->ext->image->destroyImage)(buffer->linear_buffer);
+ free(buffer);
+}
+
+void
+loader_dri3_drawable_fini(struct loader_dri3_drawable *draw)
+{
+ int i;
+
+ (draw->ext->core->destroyDrawable)(draw->dri_drawable);
+
+ for (i = 0; i < LOADER_DRI3_NUM_BUFFERS; i++) {
+ if (draw->buffers[i])
+ dri3_free_render_buffer(draw, draw->buffers[i]);
+ }
+
+ if (draw->special_event)
+ xcb_unregister_for_special_event(draw->conn, draw->special_event);
+}
+
+int
+loader_dri3_drawable_init(xcb_connection_t *conn,
+ xcb_drawable_t drawable,
+ __DRIscreen *dri_screen,
+ bool is_different_gpu,
+ const __DRIconfig *dri_config,
+ struct loader_dri3_extensions *ext,
+ struct loader_dri3_vtable *vtable,
+ struct loader_dri3_drawable *draw)
+{
+ xcb_get_geometry_cookie_t cookie;
+ xcb_get_geometry_reply_t *reply;
+ xcb_generic_error_t *error;
+ GLint vblank_mode = DRI_CONF_VBLANK_DEF_INTERVAL_1;
+ int swap_interval;
+
+ draw->conn = conn;
+ draw->ext = ext;
+ draw->vtable = vtable;
+ draw->drawable = drawable;
+ draw->dri_screen = dri_screen;
+ draw->is_different_gpu = is_different_gpu;
+
+ draw->have_back = 0;
+ draw->have_fake_front = 0;
+ draw->first_init = true;
+
+ if (draw->ext->config)
+ draw->ext->config->configQueryi(draw->dri_screen,
+ "vblank_mode", &vblank_mode);
+
+ switch (vblank_mode) {
+ case DRI_CONF_VBLANK_NEVER:
+ case DRI_CONF_VBLANK_DEF_INTERVAL_0:
+ swap_interval = 0;
+ break;
+ case DRI_CONF_VBLANK_DEF_INTERVAL_1:
+ case DRI_CONF_VBLANK_ALWAYS_SYNC:
+ default:
+ swap_interval = 1;
+ break;
+ }
+ draw->vtable->set_swap_interval(draw, swap_interval);
+
+ dri3_update_num_back(draw);
+
+ /* Create a new drawable */
+ draw->dri_drawable =
+ (draw->ext->image_driver->createNewDrawable)(dri_screen,
+ dri_config,
+ draw);
+
+ if (!draw->dri_drawable)
+ return 1;
+
+ cookie = xcb_get_geometry(draw->conn, draw->drawable);
+ reply = xcb_get_geometry_reply(draw->conn, cookie, &error);
+ if (reply == NULL || error != NULL) {
+ draw->ext->core->destroyDrawable(draw->dri_drawable);
+ return 1;
+ }
+
+ draw->width = reply->width;
+ draw->height = reply->height;
+ draw->depth = reply->depth;
+ draw->vtable->set_drawable_size(draw, draw->width, draw->height);
+ free(reply);
+
+ /*
+ * Make sure server has the same swap interval we do for the new
+ * drawable.
+ */
+ loader_dri3_set_swap_interval(draw, swap_interval);
+
+ return 0;
+}
+
+/*
+ * Process one Present event
+ */
+static void
+dri3_handle_present_event(struct loader_dri3_drawable *draw,
+ xcb_present_generic_event_t *ge)
+{
+ switch (ge->evtype) {
+ case XCB_PRESENT_CONFIGURE_NOTIFY: {
+ xcb_present_configure_notify_event_t *ce = (void *) ge;
+
+ draw->width = ce->width;
+ draw->height = ce->height;
+ draw->vtable->set_drawable_size(draw, draw->width, draw->height);
+ break;
+ }
+ case XCB_PRESENT_COMPLETE_NOTIFY: {
+ xcb_present_complete_notify_event_t *ce = (void *) ge;
+
+ /* Compute the processed SBC number from the received 32-bit serial number
+ * merged with the upper 32-bits of the sent 64-bit serial number while
+ * checking for wrap.
+ */
+ if (ce->kind == XCB_PRESENT_COMPLETE_KIND_PIXMAP) {
+ draw->recv_sbc = (draw->send_sbc & 0xffffffff00000000LL) | ce->serial;
+ if (draw->recv_sbc > draw->send_sbc)
+ draw->recv_sbc -= 0x100000000;
+ switch (ce->mode) {
+ case XCB_PRESENT_COMPLETE_MODE_FLIP:
+ draw->flipping = true;
+ break;
+ case XCB_PRESENT_COMPLETE_MODE_COPY:
+ draw->flipping = false;
+ break;
+ }
+ dri3_update_num_back(draw);
+
+ if (draw->vtable->show_fps)
+ draw->vtable->show_fps(draw, ce->ust);
+
+ draw->ust = ce->ust;
+ draw->msc = ce->msc;
+ } else {
+ draw->recv_msc_serial = ce->serial;
+ draw->notify_ust = ce->ust;
+ draw->notify_msc = ce->msc;
+ }
+ break;
+ }
+ case XCB_PRESENT_EVENT_IDLE_NOTIFY: {
+ xcb_present_idle_notify_event_t *ie = (void *) ge;
+ int b;
+
+ for (b = 0; b < sizeof(draw->buffers) / sizeof(draw->buffers[0]); b++) {
+ struct loader_dri3_buffer *buf = draw->buffers[b];
+
+ if (buf && buf->pixmap == ie->pixmap) {
+ buf->busy = 0;
+ if (draw->num_back <= b && b < LOADER_DRI3_MAX_BACK) {
+ dri3_free_render_buffer(draw, buf);
+ draw->buffers[b] = NULL;
+ }
+ break;
+ }
+ }
+ break;
+ }
+ }
+ free(ge);
+}
+
+static bool
+dri3_wait_for_event(struct loader_dri3_drawable *draw)
+{
+ xcb_generic_event_t *ev;
+ xcb_present_generic_event_t *ge;
+
+ xcb_flush(draw->conn);
+ ev = xcb_wait_for_special_event(draw->conn, draw->special_event);
+ if (!ev)
+ return false;
+ ge = (void *) ev;
+ dri3_handle_present_event(draw, ge);
+ return true;
+}
+
+/** loader_dri3_wait_for_msc
+ *
+ * Get the X server to send an event when the target msc/divisor/remainder is
+ * reached.
+ */
+bool
+loader_dri3_wait_for_msc(struct loader_dri3_drawable *draw,
+ int64_t target_msc,
+ int64_t divisor, int64_t remainder,
+ int64_t *ust, int64_t *msc, int64_t *sbc)
+{
+ uint32_t msc_serial;
+
+ msc_serial = ++draw->send_msc_serial;
+ xcb_present_notify_msc(draw->conn,
+ draw->drawable,
+ msc_serial,
+ target_msc,
+ divisor,
+ remainder);
+
+ xcb_flush(draw->conn);
+
+ /* Wait for the event */
+ if (draw->special_event) {
+ while ((int32_t) (msc_serial - draw->recv_msc_serial) > 0) {
+ if (!dri3_wait_for_event(draw))
+ return false;
+ }
+ }
+
+ *ust = draw->notify_ust;
+ *msc = draw->notify_msc;
+ *sbc = draw->recv_sbc;
+
+ return true;
+}
+
+/** loader_dri3_wait_for_sbc
+ *
+ * Wait for the completed swap buffer count to reach the specified
+ * target. Presumably the application knows that this will be reached with
+ * outstanding complete events, or we're going to be here awhile.
+ */
+int
+loader_dri3_wait_for_sbc(struct loader_dri3_drawable *draw,
+ int64_t target_sbc, int64_t *ust,
+ int64_t *msc, int64_t *sbc)
+{
+ /* From the GLX_OML_sync_control spec:
+ *
+ * "If <target_sbc> = 0, the function will block until all previous
+ * swaps requested with glXSwapBuffersMscOML for that window have
+ * completed."
+ */
+ if (!target_sbc)
+ target_sbc = draw->send_sbc;
+
+ while (draw->recv_sbc < target_sbc) {
+ if (!dri3_wait_for_event(draw))
+ return 0;
+ }
+
+ *ust = draw->ust;
+ *msc = draw->msc;
+ *sbc = draw->recv_sbc;
+ return 1;
+}
+
+/** loader_dri3_find_back
+ *
+ * Find an idle back buffer. If there isn't one, then
+ * wait for a present idle notify event from the X server
+ */
+static int
+dri3_find_back(struct loader_dri3_drawable *draw)
+{
+ int b;
+ xcb_generic_event_t *ev;
+ xcb_present_generic_event_t *ge;
+
+ for (;;) {
+ for (b = 0; b < draw->num_back; b++) {
+ int id = LOADER_DRI3_BACK_ID((b + draw->cur_back) % draw->num_back);
+ struct loader_dri3_buffer *buffer = draw->buffers[id];
+
+ if (!buffer || !buffer->busy) {
+ draw->cur_back = id;
+ return id;
+ }
+ }
+ xcb_flush(draw->conn);
+ ev = xcb_wait_for_special_event(draw->conn, draw->special_event);
+ if (!ev)
+ return -1;
+ ge = (void *) ev;
+ dri3_handle_present_event(draw, ge);
+ }
+}
+
+static xcb_gcontext_t
+dri3_drawable_gc(struct loader_dri3_drawable *draw)
+{
+ if (!draw->gc) {
+ uint32_t v = 0;
+ xcb_create_gc(draw->conn,
+ (draw->gc = xcb_generate_id(draw->conn)),
+ draw->drawable,
+ XCB_GC_GRAPHICS_EXPOSURES,
+ &v);
+ }
+ return draw->gc;
+}
+
+
+static struct loader_dri3_buffer *
+dri3_back_buffer(struct loader_dri3_drawable *draw)
+{
+ return draw->buffers[LOADER_DRI3_BACK_ID(draw->cur_back)];
+}
+
+static struct loader_dri3_buffer *
+dri3_fake_front_buffer(struct loader_dri3_drawable *draw)
+{
+ return draw->buffers[LOADER_DRI3_FRONT_ID];
+}
+
+static void
+dri3_copy_area(xcb_connection_t *c,
+ xcb_drawable_t src_drawable,
+ xcb_drawable_t dst_drawable,
+ xcb_gcontext_t gc,
+ int16_t src_x,
+ int16_t src_y,
+ int16_t dst_x,
+ int16_t dst_y,
+ uint16_t width,
+ uint16_t height)
+{
+ xcb_void_cookie_t cookie;
+
+ cookie = xcb_copy_area_checked(c,
+ src_drawable,
+ dst_drawable,
+ gc,
+ src_x,
+ src_y,
+ dst_x,
+ dst_y,
+ width,
+ height);
+ xcb_discard_reply(c, cookie.sequence);
+}
+
+/**
+ * Asks the driver to flush any queued work necessary for serializing with the
+ * X command stream, and optionally the slightly more strict requirement of
+ * glFlush() equivalence (which would require flushing even if nothing had
+ * been drawn to a window system framebuffer, for example).
+ */
+void
+loader_dri3_flush(struct loader_dri3_drawable *draw,
+ unsigned flags,
+ enum __DRI2throttleReason throttle_reason)
+{
+ /* NEED TO CHECK WHETHER CONTEXT IS NULL */
+ __DRIcontext *dri_context = draw->vtable->get_dri_context(draw);
+
+ if (dri_context) {
+ draw->ext->flush->flush_with_flags(dri_context, draw->dri_drawable,
+ flags, throttle_reason);
+ }
+}
+
+void
+loader_dri3_copy_sub_buffer(struct loader_dri3_drawable *draw,
+ int x, int y,
+ int width, int height,
+ bool flush)
+{
+ struct loader_dri3_buffer *back;
+ unsigned flags = __DRI2_FLUSH_DRAWABLE;
+ __DRIcontext *dri_context;
+
+ dri_context = draw->vtable->get_dri_context(draw);
+
+ /* Check we have the right attachments */
+ if (!draw->have_back || draw->is_pixmap)
+ return;
+
+ if (flush)
+ flags |= __DRI2_FLUSH_CONTEXT;
+ loader_dri3_flush(draw, flags, __DRI2_THROTTLE_SWAPBUFFER);
+
+ back = dri3_back_buffer(draw);
+ y = draw->height - y - height;
+
+ if (draw->is_different_gpu && draw->vtable->in_current_context(draw)) {
+ /* Update the linear buffer part of the back buffer
+ * for the dri3_copy_area operation
+ */
+ draw->ext->image->blitImage(dri_context,
+ back->linear_buffer,
+ back->image,
+ 0, 0, back->width,
+ back->height,
+ 0, 0, back->width,
+ back->height, __BLIT_FLAG_FLUSH);
+ /* We use blitImage to update our fake front,
+ */
+ if (draw->have_fake_front)
+ draw->ext->image->blitImage(dri_context,
+ dri3_fake_front_buffer(draw)->image,
+ back->image,
+ x, y, width, height,
+ x, y, width, height, __BLIT_FLAG_FLUSH);
+ }
+
+ dri3_fence_reset(draw->conn, back);
+ dri3_copy_area(draw->conn,
+ dri3_back_buffer(draw)->pixmap,
+ draw->drawable,
+ dri3_drawable_gc(draw),
+ x, y, x, y, width, height);
+ dri3_fence_trigger(draw->conn, back);
+ /* Refresh the fake front (if present) after we just damaged the real
+ * front.
+ */
+ if (draw->have_fake_front && !draw->is_different_gpu) {
+ dri3_fence_reset(draw->conn, dri3_fake_front_buffer(draw));
+ dri3_copy_area(draw->conn,
+ dri3_back_buffer(draw)->pixmap,
+ dri3_fake_front_buffer(draw)->pixmap,
+ dri3_drawable_gc(draw),
+ x, y, x, y, width, height);
+ dri3_fence_trigger(draw->conn, dri3_fake_front_buffer(draw));
+ dri3_fence_await(draw->conn, dri3_fake_front_buffer(draw));
+ }
+ dri3_fence_await(draw->conn, back);
+}
+
+void
+loader_dri3_copy_drawable(struct loader_dri3_drawable *draw,
+ xcb_drawable_t dest,
+ xcb_drawable_t src)
+{
+ loader_dri3_flush(draw, __DRI2_FLUSH_DRAWABLE, 0);
+
+ dri3_fence_reset(draw->conn, dri3_fake_front_buffer(draw));
+ dri3_copy_area(draw->conn,
+ src, dest,
+ dri3_drawable_gc(draw),
+ 0, 0, 0, 0, draw->width, draw->height);
+ dri3_fence_trigger(draw->conn, dri3_fake_front_buffer(draw));
+ dri3_fence_await(draw->conn, dri3_fake_front_buffer(draw));
+}
+
+void
+loader_dri3_wait_x(struct loader_dri3_drawable *draw)
+{
+ struct loader_dri3_buffer *front;
+ __DRIcontext *dri_context;
+
+ if (draw == NULL || !draw->have_fake_front)
+ return;
+
+ front = dri3_fake_front_buffer(draw);
+ dri_context = draw->vtable->get_dri_context(draw);
+
+ loader_dri3_copy_drawable(draw, front->pixmap, draw->drawable);
+
+ /* In the psc->is_different_gpu case, the linear buffer has been updated,
+ * but not yet the tiled buffer.
+ * Copy back to the tiled buffer we use for rendering.
+ * Note that we don't need flushing.
+ */
+ if (draw->is_different_gpu && draw->vtable->in_current_context(draw))
+ draw->ext->image->blitImage(dri_context,
+ front->image,
+ front->linear_buffer,
+ 0, 0, front->width,
+ front->height,
+ 0, 0, front->width,
+ front->height, 0);
+}
+
+/** loader_dri3_wait_gl
+ *
+ * Propagate GL rendering done to the fake front buffer out to the real
+ * front (the X drawable). Inverse of loader_dri3_wait_x(). No-op when
+ * there is no fake front.
+ */
+void
+loader_dri3_wait_gl(struct loader_dri3_drawable *draw)
+{
+ struct loader_dri3_buffer *front;
+ __DRIcontext *dri_context;
+
+ if (draw == NULL || !draw->have_fake_front)
+ return;
+
+ front = dri3_fake_front_buffer(draw);
+ dri_context = draw->vtable->get_dri_context(draw);
+
+ /* In the psc->is_different_gpu case, we update the linear_buffer
+ * before updating the real front.
+ */
+ if (draw->is_different_gpu && draw->vtable->in_current_context(draw))
+ draw->ext->image->blitImage(dri_context,
+ front->linear_buffer,
+ front->image,
+ 0, 0, front->width,
+ front->height,
+ 0, 0, front->width,
+ front->height, __BLIT_FLAG_FLUSH);
+ loader_dri3_copy_drawable(draw, draw->drawable, front->pixmap);
+}
+
+/** dri3_flush_present_events
+ *
+ * Process any present events that have been received from the X server.
+ * Non-blocking: drains the special event queue with
+ * xcb_poll_for_special_event() and dispatches each event to
+ * dri3_handle_present_event().
+ */
+static void
+dri3_flush_present_events(struct loader_dri3_drawable *draw)
+{
+ /* Check to see if any configuration changes have occurred
+ * since we were last invoked
+ */
+ if (draw->special_event) {
+ xcb_generic_event_t *ev;
+
+ while ((ev = xcb_poll_for_special_event(draw->conn,
+ draw->special_event)) != NULL) {
+ xcb_present_generic_event_t *ge = (void *) ev;
+ dri3_handle_present_event(draw, ge);
+ }
+ }
+}
+
+/** loader_dri3_swap_buffers_msc
+ *
+ * Make the current back buffer visible using the present extension.
+ *
+ * Returns the swap-buffer count (send_sbc) of the queued present request,
+ * or 0 when nothing was presented (no back buffer, or the drawable is a
+ * pixmap). target_msc/divisor/remainder follow GLX_OML_sync_control
+ * semantics; all-zero means plain glXSwapBuffers() behaviour.
+ */
+int64_t
+loader_dri3_swap_buffers_msc(struct loader_dri3_drawable *draw,
+ int64_t target_msc, int64_t divisor,
+ int64_t remainder, unsigned flush_flags,
+ bool force_copy)
+{
+ struct loader_dri3_buffer *back;
+ __DRIcontext *dri_context;
+ int64_t ret = 0;
+ uint32_t options = XCB_PRESENT_OPTION_NONE;
+ int swap_interval;
+
+ dri_context = draw->vtable->get_dri_context(draw);
+ swap_interval = draw->vtable->get_swap_interval(draw);
+
+ draw->vtable->flush_drawable(draw, flush_flags);
+
+ back = draw->buffers[LOADER_DRI3_BACK_ID(draw->cur_back)];
+ if (draw->is_different_gpu && back) {
+ /* Update the linear buffer before presenting the pixmap */
+ draw->ext->image->blitImage(dri_context,
+ back->linear_buffer,
+ back->image,
+ 0, 0, back->width,
+ back->height,
+ 0, 0, back->width,
+ back->height, __BLIT_FLAG_FLUSH);
+ /* Update the fake front */
+ if (draw->have_fake_front)
+ draw->ext->image->blitImage(dri_context,
+ draw->buffers[LOADER_DRI3_FRONT_ID]->image,
+ back->image,
+ 0, 0, draw->width, draw->height,
+ 0, 0, draw->width, draw->height,
+ __BLIT_FLAG_FLUSH);
+ }
+
+ /* Drain pending Present events before queuing the next request. */
+ dri3_flush_present_events(draw);
+
+ if (back && !draw->is_pixmap) {
+ dri3_fence_reset(draw->conn, back);
+
+ /* Compute when we want the frame shown by taking the last known
+ * successful MSC and adding in a swap interval for each outstanding swap
+ * request. target_msc=divisor=remainder=0 means "Use glXSwapBuffers()
+ * semantic"
+ */
+ ++draw->send_sbc;
+ if (target_msc == 0 && divisor == 0 && remainder == 0)
+ target_msc = draw->msc + swap_interval *
+ (draw->send_sbc - draw->recv_sbc);
+ else if (divisor == 0 && remainder > 0) {
+ /* From the GLX_OML_sync_control spec:
+ * "If <divisor> = 0, the swap will occur when MSC becomes
+ * greater than or equal to <target_msc>."
+ *
+ * Note that there's no mention of the remainder. The Present
+ * extension throws BadValue for remainder != 0 with divisor == 0, so
+ * just drop the passed in value.
+ */
+ remainder = 0;
+ }
+
+ /* From the GLX_EXT_swap_control spec
+ * and the EGL 1.4 spec (page 53):
+ *
+ * "If <interval> is set to a value of 0, buffer swaps are not
+ * synchronized to a video frame."
+ *
+ * Implementation note: It is possible to enable triple buffering
+ * behaviour by not using XCB_PRESENT_OPTION_ASYNC, but this should not be
+ * the default.
+ */
+ if (swap_interval == 0)
+ options |= XCB_PRESENT_OPTION_ASYNC;
+ if (force_copy)
+ options |= XCB_PRESENT_OPTION_COPY;
+
+ back->busy = 1;
+ back->last_swap = draw->send_sbc;
+ xcb_present_pixmap(draw->conn,
+ draw->drawable,
+ back->pixmap,
+ (uint32_t) draw->send_sbc,
+ 0, /* valid */
+ 0, /* update */
+ 0, /* x_off */
+ 0, /* y_off */
+ None, /* target_crtc */
+ None,
+ back->sync_fence,
+ options,
+ target_msc,
+ divisor,
+ remainder, 0, NULL);
+ ret = (int64_t) draw->send_sbc;
+
+ /* If there's a fake front, then copy the source back buffer
+ * to the fake front to keep it up to date. This needs
+ * to reset the fence and make future users block until
+ * the X server is done copying the bits
+ */
+ if (draw->have_fake_front && !draw->is_different_gpu) {
+ dri3_fence_reset(draw->conn, draw->buffers[LOADER_DRI3_FRONT_ID]);
+ dri3_copy_area(draw->conn,
+ back->pixmap,
+ draw->buffers[LOADER_DRI3_FRONT_ID]->pixmap,
+ dri3_drawable_gc(draw),
+ 0, 0, 0, 0,
+ draw->width, draw->height);
+ dri3_fence_trigger(draw->conn, draw->buffers[LOADER_DRI3_FRONT_ID]);
+ }
+ xcb_flush(draw->conn);
+ /* Bump the drawable stamp so the driver notices the buffer change. */
+ if (draw->stamp)
+ ++(*draw->stamp);
+ }
+
+ /* Tell the driver its cached buffer info is out of date. */
+ (draw->ext->flush->invalidate)(draw->dri_drawable);
+
+ return ret;
+}
+
+/** loader_dri3_query_buffer_age
+ *
+ * Return the age (in swaps) of the current back buffer: the number of
+ * swaps since this buffer last held the frame being built, per
+ * EGL_EXT_buffer_age semantics. Returns 0 when the buffer's contents
+ * are undefined (never swapped, or no back buffer available).
+ */
+int
+loader_dri3_query_buffer_age(struct loader_dri3_drawable *draw)
+{
+ int back_id = LOADER_DRI3_BACK_ID(dri3_find_back(draw));
+
+ if (back_id < 0 || !draw->buffers[back_id])
+ return 0;
+
+ if (draw->buffers[back_id]->last_swap != 0)
+ return draw->send_sbc - draw->buffers[back_id]->last_swap + 1;
+ else
+ return 0;
+}
+
+/** loader_dri3_open
+ *
+ * Wrapper around xcb_dri3_open.
+ *
+ * Returns a DRM device file descriptor (with FD_CLOEXEC set) obtained
+ * from the X server, or -1 on failure. The caller owns the fd and must
+ * close it.
+ */
+int
+loader_dri3_open(xcb_connection_t *conn,
+ xcb_window_t root,
+ uint32_t provider)
+{
+ xcb_dri3_open_cookie_t cookie;
+ xcb_dri3_open_reply_t *reply;
+ int fd;
+
+ cookie = xcb_dri3_open(conn,
+ root,
+ provider);
+
+ reply = xcb_dri3_open_reply(conn, cookie, NULL);
+ if (!reply)
+ return -1;
+
+ /* DRI3Open is specified to return exactly one fd. */
+ if (reply->nfd != 1) {
+ free(reply);
+ return -1;
+ }
+
+ fd = xcb_dri3_open_reply_fds(conn, reply)[0];
+ fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
+
+ return fd;
+}
+
+/* Bytes per pixel for a __DRI_IMAGE_FORMAT code; 0 for unknown or
+ * unsupported formats (callers treat 0 as an allocation failure).
+ */
+static uint32_t
+dri3_cpp_for_format(uint32_t format) {
+ switch (format) {
+ case __DRI_IMAGE_FORMAT_R8:
+ return 1;
+ case __DRI_IMAGE_FORMAT_RGB565:
+ case __DRI_IMAGE_FORMAT_GR88:
+ return 2;
+ case __DRI_IMAGE_FORMAT_XRGB8888:
+ case __DRI_IMAGE_FORMAT_ARGB8888:
+ case __DRI_IMAGE_FORMAT_ABGR8888:
+ case __DRI_IMAGE_FORMAT_XBGR8888:
+ case __DRI_IMAGE_FORMAT_XRGB2101010:
+ case __DRI_IMAGE_FORMAT_ARGB2101010:
+ case __DRI_IMAGE_FORMAT_SARGB8:
+ return 4;
+ case __DRI_IMAGE_FORMAT_NONE:
+ default:
+ return 0;
+ }
+}
+
+/** dri3_alloc_render_buffer
+ *
+ * Use the driver createImage function to construct a __DRIimage, then
+ * get a file descriptor for that and create an X pixmap from that
+ *
+ * Allocate an xshmfence for synchronization
+ *
+ * Returns NULL on any failure; all partially-acquired resources are
+ * released via the goto-cleanup chain at the bottom.
+ */
+static struct loader_dri3_buffer *
+dri3_alloc_render_buffer(struct loader_dri3_drawable *draw, unsigned int format,
+ int width, int height, int depth)
+{
+ struct loader_dri3_buffer *buffer;
+ __DRIimage *pixmap_buffer;
+ xcb_pixmap_t pixmap;
+ xcb_sync_fence_t sync_fence;
+ struct xshmfence *shm_fence;
+ int buffer_fd, fence_fd;
+ int stride;
+
+ /* Create an xshmfence object and
+ * prepare to send that to the X server
+ */
+
+ fence_fd = xshmfence_alloc_shm();
+ if (fence_fd < 0)
+ return NULL;
+
+ shm_fence = xshmfence_map_shm(fence_fd);
+ if (shm_fence == NULL)
+ goto no_shm_fence;
+
+ /* Allocate the image from the driver
+ */
+ buffer = calloc(1, sizeof *buffer);
+ if (!buffer)
+ goto no_buffer;
+
+ buffer->cpp = dri3_cpp_for_format(format);
+ if (!buffer->cpp)
+ goto no_image;
+
+ if (!draw->is_different_gpu) {
+ /* Single-GPU: one shareable, scanout-capable image serves as both
+ * the render target and the pixmap backing.
+ */
+ buffer->image = (draw->ext->image->createImage)(draw->dri_screen,
+ width, height,
+ format,
+ __DRI_IMAGE_USE_SHARE |
+ __DRI_IMAGE_USE_SCANOUT,
+ buffer);
+ pixmap_buffer = buffer->image;
+
+ if (!buffer->image)
+ goto no_image;
+ } else {
+ /* PRIME: render into a (possibly tiled) local image and share a
+ * separate linear image with the display GPU.
+ */
+ buffer->image = (draw->ext->image->createImage)(draw->dri_screen,
+ width, height,
+ format,
+ 0,
+ buffer);
+
+ if (!buffer->image)
+ goto no_image;
+
+ buffer->linear_buffer =
+ (draw->ext->image->createImage)(draw->dri_screen,
+ width, height, format,
+ __DRI_IMAGE_USE_SHARE |
+ __DRI_IMAGE_USE_LINEAR,
+ buffer);
+ pixmap_buffer = buffer->linear_buffer;
+
+ if (!buffer->linear_buffer)
+ goto no_linear_buffer;
+ }
+
+ /* X wants the stride, so ask the image for it
+ */
+ if (!(draw->ext->image->queryImage)(pixmap_buffer, __DRI_IMAGE_ATTRIB_STRIDE,
+ &stride))
+ goto no_buffer_attrib;
+
+ buffer->pitch = stride;
+
+ if (!(draw->ext->image->queryImage)(pixmap_buffer, __DRI_IMAGE_ATTRIB_FD,
+ &buffer_fd))
+ goto no_buffer_attrib;
+
+ /* NOTE(review): buffer->size is passed here but is never assigned in
+ * this function (calloc leaves it 0). Confirm whether the server
+ * ignores the size argument or whether this should be stride * height.
+ */
+ xcb_dri3_pixmap_from_buffer(draw->conn,
+ (pixmap = xcb_generate_id(draw->conn)),
+ draw->drawable,
+ buffer->size,
+ width, height, buffer->pitch,
+ depth, buffer->cpp * 8,
+ buffer_fd);
+
+ xcb_dri3_fence_from_fd(draw->conn,
+ pixmap,
+ (sync_fence = xcb_generate_id(draw->conn)),
+ false,
+ fence_fd);
+
+ buffer->pixmap = pixmap;
+ buffer->own_pixmap = true;
+ buffer->sync_fence = sync_fence;
+ buffer->shm_fence = shm_fence;
+ buffer->width = width;
+ buffer->height = height;
+
+ /* Mark the buffer as idle
+ */
+ dri3_fence_set(buffer);
+
+ return buffer;
+
+no_buffer_attrib:
+ /* pixmap_buffer aliases image (single GPU) or linear_buffer (PRIME). */
+ (draw->ext->image->destroyImage)(pixmap_buffer);
+no_linear_buffer:
+ if (draw->is_different_gpu)
+ (draw->ext->image->destroyImage)(buffer->image);
+no_image:
+ free(buffer);
+no_buffer:
+ xshmfence_unmap_shm(shm_fence);
+no_shm_fence:
+ close(fence_fd);
+ return NULL;
+}
+
+/** dri3_update_drawable
+ *
+ * Called the first time we use the drawable and then
+ * after we receive present configure notify events to
+ * track the geometry of the drawable
+ *
+ * Returns true on success, false on X protocol failure. Despite the int
+ * return type, callers treat this as a boolean.
+ */
+static int
+dri3_update_drawable(__DRIdrawable *driDrawable,
+ struct loader_dri3_drawable *draw)
+{
+ if (draw->first_init) {
+ xcb_get_geometry_cookie_t geom_cookie;
+ xcb_get_geometry_reply_t *geom_reply;
+ xcb_void_cookie_t cookie;
+ xcb_generic_error_t *error;
+ xcb_present_query_capabilities_cookie_t present_capabilities_cookie;
+ xcb_present_query_capabilities_reply_t *present_capabilities_reply;
+
+ draw->first_init = false;
+
+ /* Try to select for input on the window.
+ *
+ * If the drawable is a window, this will get our events
+ * delivered.
+ *
+ * Otherwise, we'll get a BadWindow error back from this request which
+ * will let us know that the drawable is a pixmap instead.
+ */
+
+ draw->eid = xcb_generate_id(draw->conn);
+ cookie =
+ xcb_present_select_input_checked(draw->conn, draw->eid, draw->drawable,
+ XCB_PRESENT_EVENT_MASK_CONFIGURE_NOTIFY |
+ XCB_PRESENT_EVENT_MASK_COMPLETE_NOTIFY |
+ XCB_PRESENT_EVENT_MASK_IDLE_NOTIFY);
+
+ present_capabilities_cookie =
+ xcb_present_query_capabilities(draw->conn, draw->drawable);
+
+ /* Create an XCB event queue to hold present events outside of the usual
+ * application event queue
+ */
+ draw->special_event = xcb_register_for_special_xge(draw->conn,
+ &xcb_present_id,
+ draw->eid,
+ draw->stamp);
+ geom_cookie = xcb_get_geometry(draw->conn, draw->drawable);
+
+ geom_reply = xcb_get_geometry_reply(draw->conn, geom_cookie, NULL);
+
+ if (!geom_reply)
+ return false;
+
+ draw->width = geom_reply->width;
+ draw->height = geom_reply->height;
+ draw->depth = geom_reply->depth;
+ draw->vtable->set_drawable_size(draw, draw->width, draw->height);
+
+ free(geom_reply);
+
+ draw->is_pixmap = false;
+
+ /* Check to see if our select input call failed. If it failed with a
+ * BadWindow error, then assume the drawable is a pixmap. Destroy the
+ * special event queue created above and mark the drawable as a pixmap
+ */
+
+ error = xcb_request_check(draw->conn, cookie);
+
+ present_capabilities_reply =
+ xcb_present_query_capabilities_reply(draw->conn,
+ present_capabilities_cookie,
+ NULL);
+
+ if (present_capabilities_reply) {
+ draw->present_capabilities = present_capabilities_reply->capabilities;
+ free(present_capabilities_reply);
+ } else
+ draw->present_capabilities = 0;
+
+ if (error) {
+ if (error->error_code != BadWindow) {
+ free(error);
+ return false;
+ }
+ /* NOTE(review): 'error' is not freed on this BadWindow path — looks
+ * like a small leak; confirm and add free(error) here.
+ */
+ draw->is_pixmap = true;
+ xcb_unregister_for_special_event(draw->conn, draw->special_event);
+ draw->special_event = NULL;
+ }
+ }
+ dri3_flush_present_events(draw);
+ return true;
+}
+
+/* the DRIimage createImage function takes __DRI_IMAGE_FORMAT codes, while
+ * the createImageFromFds call takes __DRI_IMAGE_FOURCC codes. To avoid
+ * complete confusion, just deal in __DRI_IMAGE_FORMAT codes for now and
+ * translate to __DRI_IMAGE_FOURCC codes in the call to createImageFromFds
+ *
+ * Returns 0 for formats with no fourcc mapping here.
+ */
+static int
+image_format_to_fourcc(int format)
+{
+
+ /* Convert from __DRI_IMAGE_FORMAT to __DRI_IMAGE_FOURCC (sigh) */
+ switch (format) {
+ case __DRI_IMAGE_FORMAT_SARGB8: return __DRI_IMAGE_FOURCC_SARGB8888;
+ case __DRI_IMAGE_FORMAT_RGB565: return __DRI_IMAGE_FOURCC_RGB565;
+ case __DRI_IMAGE_FORMAT_XRGB8888: return __DRI_IMAGE_FOURCC_XRGB8888;
+ case __DRI_IMAGE_FORMAT_ARGB8888: return __DRI_IMAGE_FOURCC_ARGB8888;
+ case __DRI_IMAGE_FORMAT_ABGR8888: return __DRI_IMAGE_FOURCC_ABGR8888;
+ case __DRI_IMAGE_FORMAT_XBGR8888: return __DRI_IMAGE_FOURCC_XBGR8888;
+ }
+ return 0;
+}
+
+/** loader_dri3_create_image
+ *
+ * Wrap the fd from a DRI3BufferFromPixmap reply in a single-plane
+ * __DRIimage. Takes ownership of the reply's fd (closes it); returns
+ * NULL on failure.
+ */
+__DRIimage *
+loader_dri3_create_image(xcb_connection_t *c,
+ xcb_dri3_buffer_from_pixmap_reply_t *bp_reply,
+ unsigned int format,
+ __DRIscreen *dri_screen,
+ const __DRIimageExtension *image,
+ void *loaderPrivate)
+{
+ int *fds;
+ __DRIimage *image_planar, *ret;
+ int stride, offset;
+
+ /* Get an FD for the pixmap object
+ */
+ fds = xcb_dri3_buffer_from_pixmap_reply_fds(c, bp_reply);
+
+ stride = bp_reply->stride;
+ offset = 0;
+
+ /* createImageFromFds creates a wrapper __DRIimage structure which
+ * can deal with multiple planes for things like Yuv images. So, once
+ * we've gotten the planar wrapper, pull the single plane out of it and
+ * discard the wrapper.
+ */
+ image_planar = (image->createImageFromFds)(dri_screen,
+ bp_reply->width,
+ bp_reply->height,
+ image_format_to_fourcc(format),
+ fds, 1,
+ &stride, &offset, loaderPrivate);
+ /* The driver dups the fd internally; we always close our copy. */
+ close(fds[0]);
+ if (!image_planar)
+ return NULL;
+
+ ret = (image->fromPlanar)(image_planar, 0, loaderPrivate);
+
+ (image->destroyImage)(image_planar);
+
+ return ret;
+}
+
+/** dri3_get_pixmap_buffer
+ *
+ * Get the DRM object for a pixmap from the X server and
+ * wrap that with a __DRIimage structure using createImageFromFds
+ *
+ * The result is cached in draw->buffers[buf_id]; subsequent calls return
+ * the cached buffer. Returns NULL on failure.
+ */
+static struct loader_dri3_buffer *
+dri3_get_pixmap_buffer(__DRIdrawable *driDrawable, unsigned int format,
+ enum loader_dri3_buffer_type buffer_type,
+ struct loader_dri3_drawable *draw)
+{
+ int buf_id = loader_dri3_pixmap_buf_id(buffer_type);
+ struct loader_dri3_buffer *buffer = draw->buffers[buf_id];
+ xcb_drawable_t pixmap;
+ xcb_dri3_buffer_from_pixmap_cookie_t bp_cookie;
+ xcb_dri3_buffer_from_pixmap_reply_t *bp_reply;
+ xcb_sync_fence_t sync_fence;
+ struct xshmfence *shm_fence;
+ int fence_fd;
+
+ /* Already wrapped this pixmap — reuse the cached buffer. */
+ if (buffer)
+ return buffer;
+
+ pixmap = draw->drawable;
+
+ buffer = calloc(1, sizeof *buffer);
+ if (!buffer)
+ goto no_buffer;
+
+ fence_fd = xshmfence_alloc_shm();
+ if (fence_fd < 0)
+ goto no_fence;
+ shm_fence = xshmfence_map_shm(fence_fd);
+ if (shm_fence == NULL) {
+ close (fence_fd);
+ goto no_fence;
+ }
+
+ xcb_dri3_fence_from_fd(draw->conn,
+ pixmap,
+ (sync_fence = xcb_generate_id(draw->conn)),
+ false,
+ fence_fd);
+
+ bp_cookie = xcb_dri3_buffer_from_pixmap(draw->conn, pixmap);
+ bp_reply = xcb_dri3_buffer_from_pixmap_reply(draw->conn, bp_cookie, NULL);
+ if (!bp_reply)
+ goto no_image;
+
+ buffer->image = loader_dri3_create_image(draw->conn, bp_reply, format,
+ draw->dri_screen, draw->ext->image,
+ buffer);
+ if (!buffer->image)
+ goto no_image;
+
+ /* The pixmap belongs to the application; don't free it on destroy. */
+ buffer->pixmap = pixmap;
+ buffer->own_pixmap = false;
+ buffer->width = bp_reply->width;
+ buffer->height = bp_reply->height;
+ buffer->buffer_type = buffer_type;
+ buffer->shm_fence = shm_fence;
+ buffer->sync_fence = sync_fence;
+
+ draw->buffers[buf_id] = buffer;
+
+ free(bp_reply);
+
+ return buffer;
+
+no_image:
+ free(bp_reply);
+ xcb_sync_destroy_fence(draw->conn, sync_fence);
+ xshmfence_unmap_shm(shm_fence);
+no_fence:
+ free(buffer);
+no_buffer:
+ return NULL;
+}
+
+/** dri3_get_buffer
+ *
+ * Find a front or back buffer, allocating new ones as necessary
+ *
+ * On a size change the old contents are preserved by copying or blitting
+ * into the newly-allocated buffer before the old one is freed. Waits on
+ * the buffer's fence before returning, so the result is safe to render
+ * into. Returns NULL on allocation failure.
+ */
+static struct loader_dri3_buffer *
+dri3_get_buffer(__DRIdrawable *driDrawable,
+ unsigned int format,
+ enum loader_dri3_buffer_type buffer_type,
+ struct loader_dri3_drawable *draw)
+{
+ struct loader_dri3_buffer *buffer;
+ int buf_id;
+ __DRIcontext *dri_context;
+
+ dri_context = draw->vtable->get_dri_context(draw);
+
+ if (buffer_type == loader_dri3_buffer_back) {
+ buf_id = dri3_find_back(draw);
+
+ if (buf_id < 0)
+ return NULL;
+ } else {
+ buf_id = LOADER_DRI3_FRONT_ID;
+ }
+
+ buffer = draw->buffers[buf_id];
+
+ /* Allocate a new buffer if there isn't an old one, or if that
+ * old one is the wrong size
+ */
+ if (!buffer || buffer->width != draw->width ||
+ buffer->height != draw->height) {
+ struct loader_dri3_buffer *new_buffer;
+
+ /* Allocate the new buffers
+ */
+ new_buffer = dri3_alloc_render_buffer(draw,
+ format,
+ draw->width,
+ draw->height,
+ draw->depth);
+ if (!new_buffer)
+ return NULL;
+
+ /* When resizing, copy the contents of the old buffer, waiting for that
+ * copy to complete using our fences before proceeding
+ */
+ switch (buffer_type) {
+ case loader_dri3_buffer_back:
+ if (buffer) {
+ /* Server-side copy for shared pixmaps; client-side blit when
+ * rendering on a different GPU (linear_buffer present).
+ */
+ if (!buffer->linear_buffer) {
+ dri3_fence_reset(draw->conn, new_buffer);
+ dri3_fence_await(draw->conn, buffer);
+ dri3_copy_area(draw->conn,
+ buffer->pixmap,
+ new_buffer->pixmap,
+ dri3_drawable_gc(draw),
+ 0, 0, 0, 0,
+ draw->width, draw->height);
+ dri3_fence_trigger(draw->conn, new_buffer);
+ } else if (draw->vtable->in_current_context(draw)) {
+ draw->ext->image->blitImage(dri_context,
+ new_buffer->image,
+ buffer->image,
+ 0, 0, draw->width, draw->height,
+ 0, 0, draw->width, draw->height, 0);
+ }
+ dri3_free_render_buffer(draw, buffer);
+ }
+ break;
+ case loader_dri3_buffer_front:
+ /* Seed the new fake front from the real front (the drawable). */
+ dri3_fence_reset(draw->conn, new_buffer);
+ dri3_copy_area(draw->conn,
+ draw->drawable,
+ new_buffer->pixmap,
+ dri3_drawable_gc(draw),
+ 0, 0, 0, 0,
+ draw->width, draw->height);
+ dri3_fence_trigger(draw->conn, new_buffer);
+
+ if (new_buffer->linear_buffer &&
+ draw->vtable->in_current_context(draw)) {
+ dri3_fence_await(draw->conn, new_buffer);
+ draw->ext->image->blitImage(dri_context,
+ new_buffer->image,
+ new_buffer->linear_buffer,
+ 0, 0, draw->width, draw->height,
+ 0, 0, draw->width, draw->height, 0);
+ }
+ break;
+ }
+ buffer = new_buffer;
+ buffer->buffer_type = buffer_type;
+ draw->buffers[buf_id] = buffer;
+ }
+ /* Block until the X server is done with this buffer. */
+ dri3_fence_await(draw->conn, buffer);
+
+ /* Return the requested buffer */
+ return buffer;
+}
+
+/** dri3_free_buffers
+ *
+ * Free the front buffer or all of the back buffers. Used
+ * when the application changes which buffers it needs
+ */
+static void
+dri3_free_buffers(__DRIdrawable *driDrawable,
+ enum loader_dri3_buffer_type buffer_type,
+ struct loader_dri3_drawable *draw)
+{
+ struct loader_dri3_buffer *buffer;
+ int first_id;
+ int n_id;
+ int buf_id;
+
+ /* NOTE(review): no default case — first_id/n_id stay uninitialized for
+ * any value outside the two enum constants, and the final case has no
+ * break (harmless as the last case, but worth an explicit break).
+ */
+ switch (buffer_type) {
+ case loader_dri3_buffer_back:
+ first_id = LOADER_DRI3_BACK_ID(0);
+ n_id = LOADER_DRI3_MAX_BACK;
+ break;
+ case loader_dri3_buffer_front:
+ first_id = LOADER_DRI3_FRONT_ID;
+ n_id = 1;
+ }
+
+ for (buf_id = first_id; buf_id < first_id + n_id; buf_id++) {
+ buffer = draw->buffers[buf_id];
+ if (buffer) {
+ dri3_free_render_buffer(draw, buffer);
+ draw->buffers[buf_id] = NULL;
+ }
+ }
+}
+
+/** loader_dri3_get_buffers
+ *
+ * The published buffer allocation API.
+ * Returns all of the necessary buffers, allocating
+ * as needed.
+ *
+ * Fills in buffers->image_mask/front/back per buffer_mask and returns
+ * true on success, false on failure (boolean despite the int type).
+ * This is the __DRIimageLoaderExtension getBuffers entry point.
+ */
+int
+loader_dri3_get_buffers(__DRIdrawable *driDrawable,
+ unsigned int format,
+ uint32_t *stamp,
+ void *loaderPrivate,
+ uint32_t buffer_mask,
+ struct __DRIimageList *buffers)
+{
+ struct loader_dri3_drawable *draw = loaderPrivate;
+ struct loader_dri3_buffer *front, *back;
+
+ buffers->image_mask = 0;
+ buffers->front = NULL;
+ buffers->back = NULL;
+
+ front = NULL;
+ back = NULL;
+
+ if (!dri3_update_drawable(driDrawable, draw))
+ return false;
+
+ /* pixmaps always have front buffers */
+ if (draw->is_pixmap)
+ buffer_mask |= __DRI_IMAGE_BUFFER_FRONT;
+
+ if (buffer_mask & __DRI_IMAGE_BUFFER_FRONT) {
+ /* All pixmaps are owned by the server gpu.
+ * When we use a different gpu, we can't use the pixmap
+ * as buffer since it is potentially tiled a way
+ * our device can't understand. In this case, use
+ * a fake front buffer. Hopefully the pixmap
+ * content will get synced with the fake front
+ * buffer.
+ */
+ if (draw->is_pixmap && !draw->is_different_gpu)
+ front = dri3_get_pixmap_buffer(driDrawable,
+ format,
+ loader_dri3_buffer_front,
+ draw);
+ else
+ front = dri3_get_buffer(driDrawable,
+ format,
+ loader_dri3_buffer_front,
+ draw);
+
+ if (!front)
+ return false;
+ } else {
+ /* Front no longer requested — release it. */
+ dri3_free_buffers(driDrawable, loader_dri3_buffer_front, draw);
+ draw->have_fake_front = 0;
+ }
+
+ if (buffer_mask & __DRI_IMAGE_BUFFER_BACK) {
+ back = dri3_get_buffer(driDrawable,
+ format,
+ loader_dri3_buffer_back,
+ draw);
+ if (!back)
+ return false;
+ draw->have_back = 1;
+ } else {
+ dri3_free_buffers(driDrawable, loader_dri3_buffer_back, draw);
+ draw->have_back = 0;
+ }
+
+ if (front) {
+ buffers->image_mask |= __DRI_IMAGE_BUFFER_FRONT;
+ buffers->front = front->image;
+ /* A "fake" front exists whenever the front isn't the real pixmap. */
+ draw->have_fake_front = draw->is_different_gpu || !draw->is_pixmap;
+ }
+
+ if (back) {
+ buffers->image_mask |= __DRI_IMAGE_BUFFER_BACK;
+ buffers->back = back->image;
+ }
+
+ draw->stamp = stamp;
+
+ return true;
+}
diff --git a/src/loader/loader_dri3_helper.h b/src/loader/loader_dri3_helper.h
new file mode 100644
index 00000000000..5b8fd1d24ca
--- /dev/null
+++ b/src/loader/loader_dri3_helper.h
@@ -0,0 +1,241 @@
+/*
+ * Copyright © 2013 Keith Packard
+ * Copyright © 2015 Boyan Ding
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. The copyright holders make no representations
+ * about the suitability of this software for any purpose. It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#ifndef LOADER_DRI3_HEADER_H
+#define LOADER_DRI3_HEADER_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <xcb/xcb.h>
+#include <xcb/dri3.h>
+#include <xcb/present.h>
+
+#include <GL/gl.h>
+#include <GL/internal/dri_interface.h>
+
+/* Role of a loader_dri3_buffer: back (render target) or front (fake front
+ * / pixmap wrapper).
+ */
+enum loader_dri3_buffer_type {
+ loader_dri3_buffer_back = 0,
+ loader_dri3_buffer_front = 1
+};
+
+/* One DRI3 color buffer: a __DRIimage plus the X pixmap and fences that
+ * coordinate ownership between client and server.
+ */
+struct loader_dri3_buffer {
+ __DRIimage *image;
+ __DRIimage *linear_buffer; /* PRIME only: linear copy shared with the display GPU */
+ uint32_t pixmap;
+
+ /* Synchronization between the client and X server is done using an
+ * xshmfence that is mapped into an X server SyncFence. This lets the
+ * client check whether the X server is done using a buffer with a simple
+ * xshmfence call, rather than going to read X events from the wire.
+ *
+ * However, we can only wait for one xshmfence to be triggered at a time,
+ * so we need to know *which* buffer is going to be idle next. We do that
+ * by waiting for a PresentIdleNotify event. When that event arrives, the
+ * 'busy' flag gets cleared and the client knows that the fence has been
+ * triggered, and that the wait call will not block.
+ */
+
+ uint32_t sync_fence; /* XID of X SyncFence object */
+ struct xshmfence *shm_fence; /* pointer to xshmfence object */
+ bool busy; /* Set on swap, cleared on IdleNotify */
+ bool own_pixmap; /* We allocated the pixmap ID, free on destroy */
+
+ uint32_t size;
+ uint32_t pitch; /* row stride in bytes (from __DRI_IMAGE_ATTRIB_STRIDE) */
+ uint32_t cpp; /* bytes per pixel */
+ uint32_t flags;
+ uint32_t width, height;
+ uint64_t last_swap; /* send_sbc at the most recent present of this buffer */
+
+ enum loader_dri3_buffer_type buffer_type;
+};
+
+
+#define LOADER_DRI3_MAX_BACK 4
+#define LOADER_DRI3_BACK_ID(i) (i)
+#define LOADER_DRI3_FRONT_ID (LOADER_DRI3_MAX_BACK)
+
+/* Map a buffer type to its fixed slot in loader_dri3_drawable::buffers:
+ * back buffers start at slot 0, the (single) front lives after them.
+ */
+static inline int
+loader_dri3_pixmap_buf_id(enum loader_dri3_buffer_type buffer_type)
+{
+ if (buffer_type == loader_dri3_buffer_back)
+ return LOADER_DRI3_BACK_ID(0);
+ else
+ return LOADER_DRI3_FRONT_ID;
+}
+
+/* Driver-provided DRI extensions the loader needs; filled in by the
+ * platform code (GLX/EGL) before using the helper functions.
+ */
+struct loader_dri3_extensions {
+ const __DRIcoreExtension *core;
+ const __DRIimageDriverExtension *image_driver;
+ const __DRI2flushExtension *flush;
+ const __DRI2configQueryExtension *config;
+ const __DRItexBufferExtension *tex_buffer;
+ const __DRIimageExtension *image;
+};
+
+struct loader_dri3_drawable;
+
+/* Callbacks supplied by the window-system binding (GLX or EGL) so the
+ * shared DRI3 code can query swap-interval state, the current context,
+ * and drawable bookkeeping without knowing which API it serves.
+ */
+struct loader_dri3_vtable {
+ int (*get_swap_interval)(struct loader_dri3_drawable *);
+ int (*clamp_swap_interval)(struct loader_dri3_drawable *, int);
+ void (*set_swap_interval)(struct loader_dri3_drawable *, int);
+ void (*set_drawable_size)(struct loader_dri3_drawable *, int, int);
+ bool (*in_current_context)(struct loader_dri3_drawable *);
+ __DRIcontext *(*get_dri_context)(struct loader_dri3_drawable *);
+ void (*flush_drawable)(struct loader_dri3_drawable *, unsigned);
+ void (*show_fps)(struct loader_dri3_drawable *, uint64_t);
+};
+
+#define LOADER_DRI3_NUM_BUFFERS (1 + LOADER_DRI3_MAX_BACK)
+
+/* Per-drawable DRI3/Present state shared between the GLX and EGL paths. */
+struct loader_dri3_drawable {
+ xcb_connection_t *conn; /* X connection the drawable lives on */
+ __DRIdrawable *dri_drawable;
+ xcb_drawable_t drawable; /* X window or pixmap ID */
+ int width;
+ int height;
+ int depth;
+ uint8_t have_back;
+ uint8_t have_fake_front;
+ uint8_t is_pixmap; /* set when Present select-input got BadWindow */
+ uint8_t flipping;
+
+ /* Information about the GPU owning the buffer */
+ __DRIscreen *dri_screen;
+ bool is_different_gpu; /* PRIME: render GPU differs from display GPU */
+
+ /* Present extension capabilities
+ */
+ uint32_t present_capabilities;
+
+ /* SBC numbers are tracked by using the serial numbers
+ * in the present request and complete events
+ */
+ uint64_t send_sbc;
+ uint64_t recv_sbc;
+
+ /* Last received UST/MSC values for pixmap present complete */
+ uint64_t ust, msc;
+
+ /* Last received UST/MSC values from present notify msc event */
+ uint64_t notify_ust, notify_msc;
+
+ /* Serial numbers for tracking wait_for_msc events */
+ uint32_t send_msc_serial;
+ uint32_t recv_msc_serial;
+
+ /* Slot layout: backs at [0..LOADER_DRI3_MAX_BACK), front last. */
+ struct loader_dri3_buffer *buffers[LOADER_DRI3_NUM_BUFFERS];
+ int cur_back;
+ int num_back;
+
+ uint32_t *stamp; /* driver-visible invalidation counter */
+
+ xcb_present_event_t eid; /* event ID used for Present select-input */
+ xcb_gcontext_t gc;
+ xcb_special_event_t *special_event; /* private queue for Present events */
+
+ bool first_init; /* geometry/select-input not done yet */
+
+ struct loader_dri3_extensions *ext;
+ struct loader_dri3_vtable *vtable;
+};
+
+void
+loader_dri3_set_swap_interval(struct loader_dri3_drawable *draw,
+ int interval);
+
+void
+loader_dri3_drawable_fini(struct loader_dri3_drawable *draw);
+
+int
+loader_dri3_drawable_init(xcb_connection_t *conn,
+ xcb_drawable_t drawable,
+ __DRIscreen *dri_screen,
+ bool is_different_gpu,
+ const __DRIconfig *dri_config,
+ struct loader_dri3_extensions *ext,
+ struct loader_dri3_vtable *vtable,
+ struct loader_dri3_drawable*);
+
+bool loader_dri3_wait_for_msc(struct loader_dri3_drawable *draw,
+ int64_t target_msc,
+ int64_t divisor, int64_t remainder,
+ int64_t *ust, int64_t *msc, int64_t *sbc);
+
+int64_t
+loader_dri3_swap_buffers_msc(struct loader_dri3_drawable *draw,
+ int64_t target_msc, int64_t divisor,
+ int64_t remainder, unsigned flush_flags,
+ bool force_copy);
+
+int
+loader_dri3_wait_for_sbc(struct loader_dri3_drawable *draw,
+ int64_t target_sbc, int64_t *ust,
+ int64_t *msc, int64_t *sbc);
+
+int loader_dri3_query_buffer_age(struct loader_dri3_drawable *draw);
+
+void
+loader_dri3_flush(struct loader_dri3_drawable *draw,
+ unsigned flags,
+ enum __DRI2throttleReason throttle_reason);
+
+void
+loader_dri3_copy_sub_buffer(struct loader_dri3_drawable *draw,
+ int x, int y,
+ int width, int height,
+ bool flush);
+
+void
+loader_dri3_copy_drawable(struct loader_dri3_drawable *draw,
+ xcb_drawable_t dest,
+ xcb_drawable_t src);
+
+void
+loader_dri3_wait_x(struct loader_dri3_drawable *draw);
+
+void
+loader_dri3_wait_gl(struct loader_dri3_drawable *draw);
+
+int loader_dri3_open(xcb_connection_t *conn,
+ xcb_window_t root,
+ uint32_t provider);
+
+__DRIimage *
+loader_dri3_create_image(xcb_connection_t *c,
+ xcb_dri3_buffer_from_pixmap_reply_t *bp_reply,
+ unsigned int format,
+ __DRIscreen *dri_screen,
+ const __DRIimageExtension *image,
+ void *loaderPrivate);
+
+int
+loader_dri3_get_buffers(__DRIdrawable *driDrawable,
+ unsigned int format,
+ uint32_t *stamp,
+ void *loaderPrivate,
+ uint32_t buffer_mask,
+ struct __DRIimageList *buffers);
+
+#endif
diff --git a/src/mapi/glapi/gen/EXT_gpu_shader4.xml b/src/mapi/glapi/gen/EXT_gpu_shader4.xml
index b1f7eae2610..b4120b9c192 100644
--- a/src/mapi/glapi/gen/EXT_gpu_shader4.xml
+++ b/src/mapi/glapi/gen/EXT_gpu_shader4.xml
@@ -232,7 +232,8 @@
<param name="params" type="GLuint *"/>
</function>
- <function name="BindFragDataLocationEXT" alias="BindFragDataLocation">
+ <function name="BindFragDataLocationEXT" alias="BindFragDataLocation"
+ es2="3.0">
<param name="program" type="GLuint"/>
<param name="colorNumber" type="GLuint"/>
<param name="name" type="const GLchar *"/>
diff --git a/src/mapi/glapi/gen/es_EXT.xml b/src/mapi/glapi/gen/es_EXT.xml
index 9a777a24c61..577d8254c43 100644
--- a/src/mapi/glapi/gen/es_EXT.xml
+++ b/src/mapi/glapi/gen/es_EXT.xml
@@ -914,4 +914,30 @@
</function>
</category>
+<category name="GL_EXT_blend_func_extended" number="247">
+
+ <function name="BindFragDataLocationIndexedEXT" alias="BindFragDataLocationIndexed"
+ es2="3.0">
+ <param name="program" type="GLuint"/>
+ <param name="colorNumber" type="GLuint"/>
+ <param name="index" type="GLuint"/>
+ <param name="name" type="const GLchar *"/>
+ </function>
+
+ <function name="GetFragDataIndexEXT" alias="GetFragDataIndex"
+ es2="3.0">
+ <param name="program" type="GLuint"/>
+ <param name="name" type="const GLchar *"/>
+ <return type="GLint"/>
+ </function>
+
+ <function name="GetProgramResourceLocationIndexEXT" alias="GetProgramResourceLocationIndex"
+ es2="3.1">
+ <param name="program" type="GLuint"/>
+ <param name="programInterface" type="GLenum"/>
+ <param name="name" type="const GLchar *"/>
+ <return type="GLint"/>
+ </function>
+
+</category>
</OpenGLAPI>
diff --git a/src/mesa/drivers/common/meta_generate_mipmap.c b/src/mesa/drivers/common/meta_generate_mipmap.c
index a9da0a21ba3..bde170fcf6f 100644
--- a/src/mesa/drivers/common/meta_generate_mipmap.c
+++ b/src/mesa/drivers/common/meta_generate_mipmap.c
@@ -102,13 +102,13 @@ fallback_required(struct gl_context *ctx, GLenum target,
*/
if (!mipmap->FBO)
_mesa_GenFramebuffers(1, &mipmap->FBO);
- _mesa_BindFramebuffer(GL_FRAMEBUFFER_EXT, mipmap->FBO);
+ _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, mipmap->FBO);
- _mesa_meta_bind_fbo_image(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, baseImage, 0);
+ _mesa_meta_bind_fbo_image(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, baseImage, 0);
- status = _mesa_CheckFramebufferStatus(GL_FRAMEBUFFER_EXT);
+ status = _mesa_CheckFramebufferStatus(GL_DRAW_FRAMEBUFFER);
- _mesa_BindFramebuffer(GL_FRAMEBUFFER_EXT, fboSave);
+ _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, fboSave);
if (status != GL_FRAMEBUFFER_COMPLETE_EXT) {
_mesa_perf_debug(ctx, MESA_DEBUG_SEVERITY_HIGH,
@@ -131,6 +131,11 @@ _mesa_meta_glsl_generate_mipmap_cleanup(struct gen_mipmap_state *mipmap)
_mesa_DeleteSamplers(1, &mipmap->Sampler);
mipmap->Sampler = 0;
+ if (mipmap->FBO != 0) {
+ _mesa_DeleteFramebuffers(1, &mipmap->FBO);
+ mipmap->FBO = 0;
+ }
+
_mesa_meta_blit_shader_table_cleanup(&mipmap->shaders);
}
diff --git a/src/mesa/drivers/dri/common/xmlconfig.c b/src/mesa/drivers/dri/common/xmlconfig.c
index b8ab480ddfe..a8f7c9b854b 100644
--- a/src/mesa/drivers/dri/common/xmlconfig.c
+++ b/src/mesa/drivers/dri/common/xmlconfig.c
@@ -59,6 +59,9 @@ extern char *program_invocation_name, *program_invocation_short_name;
#elif defined(__NetBSD__) && defined(__NetBSD_Version__) && (__NetBSD_Version__ >= 106000100)
# include <stdlib.h>
# define GET_PROGRAM_NAME() getprogname()
+#elif defined(__DragonFly__)
+# include <stdlib.h>
+# define GET_PROGRAM_NAME() getprogname()
#elif defined(__APPLE__)
# include <stdlib.h>
# define GET_PROGRAM_NAME() getprogname()
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
index 7fa4ce87f18..b8990cef89e 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
@@ -85,7 +85,7 @@ brw_blorp_eu_emitter::emit_texture_lookup(const struct brw_reg &dst,
unsigned msg_length)
{
fs_inst *inst = new (mem_ctx) fs_inst(op, 16, dst, brw_message_reg(base_mrf),
- fs_reg(0u), fs_reg(0u));
+ brw_imm_ud(0u), brw_imm_ud(0u));
inst->base_mrf = base_mrf;
inst->mlen = msg_length;
diff --git a/src/mesa/drivers/dri/i965/brw_clip_util.c b/src/mesa/drivers/dri/i965/brw_clip_util.c
index 40ad14402a7..73ba85e2a61 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_util.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_util.c
@@ -224,7 +224,10 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
vec1(t_nopersp),
brw_imm_f(0));
brw_IF(p, BRW_EXECUTE_1);
- brw_MOV(p, t_nopersp, brw_imm_vf4(1, 0, 0, 0));
+ brw_MOV(p, t_nopersp, brw_imm_vf4(brw_float_to_vf(1.0),
+ brw_float_to_vf(0.0),
+ brw_float_to_vf(0.0),
+ brw_float_to_vf(0.0)));
brw_ENDIF(p);
/* Now compute t_nopersp = t_nopersp.y/t_nopersp.x and broadcast it. */
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index cd78af0dce4..e49994f19a8 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -23,6 +23,7 @@
#pragma once
+#include <stdio.h>
#include "brw_device_info.h"
#include "main/mtypes.h"
@@ -89,8 +90,7 @@ struct brw_compiler {
void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
- bool scalar_vs;
- bool scalar_gs;
+ bool scalar_stage[MESA_SHADER_STAGES];
struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];
};
@@ -493,6 +493,34 @@ enum shader_dispatch_mode {
DISPATCH_MODE_SIMD8 = 3,
};
+/**
+ * @defgroup Tessellator parameter enumerations.
+ *
+ * These correspond to the hardware values in 3DSTATE_TE, and are provided
+ * as part of the tessellation evaluation shader.
+ *
+ * @{
+ */
+enum brw_tess_partitioning {
+ BRW_TESS_PARTITIONING_INTEGER = 0,
+ BRW_TESS_PARTITIONING_ODD_FRACTIONAL = 1,
+ BRW_TESS_PARTITIONING_EVEN_FRACTIONAL = 2,
+};
+
+enum brw_tess_output_topology {
+ BRW_TESS_OUTPUT_TOPOLOGY_POINT = 0,
+ BRW_TESS_OUTPUT_TOPOLOGY_LINE = 1,
+ BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW = 2,
+ BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW = 3,
+};
+
+enum brw_tess_domain {
+ BRW_TESS_DOMAIN_QUAD = 0,
+ BRW_TESS_DOMAIN_TRI = 1,
+ BRW_TESS_DOMAIN_ISOLINE = 2,
+};
+/** @} */
+
struct brw_vue_prog_data {
struct brw_stage_prog_data base;
struct brw_vue_map vue_map;
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index ac6045dbba9..2ea0a9eca92 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -322,64 +322,82 @@ static void
brw_initialize_context_constants(struct brw_context *brw)
{
struct gl_context *ctx = &brw->ctx;
+ const struct brw_compiler *compiler = brw->intelScreen->compiler;
+
+ const bool stage_exists[MESA_SHADER_STAGES] = {
+ [MESA_SHADER_VERTEX] = true,
+ [MESA_SHADER_TESS_CTRL] = false,
+ [MESA_SHADER_TESS_EVAL] = false,
+ [MESA_SHADER_GEOMETRY] = brw->gen >= 6,
+ [MESA_SHADER_FRAGMENT] = true,
+ [MESA_SHADER_COMPUTE] = _mesa_extension_override_enables.ARB_compute_shader,
+ };
+
+ unsigned num_stages = 0;
+ for (int i = 0; i < MESA_SHADER_STAGES; i++) {
+ if (stage_exists[i])
+ num_stages++;
+ }
unsigned max_samplers =
brw->gen >= 8 || brw->is_haswell ? BRW_MAX_TEX_UNIT : 16;
+ ctx->Const.MaxDualSourceDrawBuffers = 1;
+ ctx->Const.MaxDrawBuffers = BRW_MAX_DRAW_BUFFERS;
+ ctx->Const.MaxCombinedShaderOutputResources =
+ MAX_IMAGE_UNITS + BRW_MAX_DRAW_BUFFERS;
+
ctx->Const.QueryCounterBits.Timestamp = 36;
+ ctx->Const.MaxTextureCoordUnits = 8; /* Mesa limit */
+ ctx->Const.MaxImageUnits = MAX_IMAGE_UNITS;
+ ctx->Const.MaxRenderbufferSize = 8192;
+ ctx->Const.MaxTextureLevels = MIN2(14 /* 8192 */, MAX_TEXTURE_LEVELS);
+ ctx->Const.Max3DTextureLevels = 12; /* 2048 */
+ ctx->Const.MaxCubeTextureLevels = 14; /* 8192 */
+ ctx->Const.MaxArrayTextureLayers = brw->gen >= 7 ? 2048 : 512;
+ ctx->Const.MaxTextureMbytes = 1536;
+ ctx->Const.MaxTextureRectSize = 1 << 12;
+ ctx->Const.MaxTextureMaxAnisotropy = 16.0;
ctx->Const.StripTextureBorder = true;
+ if (brw->gen >= 7)
+ ctx->Const.MaxProgramTextureGatherComponents = 4;
+ else if (brw->gen == 6)
+ ctx->Const.MaxProgramTextureGatherComponents = 1;
ctx->Const.MaxUniformBlockSize = 65536;
+
for (int i = 0; i < MESA_SHADER_STAGES; i++) {
struct gl_program_constants *prog = &ctx->Const.Program[i];
+
+ if (!stage_exists[i])
+ continue;
+
+ prog->MaxTextureImageUnits = max_samplers;
+
prog->MaxUniformBlocks = BRW_MAX_UBO;
prog->MaxCombinedUniformComponents =
prog->MaxUniformComponents +
ctx->Const.MaxUniformBlockSize / 4 * prog->MaxUniformBlocks;
+
+ prog->MaxAtomicCounters = MAX_ATOMIC_COUNTERS;
+ prog->MaxAtomicBuffers = BRW_MAX_ABO;
+ prog->MaxImageUniforms = compiler->scalar_stage[i] ? BRW_MAX_IMAGES : 0;
+ prog->MaxShaderStorageBlocks = BRW_MAX_SSBO;
}
- ctx->Const.MaxDualSourceDrawBuffers = 1;
- ctx->Const.MaxDrawBuffers = BRW_MAX_DRAW_BUFFERS;
- ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits = max_samplers;
- ctx->Const.MaxTextureCoordUnits = 8; /* Mesa limit */
ctx->Const.MaxTextureUnits =
MIN2(ctx->Const.MaxTextureCoordUnits,
ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits);
- ctx->Const.Program[MESA_SHADER_VERTEX].MaxTextureImageUnits = max_samplers;
- if (brw->gen >= 6)
- ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxTextureImageUnits = max_samplers;
- else
- ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxTextureImageUnits = 0;
- if (_mesa_extension_override_enables.ARB_compute_shader) {
- ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits = BRW_MAX_TEX_UNIT;
- ctx->Const.MaxUniformBufferBindings += BRW_MAX_UBO;
- } else {
- ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits = 0;
- }
- ctx->Const.MaxCombinedTextureImageUnits =
- ctx->Const.Program[MESA_SHADER_VERTEX].MaxTextureImageUnits +
- ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits +
- ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxTextureImageUnits +
- ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits;
-
- ctx->Const.MaxTextureLevels = 14; /* 8192 */
- if (ctx->Const.MaxTextureLevels > MAX_TEXTURE_LEVELS)
- ctx->Const.MaxTextureLevels = MAX_TEXTURE_LEVELS;
- ctx->Const.Max3DTextureLevels = 12; /* 2048 */
- ctx->Const.MaxCubeTextureLevels = 14; /* 8192 */
- ctx->Const.MaxTextureMbytes = 1536;
-
- if (brw->gen >= 7)
- ctx->Const.MaxArrayTextureLayers = 2048;
- else
- ctx->Const.MaxArrayTextureLayers = 512;
- ctx->Const.MaxTextureRectSize = 1 << 12;
+ ctx->Const.MaxUniformBufferBindings = num_stages * BRW_MAX_UBO;
+ ctx->Const.MaxCombinedUniformBlocks = num_stages * BRW_MAX_UBO;
+ ctx->Const.MaxCombinedAtomicBuffers = num_stages * BRW_MAX_ABO;
+ ctx->Const.MaxCombinedShaderStorageBlocks = num_stages * BRW_MAX_SSBO;
+ ctx->Const.MaxShaderStorageBufferBindings = num_stages * BRW_MAX_SSBO;
+ ctx->Const.MaxCombinedTextureImageUnits = num_stages * max_samplers;
+ ctx->Const.MaxCombinedImageUniforms = num_stages * BRW_MAX_IMAGES;
- ctx->Const.MaxTextureMaxAnisotropy = 16.0;
-
- ctx->Const.MaxRenderbufferSize = 8192;
/* Hardware only supports a limited number of transform feedback buffers.
* So we need to override the Mesa default (which is based only on software
@@ -427,6 +445,7 @@ brw_initialize_context_constants(struct brw_context *brw)
ctx->Const.MaxColorTextureSamples = max_samples;
ctx->Const.MaxDepthTextureSamples = max_samples;
ctx->Const.MaxIntegerSamples = max_samples;
+ ctx->Const.MaxImageSamples = 0;
/* gen6_set_sample_maps() sets SampleMap{2,4,8}x variables which are used
* to map indices of rectangular grid to sample numbers within a pixel.
@@ -436,11 +455,6 @@ brw_initialize_context_constants(struct brw_context *brw)
*/
gen6_set_sample_maps(ctx);
- if (brw->gen >= 7)
- ctx->Const.MaxProgramTextureGatherComponents = 4;
- else if (brw->gen == 6)
- ctx->Const.MaxProgramTextureGatherComponents = 1;
-
ctx->Const.MinLineWidth = 1.0;
ctx->Const.MinLineWidthAA = 1.0;
if (brw->gen >= 6) {
@@ -511,30 +525,6 @@ brw_initialize_context_constants(struct brw_context *brw)
ctx->Const.Program[MESA_SHADER_VERTEX].HighInt = ctx->Const.Program[MESA_SHADER_VERTEX].LowInt;
ctx->Const.Program[MESA_SHADER_VERTEX].MediumInt = ctx->Const.Program[MESA_SHADER_VERTEX].LowInt;
- if (brw->gen >= 7) {
- ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicCounters = MAX_ATOMIC_COUNTERS;
- ctx->Const.Program[MESA_SHADER_VERTEX].MaxAtomicCounters = MAX_ATOMIC_COUNTERS;
- ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicCounters = MAX_ATOMIC_COUNTERS;
- ctx->Const.Program[MESA_SHADER_COMPUTE].MaxAtomicCounters = MAX_ATOMIC_COUNTERS;
- ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicBuffers = BRW_MAX_ABO;
- ctx->Const.Program[MESA_SHADER_VERTEX].MaxAtomicBuffers = BRW_MAX_ABO;
- ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicBuffers = BRW_MAX_ABO;
- ctx->Const.Program[MESA_SHADER_COMPUTE].MaxAtomicBuffers = BRW_MAX_ABO;
- ctx->Const.MaxCombinedAtomicBuffers = 3 * BRW_MAX_ABO;
-
- ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxImageUniforms =
- BRW_MAX_IMAGES;
- ctx->Const.Program[MESA_SHADER_VERTEX].MaxImageUniforms =
- (brw->intelScreen->compiler->scalar_vs ? BRW_MAX_IMAGES : 0);
- ctx->Const.Program[MESA_SHADER_COMPUTE].MaxImageUniforms =
- BRW_MAX_IMAGES;
- ctx->Const.MaxImageUnits = MAX_IMAGE_UNITS;
- ctx->Const.MaxCombinedShaderOutputResources =
- MAX_IMAGE_UNITS + BRW_MAX_DRAW_BUFFERS;
- ctx->Const.MaxImageSamples = 0;
- ctx->Const.MaxCombinedImageUniforms = 3 * BRW_MAX_IMAGES;
- }
-
/* Gen6 converts quads to polygon in beginning of 3D pipeline,
* but we're not sure how it's actually done for vertex order,
* that affect provoking vertex decision. Always use last vertex
@@ -586,21 +576,6 @@ brw_initialize_context_constants(struct brw_context *brw)
ctx->Const.TextureBufferOffsetAlignment = 16;
ctx->Const.MaxTextureBufferSize = 128 * 1024 * 1024;
- /* FIXME: Tessellation stages are not yet supported in i965, so
- * MaxCombinedShaderStorageBlocks doesn't take them into account.
- */
- ctx->Const.Program[MESA_SHADER_VERTEX].MaxShaderStorageBlocks = BRW_MAX_SSBO;
- ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxShaderStorageBlocks = BRW_MAX_SSBO;
- ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxShaderStorageBlocks = 0;
- ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxShaderStorageBlocks = 0;
- ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxShaderStorageBlocks = BRW_MAX_SSBO;
- ctx->Const.Program[MESA_SHADER_COMPUTE].MaxShaderStorageBlocks = BRW_MAX_SSBO;
- ctx->Const.MaxCombinedShaderStorageBlocks = BRW_MAX_SSBO * 3;
- ctx->Const.MaxShaderStorageBufferBindings = BRW_MAX_SSBO * 3;
-
- if (_mesa_extension_override_enables.ARB_compute_shader)
- ctx->Const.MaxShaderStorageBufferBindings += BRW_MAX_SSBO;
-
if (brw->gen >= 6) {
ctx->Const.MaxVarying = 32;
ctx->Const.Program[MESA_SHADER_VERTEX].MaxOutputComponents = 128;
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 4b2db61c758..fe45edb89ff 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -523,6 +523,8 @@ struct brw_tracked_state {
enum shader_time_shader_type {
ST_NONE,
ST_VS,
+ ST_TCS,
+ ST_TES,
ST_GS,
ST_FS8,
ST_FS16,
@@ -1465,6 +1467,8 @@ void brw_upload_image_surfaces(struct brw_context *brw,
/* brw_surface_formats.c */
bool brw_render_target_supported(struct brw_context *brw,
struct gl_renderbuffer *rb);
+bool brw_losslessly_compressible_format(struct brw_context *brw,
+ uint32_t brw_format);
uint32_t brw_depth_format(struct brw_context *brw, mesa_format format);
mesa_format brw_lower_mesa_image_format(const struct brw_device_info *devinfo,
mesa_format format);
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 3ad90da8b2f..36d9f716e03 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1291,6 +1291,16 @@ enum opcode {
* Calculate the high 32-bits of a 32x32 multiply.
*/
SHADER_OPCODE_MULH,
+
+ /**
+ * A MOV that uses VxH indirect addressing.
+ *
+ * Source 0: A register to start from (HW_REG).
+ * Source 1: An indirect offset (in bytes, UD GRF).
+ * Source 2: The length of the region that could be accessed (in bytes,
+ * UD immediate).
+ */
+ SHADER_OPCODE_MOV_INDIRECT,
};
enum brw_urb_write_flags {
@@ -1930,8 +1940,14 @@ enum brw_message_target {
/* Gen7 "GS URB Entry Allocation Size" is a U9-1 field, so the maximum gs_size
* is 2^9, or 512. It's counted in multiples of 64 bytes.
+ *
+ * Identical for VS, DS, and HS.
*/
#define GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES (512*64)
+#define GEN7_MAX_DS_URB_ENTRY_SIZE_BYTES (512*64)
+#define GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES (512*64)
+#define GEN7_MAX_VS_URB_ENTRY_SIZE_BYTES (512*64)
+
/* Gen6 "GS URB Entry Allocation Size" is defined as a number of 1024-bit
* (128 bytes) URB rows and the maximum allowed value is 5 rows.
*/
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 250d4097e38..419168966de 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -43,6 +43,7 @@
#include "brw_wm.h"
#include "brw_fs.h"
#include "brw_cs.h"
+#include "brw_nir.h"
#include "brw_vec4_gs_visitor.h"
#include "brw_cfg.h"
#include "brw_dead_control_flow.h"
@@ -186,7 +187,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
* the redundant ones.
*/
fs_reg vec4_offset = vgrf(glsl_type::int_type);
- bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
+ bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~3));
int scale = 1;
if (devinfo->gen == 4 && bld.dispatch_width() == 8) {
@@ -374,54 +375,6 @@ fs_reg::fs_reg()
this->file = BAD_FILE;
}
-/** Immediate value constructor. */
-fs_reg::fs_reg(float f)
-{
- init();
- this->file = IMM;
- this->type = BRW_REGISTER_TYPE_F;
- this->stride = 0;
- this->f = f;
-}
-
-/** Immediate value constructor. */
-fs_reg::fs_reg(int32_t i)
-{
- init();
- this->file = IMM;
- this->type = BRW_REGISTER_TYPE_D;
- this->stride = 0;
- this->d = i;
-}
-
-/** Immediate value constructor. */
-fs_reg::fs_reg(uint32_t u)
-{
- init();
- this->file = IMM;
- this->type = BRW_REGISTER_TYPE_UD;
- this->stride = 0;
- this->ud = u;
-}
-
-/** Vector float immediate value constructor. */
-fs_reg::fs_reg(uint8_t vf[4])
-{
- init();
- this->file = IMM;
- this->type = BRW_REGISTER_TYPE_VF;
- memcpy(&this->ud, vf, sizeof(unsigned));
-}
-
-/** Vector float immediate value constructor. */
-fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
-{
- init();
- this->file = IMM;
- this->type = BRW_REGISTER_TYPE_VF;
- this->ud = (vf0 << 0) | (vf1 << 8) | (vf2 << 16) | (vf3 << 24);
-}
-
fs_reg::fs_reg(struct brw_reg reg) :
backend_reg(reg)
{
@@ -591,7 +544,7 @@ fs_visitor::emit_shader_time_end()
fs_reg reset = shader_end_time;
reset.set_smear(2);
set_condmod(BRW_CONDITIONAL_Z,
- ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
+ ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
ibld.IF(BRW_PREDICATE_NORMAL);
fs_reg start = shader_start_time;
@@ -606,11 +559,11 @@ fs_visitor::emit_shader_time_end()
* is 2 cycles. Remove that overhead, so I can forget about that when
* trying to determine the time taken for single instructions.
*/
- cbld.ADD(diff, diff, fs_reg(-2u));
+ cbld.ADD(diff, diff, brw_imm_ud(-2u));
SHADER_TIME_ADD(cbld, 0, diff);
- SHADER_TIME_ADD(cbld, 1, fs_reg(1u));
+ SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u));
ibld.emit(BRW_OPCODE_ELSE);
- SHADER_TIME_ADD(cbld, 2, fs_reg(1u));
+ SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u));
ibld.emit(BRW_OPCODE_ENDIF);
}
@@ -620,7 +573,7 @@ fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
fs_reg value)
{
int index = shader_time_index * 3 + shader_time_subindex;
- fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);
+ struct brw_reg offset = brw_imm_d(index * SHADER_TIME_STRIDE);
fs_reg payload;
if (dispatch_width == 8)
@@ -841,6 +794,34 @@ fs_inst::regs_read(int arg) const
case SHADER_OPCODE_BARRIER:
return 1;
+ case SHADER_OPCODE_MOV_INDIRECT:
+ if (arg == 0) {
+ assert(src[2].file == IMM);
+ unsigned region_length = src[2].ud;
+
+ if (src[0].file == FIXED_GRF) {
+ /* If the start of the region is not register aligned, then
+ * there's some portion of the register that's technically
+ * unread at the beginning.
+ *
+ * However, the register allocator works in terms of whole
+ * registers, and does not use subnr. It assumes that the
+ * read starts at the beginning of the register, and extends
+ * regs_read() whole registers beyond that.
+ *
+ * To compensate, we extend the region length to include this
+ * unread portion at the beginning.
+ */
+ if (src[0].subnr)
+ region_length += src[0].subnr * type_sz(src[0].type);
+
+ return DIV_ROUND_UP(region_length, REG_SIZE);
+ } else {
+ assert(!"Invalid register file");
+ }
+ }
+ break;
+
default:
if (is_tex() && arg == 0 && src[0].file == VGRF)
return mlen;
@@ -1005,7 +986,7 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
if (pixel_center_integer) {
bld.MOV(wpos, this->pixel_x);
} else {
- bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
+ bld.ADD(wpos, this->pixel_x, brw_imm_f(0.5f));
}
wpos = offset(wpos, bld, 1);
@@ -1021,7 +1002,7 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
offset += key->drawable_height - 1.0f;
}
- bld.ADD(wpos, pixel_y, fs_reg(offset));
+ bld.ADD(wpos, pixel_y, brw_imm_f(offset));
}
wpos = offset(wpos, bld, 1);
@@ -1198,7 +1179,7 @@ fs_visitor::emit_frontfacing_interpolation()
fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
g0.negate = true;
- bld.ASR(*reg, g0, fs_reg(15));
+ bld.ASR(*reg, g0, brw_imm_d(15));
} else {
/* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
* a boolean result from this (1/true or 0/false).
@@ -1213,7 +1194,7 @@ fs_visitor::emit_frontfacing_interpolation()
fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
g1_6.negate = true;
- bld.ASR(*reg, g1_6, fs_reg(31));
+ bld.ASR(*reg, g1_6, brw_imm_d(31));
}
return reg;
@@ -1230,7 +1211,7 @@ fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
/* Convert int_sample_pos to floating point */
bld.MOV(dst, int_sample_pos);
/* Scale to the range [0, 1] */
- bld.MUL(dst, dst, fs_reg(1 / 16.0f));
+ bld.MUL(dst, dst, brw_imm_f(1 / 16.0f));
}
else {
/* From ARB_sample_shading specification:
@@ -1238,7 +1219,7 @@ fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
* rasterization is disabled, gl_SamplePosition will always be
* (0.5, 0.5).
*/
- bld.MOV(dst, fs_reg(0.5f));
+ bld.MOV(dst, brw_imm_f(0.5f));
}
}
@@ -1333,8 +1314,8 @@ fs_visitor::emit_sampleid_setup()
abld.exec_all().group(1, 0)
.AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
- fs_reg(sspi_mask));
- abld.exec_all().group(1, 0).SHR(t1, t1, fs_reg(5));
+ brw_imm_ud(sspi_mask));
+ abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
/* This works for both SIMD8 and SIMD16 */
abld.exec_all().group(4, 0)
@@ -1349,7 +1330,7 @@ fs_visitor::emit_sampleid_setup()
* "When rendering to a non-multisample buffer, or if multisample
* rasterization is disabled, gl_SampleID will always be zero."
*/
- abld.MOV(*reg, fs_reg(0));
+ abld.MOV(*reg, brw_imm_d(0));
}
return reg;
@@ -1662,24 +1643,7 @@ fs_visitor::assign_gs_urb_setup()
first_non_payload_grf +=
8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
- const unsigned first_icp_handle = payload.num_regs -
- (vue_prog_data->include_vue_handles ? nir->info.gs.vertices_in : 0);
-
foreach_block_and_inst(block, fs_inst, inst, cfg) {
- /* Lower URB_READ_SIMD8 opcodes into real messages. */
- if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8) {
- assert(inst->src[0].file == IMM);
- inst->src[0] = retype(brw_vec8_grf(first_icp_handle +
- inst->src[0].ud,
- 0), BRW_REGISTER_TYPE_UD);
- /* for now, assume constant - we can do per-slot offsets later */
- assert(inst->src[1].file == IMM);
- inst->offset = inst->src[1].ud;
- inst->src[1] = fs_reg();
- inst->mlen = 1;
- inst->base_mrf = -1;
- }
-
/* Rewrite all ATTR file references to GRFs. */
convert_attr_sources_to_hw_regs(inst);
}
@@ -2037,16 +2001,16 @@ fs_visitor::demote_pull_constants()
/* Generate a pull load into dst. */
if (inst->src[i].reladdr) {
VARYING_PULL_CONSTANT_LOAD(ibld, dst,
- fs_reg(index),
+ brw_imm_ud(index),
*inst->src[i].reladdr,
pull_index);
inst->src[i].reladdr = NULL;
inst->src[i].stride = 1;
} else {
const fs_builder ubld = ibld.exec_all().group(8, 0);
- fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
+ struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
- dst, fs_reg(index), offset);
+ dst, brw_imm_ud(index), offset);
inst->src[i].set_smear(pull_index & 3);
}
brw_mark_surface_used(prog_data, index);
@@ -2738,7 +2702,7 @@ fs_visitor::eliminate_find_live_channel()
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
if (depth == 0) {
inst->opcode = BRW_OPCODE_MOV;
- inst->src[0] = fs_reg(0u);
+ inst->src[0] = brw_imm_ud(0u);
inst->sources = 1;
inst->force_writemask_all = true;
progress = true;
@@ -3591,6 +3555,12 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
assert(devinfo->gen >= 9);
assert(bld.dispatch_width() != 16);
+ /* XXX: src_stencil is only available on gen9+. dst_depth is never
+ * available on gen9+. As such it's impossible to have both enabled at the
+ * same time and therefore length cannot overrun the array.
+ */
+ assert(length < 15);
+
sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.exec_all().annotate("FB write OS")
.emit(FS_OPCODE_PACK_STENCIL_REF, sources[length],
@@ -3660,7 +3630,7 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
(has_lod || shadow_c.file != BAD_FILE ||
(op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
for (unsigned i = coord_components; i < 3; i++)
- bld.MOV(offset(msg_end, bld, i), fs_reg(0.0f));
+ bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));
msg_end = offset(msg_end, bld, 3 - coord_components);
}
@@ -3717,7 +3687,7 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
/* There's no plain shadow compare message, so we use shadow
* compare with a bias of 0.0.
*/
- bld.MOV(msg_end, fs_reg(0.0f));
+ bld.MOV(msg_end, brw_imm_f(0.0f));
msg_end = offset(msg_end, bld, 1);
}
@@ -3813,7 +3783,7 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
case SHADER_OPCODE_TXF_CMS:
msg_lod = offset(msg_coords, bld, 3);
/* lod */
- bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
+ bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
/* sample index */
bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
msg_end = offset(msg_lod, bld, 2);
@@ -3896,7 +3866,7 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
if (bld.shader->stage != MESA_SHADER_FRAGMENT &&
op == SHADER_OPCODE_TEX) {
op = SHADER_OPCODE_TXL;
- lod = fs_reg(0.0f);
+ lod = brw_imm_f(0.0f);
}
/* Set up the LOD info */
@@ -4110,7 +4080,7 @@ emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
{
fs_builder ubld = bld.exec_all().group(8, 0);
const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
- ubld.MOV(dst, fs_reg(0));
+ ubld.MOV(dst, brw_imm_d(0));
ubld.MOV(component(dst, 7), sample_mask);
return dst;
}
@@ -4252,7 +4222,7 @@ fs_visitor::lower_logical_sends()
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
lower_surface_logical_send(ibld, inst,
SHADER_OPCODE_TYPED_SURFACE_READ,
- fs_reg(0xffff));
+ brw_imm_d(0xffff));
break;
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
@@ -4677,6 +4647,8 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
case IMM:
unreachable("not reached");
}
+ if (inst->dst.stride != 1)
+ fprintf(file, "<%u>", inst->dst.stride);
fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
for (int i = 0; i < inst->sources; i++) {
@@ -4764,6 +4736,16 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
fprintf(file, "|");
if (inst->src[i].file != IMM) {
+ unsigned stride;
+ if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
+ unsigned hstride = inst->src[i].hstride;
+ stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
+ } else {
+ stride = inst->src[i].stride;
+ }
+ if (stride != 1)
+ fprintf(file, "<%u>", stride);
+
fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
}
@@ -5241,7 +5223,7 @@ fs_visitor::run_gs()
*/
if (gs_compile->control_data_header_size_bits <= 32) {
const fs_builder abld = bld.annotate("initialize control data bits");
- abld.MOV(this->control_data_bits, fs_reg(0u));
+ abld.MOV(this->control_data_bits, brw_imm_ud(0u));
}
}
@@ -5474,13 +5456,18 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
void *mem_ctx,
const struct brw_wm_prog_key *key,
struct brw_wm_prog_data *prog_data,
- const nir_shader *shader,
+ const nir_shader *src_shader,
struct gl_program *prog,
int shader_time_index8, int shader_time_index16,
bool use_rep_send,
unsigned *final_assembly_size,
char **error_str)
{
+ nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+ shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
+ true);
+ shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+
/* key->alpha_test_func means simulating alpha testing via discards,
* so the shader definitely kills pixels.
*/
@@ -5633,11 +5620,16 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
void *mem_ctx,
const struct brw_cs_prog_key *key,
struct brw_cs_prog_data *prog_data,
- const nir_shader *shader,
+ const nir_shader *src_shader,
int shader_time_index,
unsigned *final_assembly_size,
char **error_str)
{
+ nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+ shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
+ true);
+ shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+
prog_data->local_size[0] = shader->info.cs.local_size[0];
prog_data->local_size[1] = shader->info.cs.local_size[1];
prog_data->local_size[2] = shader->info.cs.local_size[2];
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 9b56afd292f..658608f9951 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -116,10 +116,6 @@ public:
void setup_uniform_clipplane_values(gl_clip_plane *clip_planes);
void compute_clip_distance(gl_clip_plane *clip_planes);
- uint32_t gather_channel(int orig_chan, uint32_t surface, uint32_t sampler);
- void swizzle_result(ir_texture_opcode op, int dest_components,
- fs_reg orig_val, uint32_t sampler);
-
fs_inst *get_instruction_generating_reg(fs_inst *start,
fs_inst *end,
const fs_reg &reg);
@@ -218,8 +214,6 @@ public:
void emit_interpolation_setup_gen4();
void emit_interpolation_setup_gen6();
void compute_sample_position(fs_reg dst, fs_reg int_sample_pos);
- fs_reg rescale_texcoord(fs_reg coordinate, int coord_components,
- bool is_rect, uint32_t sampler);
void emit_texture(ir_texture_opcode op,
const glsl_type *dest_type,
fs_reg coordinate, int components,
@@ -230,7 +224,6 @@ public:
fs_reg mcs,
int gather_component,
bool is_cube_array,
- bool is_rect,
uint32_t surface,
fs_reg surface_reg,
uint32_t sampler,
@@ -305,7 +298,8 @@ public:
unsigned stream_id);
void emit_gs_thread_end();
void emit_gs_input_load(const fs_reg &dst, const nir_src &vertex_src,
- unsigned offset, unsigned num_components);
+ const fs_reg &indirect_offset, unsigned imm_offset,
+ unsigned num_components);
void emit_cs_terminate();
fs_reg *emit_cs_local_invocation_id_setup();
fs_reg *emit_cs_work_group_id_setup();
@@ -530,6 +524,11 @@ private:
struct brw_reg offset,
struct brw_reg value);
+ void generate_mov_indirect(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg reg,
+ struct brw_reg indirect_byte_offset);
+
bool patch_discard_jumps_to_fb_writes();
const struct brw_compiler *compiler;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index 22b2f22073f..dd3c383a17d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -225,7 +225,7 @@ namespace brw {
sample_mask_reg() const
{
if (shader->stage != MESA_SHADER_FRAGMENT) {
- return src_reg(0xffff);
+ return brw_imm_d(0xffff);
} else if (((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill) {
return brw_flag_reg(0, 1);
} else {
@@ -548,7 +548,7 @@ namespace brw {
const dst_reg x_times_one_minus_a = vgrf(dst.type);
MUL(y_times_a, y, a);
- ADD(one_minus_a, negate(a), src_reg(1.0f));
+ ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
index 8fdc959f992..7c01f1e3d62 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
@@ -90,7 +90,8 @@ opt_cmod_propagation_local(bblock_t *block)
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (scan_inst->overwrites_reg(inst->src[0])) {
if (scan_inst->is_partial_write() ||
- scan_inst->dst.reg_offset != inst->src[0].reg_offset)
+ scan_inst->dst.reg_offset != inst->src[0].reg_offset ||
+ scan_inst->exec_size != inst->exec_size)
break;
/* CMP's result is the same regardless of dest type. */
diff --git a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
index 0c115f50748..c3ad7ad4771 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
@@ -279,7 +279,7 @@ fs_visitor::opt_combine_constants()
imm->block->last_non_control_flow_inst()->next);
const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0);
- ibld.MOV(reg, fs_reg(imm->val));
+ ibld.MOV(reg, brw_imm_f(imm->val));
imm->nr = reg.nr;
imm->subreg_offset = reg.subreg_offset;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index 426ea57d8f9..62ae9abede7 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -275,6 +275,59 @@ is_logic_op(enum opcode opcode)
opcode == BRW_OPCODE_NOT);
}
+static bool
+can_take_stride(fs_inst *inst, unsigned arg, unsigned stride,
+ const brw_device_info *devinfo)
+{
+ if (stride > 4)
+ return false;
+
+ /* 3-source instructions can only be Align16, which restricts what strides
+ * they can take. They can only take a stride of 1 (the usual case), or 0
+ * with a special "repctrl" bit. But the repctrl bit doesn't work for
+ * 64-bit datatypes, so if the source type is 64-bit then only a stride of
+ * 1 is allowed. From the Broadwell PRM, Volume 7 "3D Media GPGPU", page
+ * 944:
+ *
+ * This is applicable to 32b datatypes and 16b datatype. 64b datatypes
+ * cannot use the replicate control.
+ */
+ if (inst->is_3src()) {
+ if (type_sz(inst->src[arg].type) > 4)
+ return stride == 1;
+ else
+ return stride == 1 || stride == 0;
+ }
+
+ /* From the Broadwell PRM, Volume 2a "Command Reference - Instructions",
+ * page 391 ("Extended Math Function"):
+ *
+ * The following restrictions apply for align1 mode: Scalar source is
+ * supported. Source and destination horizontal stride must be the
+ * same.
+ *
+ * From the Haswell PRM Volume 2b "Command Reference - Instructions", page
+ * 134 ("Extended Math Function"):
+ *
+ * Scalar source is supported. Source and destination horizontal stride
+ * must be 1.
+ *
+ * and similar language exists for IVB and SNB. Pre-SNB, math instructions
+ * are sends, so the sources are moved to MRF's and there are no
+ * restrictions.
+ */
+ if (inst->is_math()) {
+ if (devinfo->gen == 6 || devinfo->gen == 7) {
+ assert(inst->dst.stride == 1);
+ return stride == 1 || stride == 0;
+ } else if (devinfo->gen >= 8) {
+ return stride == inst->dst.stride || stride == 0;
+ }
+ }
+
+ return true;
+}
+
bool
fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
{
@@ -326,7 +379,8 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
/* Bail if the result of composing both strides would exceed the
* hardware limit.
*/
- if (entry->src.stride * inst->src[arg].stride > 4)
+ if (!can_take_stride(inst, arg, entry->src.stride * inst->src[arg].stride,
+ devinfo))
return false;
/* Bail if the instruction type is larger than the execution type of the
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index 8c67caff6e0..3b65a382dc8 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -78,6 +78,7 @@ is_expression(const fs_visitor *v, const fs_inst *const inst)
case FS_OPCODE_LINTERP:
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
case SHADER_OPCODE_BROADCAST:
+ case SHADER_OPCODE_MOV_INDIRECT:
return true;
case SHADER_OPCODE_RCP:
case SHADER_OPCODE_RSQ:
@@ -209,6 +210,8 @@ create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
copy = bld.LOAD_PAYLOAD(inst->dst, payload, sources, header_size);
} else {
copy = bld.MOV(inst->dst, src);
+ copy->force_sechalf = inst->force_sechalf;
+ copy->force_writemask_all = inst->force_writemask_all;
copy->src[0].negate = negate;
}
assert(copy->regs_written == written);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 9d7fb94c397..8528f391941 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -89,39 +89,9 @@ brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen)
brw_reg.abs = reg->abs;
brw_reg.negate = reg->negate;
break;
- case IMM:
- assert(reg->stride == ((reg->type == BRW_REGISTER_TYPE_V ||
- reg->type == BRW_REGISTER_TYPE_UV ||
- reg->type == BRW_REGISTER_TYPE_VF) ? 1 : 0));
-
- switch (reg->type) {
- case BRW_REGISTER_TYPE_F:
- brw_reg = brw_imm_f(reg->f);
- break;
- case BRW_REGISTER_TYPE_D:
- brw_reg = brw_imm_d(reg->d);
- break;
- case BRW_REGISTER_TYPE_UD:
- brw_reg = brw_imm_ud(reg->ud);
- break;
- case BRW_REGISTER_TYPE_W:
- brw_reg = brw_imm_w(reg->d);
- break;
- case BRW_REGISTER_TYPE_UW:
- brw_reg = brw_imm_uw(reg->ud);
- break;
- case BRW_REGISTER_TYPE_VF:
- brw_reg = brw_imm_vf(reg->ud);
- break;
- case BRW_REGISTER_TYPE_V:
- brw_reg = brw_imm_v(reg->ud);
- break;
- default:
- unreachable("not reached");
- }
- break;
case ARF:
case FIXED_GRF:
+ case IMM:
brw_reg = *static_cast<struct brw_reg *>(reg);
break;
case BAD_FILE:
@@ -372,6 +342,36 @@ fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
}
void
+fs_generator::generate_mov_indirect(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg reg,
+ struct brw_reg indirect_byte_offset)
+{
+ assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
+ assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
+
+ unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
+
+ /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
+ struct brw_reg addr = vec8(brw_address_reg(0));
+
+ /* The destination stride of an instruction (in bytes) must be greater
+ * than or equal to the size of the rest of the instruction. Since the
+ * address register is of type UW, we can't use a D-type instruction.
+ * In order to get around this, we re-type to UW and use a stride.
+ */
+ indirect_byte_offset =
+ retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
+
+ /* Prior to Broadwell, there are only 8 address registers. */
+ assert(inst->exec_size == 8 || devinfo->gen >= 8);
+
+ brw_MOV(p, addr, indirect_byte_offset);
+ brw_inst_set_mask_control(devinfo, brw_last_inst, BRW_MASK_DISABLE);
+ brw_MOV(p, dst, retype(brw_VxH_indirect(0, imm_byte_offset), dst.type));
+}
+
+void
fs_generator::generate_urb_read(fs_inst *inst,
struct brw_reg dst,
struct brw_reg header)
@@ -700,6 +700,17 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
break;
}
+ /* Stomp the resinfo output type to UINT32. On gens 4-5, the output type
+ * is set as part of the message descriptor. On gen4, the PRM seems to
+ * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
+ * later gens UINT32 is required. Once you hit Sandy Bridge, the bit is
+ * gone from the message descriptor entirely and you just get UINT32 all
+ * the time regardless. Since we can really only do non-UINT32 on gen4,
+ * just stomp it to UINT32 all the time.
+ */
+ if (inst->opcode == SHADER_OPCODE_TXS)
+ return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
+
switch (inst->exec_size) {
case 8:
simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
@@ -2087,6 +2098,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
fill_count++;
break;
+ case SHADER_OPCODE_MOV_INDIRECT:
+ generate_mov_indirect(inst, dst, src[0], src[1]);
+ break;
+
case SHADER_OPCODE_URB_READ_SIMD8:
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
generate_urb_read(inst, dst, src[0]);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 3a666b8debc..6b0c4a5b36e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -208,7 +208,7 @@ emit_system_values_block(nir_block *block, void *void_visitor)
const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
- abld.SHR(iid, g1, fs_reg(27u));
+ abld.SHR(iid, g1, brw_imm_ud(27u));
*reg = iid;
}
break;
@@ -250,6 +250,57 @@ emit_system_values_block(nir_block *block, void *void_visitor)
*reg = *v->emit_cs_work_group_id_setup();
break;
+ case nir_intrinsic_load_helper_invocation:
+ assert(v->stage == MESA_SHADER_FRAGMENT);
+ reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
+ if (reg->file == BAD_FILE) {
+ const fs_builder abld =
+ v->bld.annotate("gl_HelperInvocation", NULL);
+
+ /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the
+ * pixel mask is in g1.7 of the thread payload.
+ *
+ * We move the per-channel pixel enable bit to the low bit of each
+ * channel by shifting the byte containing the pixel mask by the
+ * vector immediate 0x76543210UV.
+ *
+ * The region of <1,8,0> reads only 1 byte (the pixel masks for
+ * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
+ * masks for 2 and 3) in SIMD16.
+ */
+ fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
+ abld.SHR(shifted,
+ stride(byte_offset(retype(brw_vec1_grf(1, 0),
+ BRW_REGISTER_TYPE_UB), 28),
+ 1, 8, 0),
+ brw_imm_uv(0x76543210));
+
+ /* A set bit in the pixel mask means the channel is enabled, but
+ * that is the opposite of gl_HelperInvocation so we need to invert
+ * the mask.
+ *
+ * The negate source-modifier bit of logical instructions on Gen8+
+ * performs 1's complement negation, so we can use that instead of
+ * a NOT instruction.
+ */
+ fs_reg inverted = negate(shifted);
+ if (v->devinfo->gen < 8) {
+ inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
+ abld.NOT(inverted, shifted);
+ }
+
+ /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
+ * with 1 and negating.
+ */
+ fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ abld.AND(anded, inverted, brw_imm_uw(1));
+
+ fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
+ abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
+ *reg = dst;
+ }
+ break;
+
default:
break;
}
@@ -454,8 +505,7 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
tmp.subreg_offset = 2;
tmp.stride = 2;
- fs_inst *or_inst = bld.OR(tmp, g0, fs_reg(0x3f80));
- or_inst->src[1].type = BRW_REGISTER_TYPE_UW;
+ bld.OR(tmp, g0, brw_imm_uw(0x3f80));
tmp.type = BRW_REGISTER_TYPE_D;
tmp.subreg_offset = 0;
@@ -479,9 +529,9 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
g1_6.negate = true;
}
- bld.OR(tmp, g1_6, fs_reg(0x3f800000));
+ bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
}
- bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, fs_reg(0xbf800000));
+ bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
return true;
}
@@ -594,14 +644,14 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
* Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
* zero.
*/
- bld.CMP(bld.null_reg_f(), op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ);
+ bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
op[0].type = BRW_REGISTER_TYPE_UD;
result.type = BRW_REGISTER_TYPE_UD;
- bld.AND(result_int, op[0], fs_reg(0x80000000u));
+ bld.AND(result_int, op[0], brw_imm_ud(0x80000000u));
- inst = bld.OR(result_int, result_int, fs_reg(0x3f800000u));
+ inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
inst->predicate = BRW_PREDICATE_NORMAL;
if (instr->dest.saturate) {
inst = bld.MOV(result, result);
@@ -615,9 +665,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
* -> non-negative val generates 0x00000000.
* Predicated OR sets 1 if val is positive.
*/
- bld.CMP(bld.null_reg_d(), op[0], fs_reg(0), BRW_CONDITIONAL_G);
- bld.ASR(result, op[0], fs_reg(31));
- inst = bld.OR(result, result, fs_reg(1));
+ bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G);
+ bld.ASR(result, op[0], brw_imm_d(31));
+ inst = bld.OR(result, result, brw_imm_d(1));
inst->predicate = BRW_PREDICATE_NORMAL;
break;
@@ -665,21 +715,21 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
case nir_op_fddy:
if (fs_key->high_quality_derivatives) {
inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0],
- fs_reg(fs_key->render_to_fbo));
+ brw_imm_d(fs_key->render_to_fbo));
} else {
inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0],
- fs_reg(fs_key->render_to_fbo));
+ brw_imm_d(fs_key->render_to_fbo));
}
inst->saturate = instr->dest.saturate;
break;
case nir_op_fddy_fine:
inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0],
- fs_reg(fs_key->render_to_fbo));
+ brw_imm_d(fs_key->render_to_fbo));
inst->saturate = instr->dest.saturate;
break;
case nir_op_fddy_coarse:
inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0],
- fs_reg(fs_key->render_to_fbo));
+ brw_imm_d(fs_key->render_to_fbo));
inst->saturate = instr->dest.saturate;
break;
@@ -828,10 +878,10 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
break;
case nir_op_f2b:
- bld.CMP(result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ);
+ bld.CMP(result, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
break;
case nir_op_i2b:
- bld.CMP(result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ);
+ bld.CMP(result, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
break;
case nir_op_ftrunc:
@@ -931,9 +981,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
* from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
* subtract the result from 31 to convert the MSB count into an LSB count.
*/
- bld.CMP(bld.null_reg_d(), result, fs_reg(-1), BRW_CONDITIONAL_NZ);
+ bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
- inst = bld.ADD(result, result, fs_reg(31));
+ inst = bld.ADD(result, result, brw_imm_d(31));
inst->predicate = BRW_PREDICATE_NORMAL;
inst->src[0].negate = true;
break;
@@ -986,7 +1036,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
if (optimize_frontfacing_ternary(instr, result))
return;
- bld.CMP(bld.null_reg_d(), op[0], fs_reg(0), BRW_CONDITIONAL_NZ);
+ bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
inst = bld.SEL(result, op[1], op[2]);
inst->predicate = BRW_PREDICATE_NORMAL;
break;
@@ -1001,7 +1051,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
if (devinfo->gen <= 5 &&
(instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
fs_reg masked = vgrf(glsl_type::int_type);
- bld.AND(masked, result, fs_reg(1));
+ bld.AND(masked, result, brw_imm_d(1));
masked.negate = true;
bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
}
@@ -1014,7 +1064,7 @@ fs_visitor::nir_emit_load_const(const fs_builder &bld,
fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_D, instr->def.num_components);
for (unsigned i = 0; i < instr->def.num_components; i++)
- bld.MOV(offset(reg, bld, i), fs_reg(instr->value.i[i]));
+ bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i[i]));
nir_ssa_values[instr->def.index] = reg;
}
@@ -1042,7 +1092,7 @@ fs_reg_for_nir_reg(fs_visitor *v, nir_register *nir_reg,
reg.reladdr = new(v->mem_ctx) fs_reg(v->vgrf(glsl_type::int_type));
v->bld.MUL(*reg.reladdr, v->get_nir_src(*indirect),
- fs_reg(multiplier));
+ brw_imm_d(multiplier));
}
return reg;
@@ -1108,12 +1158,12 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
*/
bld.emit_minmax(tmp, retype(get_nir_src(deref_array->indirect),
BRW_REGISTER_TYPE_UD),
- fs_reg(size - base - 1), BRW_CONDITIONAL_L);
+ brw_imm_ud(size - base - 1), BRW_CONDITIONAL_L);
} else {
bld.MOV(tmp, get_nir_src(deref_array->indirect));
}
- bld.MUL(tmp, tmp, fs_reg(element_size));
+ bld.MUL(tmp, tmp, brw_imm_ud(element_size));
if (image.reladdr)
bld.ADD(*image.reladdr, *image.reladdr, tmp);
else
@@ -1232,7 +1282,7 @@ intexp2(const fs_builder &bld, const fs_reg &x)
fs_reg result = bld.vgrf(x.type, 1);
fs_reg one = bld.vgrf(x.type, 1);
- bld.MOV(one, retype(fs_reg(1), one.type));
+ bld.MOV(one, retype(brw_imm_d(1), one.type));
bld.SHL(result, one, x);
return result;
}
@@ -1285,7 +1335,7 @@ fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
/* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
- abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu));
+ abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
fs_reg mask = intexp2(abld, prev_count);
/* Note: we're relying on the fact that the GEN SHL instruction only pays
* attention to the lower 5 bits of its second source argument, so on this
@@ -1356,26 +1406,26 @@ fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
- abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu));
+ abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
unsigned log2_bits_per_vertex =
_mesa_fls(gs_compile->control_data_bits_per_vertex);
- abld.SHR(dword_index, prev_count, fs_reg(6u - log2_bits_per_vertex));
+ abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
if (per_slot_offset.file != BAD_FILE) {
/* Set the per-slot offset to dword_index / 4, so that we'll write to
* the appropriate OWord within the control data header.
*/
- abld.SHR(per_slot_offset, dword_index, fs_reg(2u));
+ abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
}
/* Set the channel masks to 1 << (dword_index % 4), so that we'll
* write to the appropriate DWORD within the OWORD.
*/
fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
- fwa_bld.AND(channel, dword_index, fs_reg(3u));
+ fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
channel_mask = intexp2(fwa_bld, channel);
/* Then the channel masks need to be in bits 23:16. */
- fwa_bld.SHL(channel_mask, channel_mask, fs_reg(16u));
+ fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
}
/* Store the control data bits in the message payload and send it. */
@@ -1435,11 +1485,11 @@ fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
/* reg::sid = stream_id */
fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
- abld.MOV(sid, fs_reg(stream_id));
+ abld.MOV(sid, brw_imm_ud(stream_id));
/* reg:shift_count = 2 * (vertex_count - 1) */
fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
- abld.SHL(shift_count, vertex_count, fs_reg(1u));
+ abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
/* Note: we're relying on the fact that the GEN SHL instruction only pays
* attention to the lower 5 bits of its second source argument, so on this
@@ -1510,14 +1560,14 @@ fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
*/
fs_inst *inst =
abld.AND(bld.null_reg_d(), vertex_count,
- fs_reg(32u / gs_compile->control_data_bits_per_vertex - 1u));
+ brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
inst->conditional_mod = BRW_CONDITIONAL_Z;
abld.IF(BRW_PREDICATE_NORMAL);
/* If vertex_count is 0, then no control data bits have been
* accumulated yet, so we can skip emitting them.
*/
- abld.CMP(bld.null_reg_d(), vertex_count, fs_reg(0u),
+ abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
BRW_CONDITIONAL_NEQ);
abld.IF(BRW_PREDICATE_NORMAL);
emit_gs_control_data_bits(vertex_count);
@@ -1530,7 +1580,7 @@ fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
* effect of any call to EndPrimitive() that the shader may have
* made before outputting its first vertex.
*/
- inst = abld.MOV(this->control_data_bits, fs_reg(0u));
+ inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
inst->force_writemask_all = true;
abld.emit(BRW_OPCODE_ENDIF);
}
@@ -1551,42 +1601,113 @@ fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
void
fs_visitor::emit_gs_input_load(const fs_reg &dst,
const nir_src &vertex_src,
- unsigned input_offset,
+ const fs_reg &indirect_offset,
+ unsigned imm_offset,
unsigned num_components)
{
- const brw_vue_prog_data *vue_prog_data = (const brw_vue_prog_data *) prog_data;
- const unsigned vertex = nir_src_as_const_value(vertex_src)->u[0];
+ struct brw_gs_prog_data *gs_prog_data = (struct brw_gs_prog_data *) prog_data;
- const unsigned array_stride = vue_prog_data->urb_read_length * 8;
+ /* Offset 0 is the VUE header, which contains VARYING_SLOT_LAYER [.y],
+ * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w]. Only
+ * gl_PointSize is available as a GS input, however, so it must be that.
+ */
+ const bool is_point_size =
+ indirect_offset.file == BAD_FILE && imm_offset == 0;
+
+ nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
+ const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
+
+ if (indirect_offset.file == BAD_FILE && vertex_const != NULL &&
+ 4 * imm_offset < push_reg_count) {
+ imm_offset = 4 * imm_offset + vertex_const->u[0] * push_reg_count;
+ /* This input was pushed into registers. */
+ if (is_point_size) {
+ /* gl_PointSize comes in .w */
+ bld.MOV(dst, fs_reg(ATTR, imm_offset + 3, dst.type));
+ } else {
+ for (unsigned i = 0; i < num_components; i++) {
+ bld.MOV(offset(dst, bld, i),
+ fs_reg(ATTR, imm_offset + i, dst.type));
+ }
+ }
+ } else {
+ /* Resort to the pull model. Ensure the VUE handles are provided. */
+ gs_prog_data->base.include_vue_handles = true;
- const bool pushed = 4 * input_offset < array_stride;
+ unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
+ fs_reg icp_handle;
- if (input_offset == 0) {
- /* This is the VUE header, containing VARYING_SLOT_LAYER [.y],
- * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].
- * Only gl_PointSize is available as a GS input, so they must
- * be asking for that input.
- */
- if (pushed) {
- bld.MOV(dst, fs_reg(ATTR, array_stride * vertex + 3, dst.type));
+ if (vertex_const) {
+ /* The vertex index is constant; just select the proper URB handle. */
+ icp_handle =
+ retype(brw_vec8_grf(first_icp_handle + vertex_const->i[0], 0),
+ BRW_REGISTER_TYPE_UD);
} else {
- fs_reg tmp = bld.vgrf(dst.type, 4);
- fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
- fs_reg(vertex), fs_reg(0));
- inst->regs_written = 4;
- bld.MOV(dst, offset(tmp, bld, 3));
+ /* The vertex index is non-constant. We need to use indirect
+ * addressing to fetch the proper URB handle.
+ *
+ * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
+ * indicating that channel <n> should read the handle from
+ * DWord <n>. We convert that to bytes by multiplying by 4.
+ *
+ * Next, we convert the vertex index to bytes by multiplying
+ * by 32 (shifting by 5), and add the two together. This is
+ * the final indirect byte offset.
+ */
+ fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_W, 1);
+ fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+ /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
+ bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
+ /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
+ bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
+ /* Convert vertex_index to bytes (multiply by 32) */
+ bld.SHL(vertex_offset_bytes,
+ retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(5u));
+ bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
+
+ /* Use first_icp_handle as the base offset. There is one register
+ * of URB handles per vertex, so inform the register allocator that
+ * we might read up to nir->info.gs.vertices_in registers.
+ */
+ bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
+ fs_reg(brw_vec8_grf(first_icp_handle, 0)),
+ fs_reg(icp_offset_bytes),
+ brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE));
}
- } else {
- if (pushed) {
- int index = vertex * array_stride + 4 * input_offset;
- for (unsigned i = 0; i < num_components; i++) {
- bld.MOV(offset(dst, bld, i), fs_reg(ATTR, index + i, dst.type));
- }
+
+ fs_inst *inst;
+ if (indirect_offset.file == BAD_FILE) {
+ /* Constant indexing - use global offset. */
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
+ inst->offset = imm_offset;
+ inst->base_mrf = -1;
+ inst->mlen = 1;
+ inst->regs_written = num_components;
} else {
- fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
- fs_reg(vertex), fs_reg(input_offset));
+ /* Indirect indexing - use per-slot offsets as well. */
+ const fs_reg srcs[] = { icp_handle, indirect_offset };
+ fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+ bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload);
+ inst->offset = imm_offset;
+ inst->base_mrf = -1;
+ inst->mlen = 2;
inst->regs_written = num_components;
}
+
+ if (is_point_size) {
+ /* Read the whole VUE header (because of alignment) and read .w. */
+ fs_reg tmp = bld.vgrf(dst.type, 4);
+ inst->dst = tmp;
+ inst->regs_written = 4;
+ bld.MOV(dst, offset(tmp, bld, 3));
+ }
}
}
@@ -1626,6 +1747,7 @@ fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
nir_intrinsic_instr *instr)
{
assert(stage == MESA_SHADER_GEOMETRY);
+ fs_reg indirect_offset;
fs_reg dest;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
@@ -1644,9 +1766,11 @@ fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
unreachable("load_input intrinsics are invalid for the GS stage");
case nir_intrinsic_load_per_vertex_input_indirect:
- assert(!"Not allowed");
+ indirect_offset = retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_D);
+ /* fallthrough */
case nir_intrinsic_load_per_vertex_input:
- emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
+ emit_gs_input_load(dest, instr->src[0],
+ indirect_offset, instr->const_index[0],
instr->num_components);
break;
@@ -1703,6 +1827,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
break;
}
+ case nir_intrinsic_load_helper_invocation:
case nir_intrinsic_load_sample_mask_in:
case nir_intrinsic_load_sample_id: {
gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
@@ -1723,7 +1848,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
fs_inst *cmp;
if (instr->intrinsic == nir_intrinsic_discard_if) {
cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
- fs_reg(0), BRW_CONDITIONAL_Z);
+ brw_imm_d(0), BRW_CONDITIONAL_Z);
} else {
fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
BRW_REGISTER_TYPE_UW));
@@ -1771,7 +1896,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
FS_OPCODE_INTERPOLATE_AT_CENTROID,
dst_xy,
fs_reg(), /* src */
- fs_reg(0u),
+ brw_imm_ud(0u),
interpolation);
break;
@@ -1785,7 +1910,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
FS_OPCODE_INTERPOLATE_AT_SAMPLE,
dst_xy,
fs_reg(), /* src */
- fs_reg(msg_data),
+ brw_imm_ud(msg_data),
interpolation);
} else {
const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
@@ -1794,7 +1919,8 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
if (nir_src_is_dynamically_uniform(instr->src[0])) {
const fs_reg sample_id = bld.emit_uniformize(sample_src);
const fs_reg msg_data = vgrf(glsl_type::uint_type);
- bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u));
+ bld.exec_all().group(1, 0)
+ .SHL(msg_data, sample_id, brw_imm_ud(4u));
emit_pixel_interpolater_send(bld,
FS_OPCODE_INTERPOLATE_AT_SAMPLE,
dst_xy,
@@ -1820,7 +1946,8 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
sample_src, sample_id,
BRW_CONDITIONAL_EQ);
const fs_reg msg_data = vgrf(glsl_type::uint_type);
- bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u));
+ bld.exec_all().group(1, 0)
+ .SHL(msg_data, sample_id, brw_imm_ud(4u));
fs_inst *inst =
emit_pixel_interpolater_send(bld,
FS_OPCODE_INTERPOLATE_AT_SAMPLE,
@@ -1851,7 +1978,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
dst_xy,
fs_reg(), /* src */
- fs_reg(off_x | (off_y << 4)),
+ brw_imm_ud(off_x | (off_y << 4)),
interpolation);
} else {
fs_reg src = vgrf(glsl_type::ivec2_type);
@@ -1859,7 +1986,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
BRW_REGISTER_TYPE_F);
for (int i = 0; i < 2; i++) {
fs_reg temp = vgrf(glsl_type::float_type);
- bld.MUL(temp, offset(offset_src, bld, i), fs_reg(16.0f));
+ bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
fs_reg itemp = vgrf(glsl_type::int_type);
bld.MOV(itemp, temp); /* float to int */
@@ -1879,7 +2006,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
* FRAGMENT_INTERPOLATION_OFFSET_BITS"
*/
set_condmod(BRW_CONDITIONAL_L,
- bld.SEL(offset(src, bld, i), itemp, fs_reg(7)));
+ bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
}
const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
@@ -1887,7 +2014,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
opcode,
dst_xy,
src,
- fs_reg(0u),
+ brw_imm_ud(0u),
interpolation);
}
break;
@@ -1947,14 +2074,14 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
cs_prog_data->uses_num_work_groups = true;
- fs_reg surf_index = fs_reg(surface);
+ fs_reg surf_index = brw_imm_ud(surface);
brw_mark_surface_used(prog_data, surface);
/* Read the 3 GLuint components of gl_NumWorkGroups */
for (unsigned i = 0; i < 3; i++) {
fs_reg read_result =
emit_untyped_read(bld, surf_index,
- fs_reg(i << 2),
+ brw_imm_ud(i << 2),
1 /* dims */, 1 /* size */,
BRW_PREDICATE_NONE);
read_result.type = dest.type;
@@ -1994,16 +2121,16 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
/* Emit a surface read or atomic op. */
switch (instr->intrinsic) {
case nir_intrinsic_atomic_counter_read:
- tmp = emit_untyped_read(bld, fs_reg(surface), offset, 1, 1);
+ tmp = emit_untyped_read(bld, brw_imm_ud(surface), offset, 1, 1);
break;
case nir_intrinsic_atomic_counter_inc:
- tmp = emit_untyped_atomic(bld, fs_reg(surface), offset, fs_reg(),
+ tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, fs_reg(),
fs_reg(), 1, 1, BRW_AOP_INC);
break;
case nir_intrinsic_atomic_counter_dec:
- tmp = emit_untyped_atomic(bld, fs_reg(surface), offset, fs_reg(),
+ tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, fs_reg(),
fs_reg(), 1, 1, BRW_AOP_PREDEC);
break;
@@ -2145,14 +2272,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
for (unsigned c = 0; c < info->dest_components; ++c) {
if ((int)c >= type->coordinate_components()) {
bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
- fs_reg(1));
+ brw_imm_d(1));
} else if (c == 1 && is_1d_array_image) {
bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
offset(size, bld, 2));
} else if (c == 2 && is_cube_array_image) {
bld.emit(SHADER_OPCODE_INT_QUOTIENT,
offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
- offset(size, bld, c), fs_reg(6));
+ offset(size, bld, c), brw_imm_d(6));
} else {
bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
offset(size, bld, c));
@@ -2164,7 +2291,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
case nir_intrinsic_image_samples:
/* The driver does not support multi-sampled images. */
- bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), fs_reg(1));
+ bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
break;
case nir_intrinsic_load_uniform_indirect:
@@ -2195,7 +2322,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
if (const_index) {
const unsigned index = stage_prog_data->binding_table.ubo_start +
const_index->u[0];
- surf_index = fs_reg(index);
+ surf_index = brw_imm_ud(index);
brw_mark_surface_used(prog_data, index);
} else {
/* The block index is not a constant. Evaluate the index expression
@@ -2204,7 +2331,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
*/
surf_index = vgrf(glsl_type::uint_type);
bld.ADD(surf_index, get_nir_src(instr->src[0]),
- fs_reg(stage_prog_data->binding_table.ubo_start));
+ brw_imm_ud(stage_prog_data->binding_table.ubo_start));
surf_index = bld.emit_uniformize(surf_index);
/* Assume this may touch any UBO. It would be nice to provide
@@ -2220,7 +2347,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
fs_reg base_offset = vgrf(glsl_type::int_type);
bld.SHR(base_offset, retype(get_nir_src(instr->src[1]),
BRW_REGISTER_TYPE_D),
- fs_reg(2));
+ brw_imm_d(2));
unsigned vec4_offset = instr->const_index[0] / 4;
for (int i = 0; i < instr->num_components; i++)
@@ -2230,7 +2357,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
fs_reg packed_consts = vgrf(glsl_type::float_type);
packed_consts.type = dest.type;
- fs_reg const_offset_reg((unsigned) instr->const_index[0] & ~15);
+ struct brw_reg const_offset_reg = brw_imm_ud(instr->const_index[0] & ~15);
bld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
surf_index, const_offset_reg);
@@ -2262,12 +2389,12 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
if (const_uniform_block) {
unsigned index = stage_prog_data->binding_table.ssbo_start +
const_uniform_block->u[0];
- surf_index = fs_reg(index);
+ surf_index = brw_imm_ud(index);
brw_mark_surface_used(prog_data, index);
} else {
surf_index = vgrf(glsl_type::uint_type);
bld.ADD(surf_index, get_nir_src(instr->src[0]),
- fs_reg(stage_prog_data->binding_table.ssbo_start));
+ brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
/* Assume this may touch any UBO. It would be nice to provide
* a tighter bound, but the array information is already lowered away.
@@ -2282,7 +2409,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
if (has_indirect) {
offset_reg = get_nir_src(instr->src[1]);
} else {
- offset_reg = fs_reg(instr->const_index[0]);
+ offset_reg = brw_imm_ud(instr->const_index[0]);
}
/* Read the vector */
@@ -2333,12 +2460,12 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
if (const_uniform_block) {
unsigned index = stage_prog_data->binding_table.ssbo_start +
const_uniform_block->u[0];
- surf_index = fs_reg(index);
+ surf_index = brw_imm_ud(index);
brw_mark_surface_used(prog_data, index);
} else {
surf_index = vgrf(glsl_type::uint_type);
bld.ADD(surf_index, get_nir_src(instr->src[1]),
- fs_reg(stage_prog_data->binding_table.ssbo_start));
+ brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
brw_mark_surface_used(prog_data,
stage_prog_data->binding_table.ssbo_start +
@@ -2362,12 +2489,12 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
fs_reg offset_reg;
if (!has_indirect) {
- offset_reg = fs_reg(instr->const_index[0] + 4 * first_component);
+ offset_reg = brw_imm_ud(instr->const_index[0] + 4 * first_component);
} else {
offset_reg = vgrf(glsl_type::uint_type);
bld.ADD(offset_reg,
retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD),
- fs_reg(4 * first_component));
+ brw_imm_ud(4 * first_component));
}
emit_untyped_write(bld, surf_index, offset_reg,
@@ -2438,7 +2565,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
int reg_width = dispatch_width / 8;
/* Set LOD = 0 */
- fs_reg source = fs_reg(0);
+ fs_reg source = brw_imm_d(0);
int mlen = 1 * reg_width;
@@ -2457,7 +2584,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
BRW_REGISTER_TYPE_UD);
const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
fs_inst *inst = bld.emit(FS_OPCODE_GET_BUFFER_SIZE, buffer_size,
- src_payload, fs_reg(index));
+ src_payload, brw_imm_ud(index));
inst->header_size = 0;
inst->mlen = mlen;
inst->regs_written = regs_written;
@@ -2486,12 +2613,12 @@ fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
if (const_surface) {
unsigned surf_index = stage_prog_data->binding_table.ssbo_start +
const_surface->u[0];
- surface = fs_reg(surf_index);
+ surface = brw_imm_ud(surf_index);
brw_mark_surface_used(prog_data, surf_index);
} else {
surface = vgrf(glsl_type::uint_type);
bld.ADD(surface, get_nir_src(instr->src[0]),
- fs_reg(stage_prog_data->binding_table.ssbo_start));
+ brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
/* Assume this may touch any SSBO. This is the same we do for other
* UBO/SSBO accesses with non-constant surface.
@@ -2524,13 +2651,11 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
{
unsigned texture = instr->texture_index;
unsigned sampler = instr->sampler_index;
- fs_reg texture_reg(texture);
- fs_reg sampler_reg(sampler);
+ fs_reg texture_reg(brw_imm_ud(texture));
+ fs_reg sampler_reg(brw_imm_ud(sampler));
int gather_component = instr->component;
- bool is_rect = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT;
-
bool is_cube_array = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
instr->is_array;
@@ -2552,6 +2677,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
switch (instr->op) {
case nir_texop_txf:
case nir_texop_txf_ms:
+ case nir_texop_samples_identical:
coordinate = retype(src, BRW_REGISTER_TYPE_D);
break;
default:
@@ -2604,7 +2730,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
/* Emit code to evaluate the actual indexing expression */
texture_reg = vgrf(glsl_type::uint_type);
- bld.ADD(texture_reg, src, fs_reg(texture));
+ bld.ADD(texture_reg, src, brw_imm_ud(texture));
texture_reg = bld.emit_uniformize(texture_reg);
break;
}
@@ -2612,7 +2738,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
case nir_tex_src_sampler_offset: {
/* Emit code to evaluate the actual indexing expression */
sampler_reg = vgrf(glsl_type::uint_type);
- bld.ADD(sampler_reg, src, fs_reg(sampler));
+ bld.ADD(sampler_reg, src, brw_imm_ud(sampler));
sampler_reg = bld.emit_uniformize(sampler_reg);
break;
}
@@ -2622,19 +2748,20 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
}
}
- if (instr->op == nir_texop_txf_ms) {
+ if (instr->op == nir_texop_txf_ms ||
+ instr->op == nir_texop_samples_identical) {
if (devinfo->gen >= 7 &&
key_tex->compressed_multisample_layout_mask & (1 << texture)) {
mcs = emit_mcs_fetch(coordinate, instr->coord_components, texture_reg);
} else {
- mcs = fs_reg(0u);
+ mcs = brw_imm_ud(0u);
}
}
for (unsigned i = 0; i < 3; i++) {
if (instr->const_offset[i] != 0) {
assert(offset_components == 0);
- tex_offset = fs_reg(brw_texture_offset(instr->const_offset, 3));
+ tex_offset = brw_imm_ud(brw_texture_offset(instr->const_offset, 3));
break;
}
}
@@ -2668,6 +2795,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
inst->base_mrf = -1;
return;
}
+ case nir_texop_samples_identical: op = ir_samples_identical; break;
default:
unreachable("unknown texture opcode");
}
@@ -2675,8 +2803,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
emit_texture(op, dest_type, coordinate, instr->coord_components,
shadow_comparitor, lod, lod2, lod_components, sample_index,
tex_offset, mcs, gather_component,
- is_cube_array, is_rect,
- texture, texture_reg, sampler, sampler_reg);
+ is_cube_array, texture, texture_reg, sampler, sampler_reg);
fs_reg dest = get_nir_dest(instr->dest);
dest.type = this->result.type;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
index 534d8490cdf..45694ec0894 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
@@ -44,7 +44,7 @@ namespace brw {
*/
const fs_reg usurface = bld.emit_uniformize(surface);
const fs_reg srcs[] = {
- addr, src, usurface, fs_reg(dims), fs_reg(arg)
+ addr, src, usurface, brw_imm_ud(dims), brw_imm_ud(arg)
};
const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
@@ -330,7 +330,7 @@ namespace {
* messages causes a hang on IVB and VLV.
*/
set_predicate(pred,
- bld.CMP(bld.null_reg_ud(), stride, fs_reg(4),
+ bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
BRW_CONDITIONAL_G));
return BRW_PREDICATE_NORMAL;
@@ -361,7 +361,7 @@ namespace {
*/
bld.CMP(bld.null_reg_ud(),
retype(size, BRW_REGISTER_TYPE_UD),
- fs_reg(0), BRW_CONDITIONAL_NZ);
+ brw_imm_d(0), BRW_CONDITIONAL_NZ);
return BRW_PREDICATE_NORMAL;
} else {
@@ -438,7 +438,7 @@ namespace {
* FINISHME: Factor out this frequently recurring pattern into a
* helper function.
*/
- const fs_reg srcs[] = { addr, fs_reg(0), offset(addr, bld, 1) };
+ const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) };
const fs_reg dst = bld.vgrf(addr.type, dims);
bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
return dst;
@@ -488,7 +488,7 @@ namespace {
bld.ADD(offset(addr, bld, c), offset(off, bld, c),
(c < dims ?
offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
- fs_reg(0)));
+ fs_reg(brw_imm_d(0))));
/* The layout of 3-D textures in memory is sort-of like a tiling
* format. At each miplevel, the slices are arranged in rows of
@@ -515,7 +515,7 @@ namespace {
/* Decompose z into a major (tmp.y) and a minor (tmp.x)
* index.
*/
- bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), fs_reg(0),
+ bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0),
offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
bld.SHR(offset(tmp, bld, 1),
offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
@@ -549,7 +549,7 @@ namespace {
for (unsigned c = 0; c < 2; ++c) {
/* Calculate the minor x and y indices. */
bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
- fs_reg(0), offset(addr, bld, c));
+ brw_imm_d(0), offset(addr, bld, c));
/* Calculate the major x and y indices. */
bld.SHR(offset(major, bld, c),
@@ -595,7 +595,7 @@ namespace {
/* XOR tmp.x and tmp.y with bit 6 of the memory address. */
bld.XOR(tmp, tmp, offset(tmp, bld, 1));
- bld.AND(tmp, tmp, fs_reg(1 << 6));
+ bld.AND(tmp, tmp, brw_imm_d(1 << 6));
bld.XOR(dst, dst, tmp);
}
@@ -647,7 +647,7 @@ namespace {
const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
/* Shift each component left to the correct bitfield position. */
- bld.SHL(tmp, offset(src, bld, c), fs_reg(shifts[c] % 32));
+ bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32));
/* Add everything up. */
if (seen[shifts[c] / 32]) {
@@ -679,13 +679,13 @@ namespace {
/* Shift left to discard the most significant bits. */
bld.SHL(offset(dst, bld, c),
offset(src, bld, shifts[c] / 32),
- fs_reg(32 - shifts[c] % 32 - widths[c]));
+ brw_imm_ud(32 - shifts[c] % 32 - widths[c]));
/* Shift back to the least significant bits using an arithmetic
* shift to get sign extension on signed types.
*/
bld.ASR(offset(dst, bld, c),
- offset(dst, bld, c), fs_reg(32 - widths[c]));
+ offset(dst, bld, c), brw_imm_ud(32 - widths[c]));
}
}
@@ -709,13 +709,13 @@ namespace {
if (widths[c]) {
/* Clamp to the maximum value. */
bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c),
- fs_reg((int)scale(widths[c] - s)),
+ brw_imm_d((int)scale(widths[c] - s)),
BRW_CONDITIONAL_L);
/* Clamp to the minimum value. */
if (is_signed)
bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
- fs_reg(-(int)scale(widths[c] - s) - 1),
+ brw_imm_d(-(int)scale(widths[c] - s) - 1),
BRW_CONDITIONAL_GE);
}
}
@@ -741,12 +741,12 @@ namespace {
/* Divide by the normalization constants. */
bld.MUL(offset(dst, bld, c), offset(dst, bld, c),
- fs_reg(1.0f / scale(widths[c] - s)));
+ brw_imm_f(1.0f / scale(widths[c] - s)));
/* Clamp to the minimum value. */
if (is_signed)
bld.emit_minmax(offset(dst, bld, c),
- offset(dst, bld, c), fs_reg(-1.0f),
+ offset(dst, bld, c), brw_imm_f(-1.0f),
BRW_CONDITIONAL_GE);
}
}
@@ -771,10 +771,10 @@ namespace {
/* Clamp the normalized floating-point argument. */
if (is_signed) {
bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
- fs_reg(-1.0f), BRW_CONDITIONAL_GE);
+ brw_imm_f(-1.0f), BRW_CONDITIONAL_GE);
bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
- fs_reg(1.0f), BRW_CONDITIONAL_L);
+ brw_imm_f(1.0f), BRW_CONDITIONAL_L);
} else {
set_saturate(true, bld.MOV(offset(fdst, bld, c),
offset(src, bld, c)));
@@ -782,7 +782,7 @@ namespace {
/* Multiply by the normalization constants. */
bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c),
- fs_reg((float)scale(widths[c] - s)));
+ brw_imm_f((float)scale(widths[c] - s)));
/* Convert to integer. */
bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
@@ -814,7 +814,7 @@ namespace {
*/
if (widths[c] < 16)
bld.SHL(offset(dst, bld, c),
- offset(dst, bld, c), fs_reg(15 - widths[c]));
+ offset(dst, bld, c), brw_imm_ud(15 - widths[c]));
/* Convert to 32-bit floating point. */
bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c));
@@ -842,7 +842,7 @@ namespace {
/* Clamp to the minimum value. */
if (widths[c] < 16)
bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
- fs_reg(0.0f), BRW_CONDITIONAL_GE);
+ brw_imm_f(0.0f), BRW_CONDITIONAL_GE);
/* Convert to 16-bit floating-point. */
bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));
@@ -855,7 +855,7 @@ namespace {
*/
if (widths[c] < 16)
bld.SHR(offset(dst, bld, c), offset(dst, bld, c),
- fs_reg(15 - widths[c]));
+ brw_imm_ud(15 - widths[c]));
}
}
@@ -874,7 +874,8 @@ namespace {
for (unsigned c = 0; c < 4; ++c)
bld.MOV(offset(dst, bld, c),
- widths[c] ? offset(src, bld, c) : fs_reg(pad[c]));
+ widths[c] ? offset(src, bld, c)
+ : fs_reg(brw_imm_ud(pad[c])));
return dst;
}
@@ -939,7 +940,7 @@ namespace brw {
/* An out of bounds surface access should give zero as result. */
for (unsigned c = 0; c < size; ++c)
set_predicate(pred, bld.SEL(offset(tmp, bld, c),
- offset(tmp, bld, c), fs_reg(0)));
+ offset(tmp, bld, c), brw_imm_d(0)));
}
/* Set the register type to D instead of UD if the data type is
@@ -1122,7 +1123,7 @@ namespace brw {
/* An unbound surface access should give zero as result. */
if (rsize)
- set_predicate(pred, bld.SEL(tmp, tmp, fs_reg(0)));
+ set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0)));
return tmp;
}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 2647a40c730..e82acd141f3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -79,122 +79,6 @@ fs_visitor::emit_vs_system_value(int location)
return reg;
}
-fs_reg
-fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
- bool is_rect, uint32_t sampler)
-{
- bool needs_gl_clamp = true;
- fs_reg scale_x, scale_y;
-
- /* The 965 requires the EU to do the normalization of GL rectangle
- * texture coordinates. We use the program parameter state
- * tracking to get the scaling factor.
- */
- if (is_rect &&
- (devinfo->gen < 6 ||
- (devinfo->gen >= 6 && (key_tex->gl_clamp_mask[0] & (1 << sampler) ||
- key_tex->gl_clamp_mask[1] & (1 << sampler))))) {
- struct gl_program_parameter_list *params = prog->Parameters;
-
-
- /* FINISHME: We're failing to recompile our programs when the sampler is
- * updated. This only matters for the texture rectangle scale
- * parameters (pre-gen6, or gen6+ with GL_CLAMP).
- */
- int tokens[STATE_LENGTH] = {
- STATE_INTERNAL,
- STATE_TEXRECT_SCALE,
- prog->SamplerUnits[sampler],
- 0,
- 0
- };
-
- no16("rectangle scale uniform setup not supported on SIMD16\n");
- if (dispatch_width == 16) {
- return coordinate;
- }
-
- GLuint index = _mesa_add_state_reference(params,
- (gl_state_index *)tokens);
- /* Try to find existing copies of the texrect scale uniforms. */
- for (unsigned i = 0; i < uniforms; i++) {
- if (stage_prog_data->param[i] ==
- &prog->Parameters->ParameterValues[index][0]) {
- scale_x = fs_reg(UNIFORM, i);
- scale_y = fs_reg(UNIFORM, i + 1);
- break;
- }
- }
-
- /* If we didn't already set them up, do so now. */
- if (scale_x.file == BAD_FILE) {
- scale_x = fs_reg(UNIFORM, uniforms);
- scale_y = fs_reg(UNIFORM, uniforms + 1);
-
- stage_prog_data->param[uniforms++] =
- &prog->Parameters->ParameterValues[index][0];
- stage_prog_data->param[uniforms++] =
- &prog->Parameters->ParameterValues[index][1];
- }
- }
-
- /* The 965 requires the EU to do the normalization of GL rectangle
- * texture coordinates. We use the program parameter state
- * tracking to get the scaling factor.
- */
- if (devinfo->gen < 6 && is_rect) {
- fs_reg dst = fs_reg(VGRF, alloc.allocate(coord_components));
- fs_reg src = coordinate;
- coordinate = dst;
-
- bld.MUL(dst, src, scale_x);
- dst = offset(dst, bld, 1);
- src = offset(src, bld, 1);
- bld.MUL(dst, src, scale_y);
- } else if (is_rect) {
- /* On gen6+, the sampler handles the rectangle coordinates
- * natively, without needing rescaling. But that means we have
- * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
- * not [0, 1] like the default case below.
- */
- needs_gl_clamp = false;
-
- for (int i = 0; i < 2; i++) {
- if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
- fs_reg chan = coordinate;
- chan = offset(chan, bld, i);
-
- set_condmod(BRW_CONDITIONAL_GE,
- bld.emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f)));
-
- /* Our parameter comes in as 1.0/width or 1.0/height,
- * because that's what people normally want for doing
- * texture rectangle handling. We need width or height
- * for clamping, but we don't care enough to make a new
- * parameter type, so just invert back.
- */
- fs_reg limit = vgrf(glsl_type::float_type);
- bld.MOV(limit, i == 0 ? scale_x : scale_y);
- bld.emit(SHADER_OPCODE_RCP, limit, limit);
-
- set_condmod(BRW_CONDITIONAL_L,
- bld.emit(BRW_OPCODE_SEL, chan, chan, limit));
- }
- }
- }
-
- if (coord_components > 0 && needs_gl_clamp) {
- for (int i = 0; i < MIN2(coord_components, 3); i++) {
- if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
- fs_reg chan = coordinate;
- chan = offset(chan, bld, i);
- set_saturate(true, bld.MOV(chan, chan));
- }
- }
- }
- return coordinate;
-}
-
/* Sample from the MCS surface attached to this multisample texture. */
fs_reg
fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
@@ -203,7 +87,7 @@ fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
const fs_reg dest = vgrf(glsl_type::uvec4_type);
const fs_reg srcs[] = {
coordinate, fs_reg(), fs_reg(), fs_reg(), fs_reg(), fs_reg(),
- texture, texture, fs_reg(), fs_reg(components), fs_reg(0)
+ texture, texture, fs_reg(), brw_imm_ud(components), brw_imm_d(0)
};
fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
ARRAY_SIZE(srcs));
@@ -227,7 +111,6 @@ fs_visitor::emit_texture(ir_texture_opcode op,
fs_reg mcs,
int gather_component,
bool is_cube_array,
- bool is_rect,
uint32_t surface,
fs_reg surface_reg,
uint32_t sampler,
@@ -235,38 +118,32 @@ fs_visitor::emit_texture(ir_texture_opcode op,
{
fs_inst *inst = NULL;
- if (op == ir_tg4) {
- /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
- * emitting anything other than setting up the constant result.
- */
- int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
- if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
-
- fs_reg res = vgrf(glsl_type::vec4_type);
- this->result = res;
-
- for (int i=0; i<4; i++) {
- bld.MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f));
- res = offset(res, bld, 1);
- }
- return;
- }
- }
-
if (op == ir_query_levels) {
/* textureQueryLevels() is implemented in terms of TXS so we need to
* pass a valid LOD argument.
*/
assert(lod.file == BAD_FILE);
- lod = fs_reg(0u);
+ lod = brw_imm_ud(0u);
}
- if (coordinate.file != BAD_FILE) {
- /* FINISHME: Texture coordinate rescaling doesn't work with non-constant
- * samplers. This should only be a problem with GL_CLAMP on Gen7.
+ if (op == ir_samples_identical) {
+ fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 1, 1));
+
+ /* If mcs is an immediate value, it means there is no MCS. In that case
+ * just return false.
*/
- coordinate = rescale_texcoord(coordinate, coord_components, is_rect,
- sampler);
+ if (mcs.file == BRW_IMMEDIATE_VALUE) {
+ bld.MOV(dst, brw_imm_ud(0u));
+ } else if ((key_tex->msaa_16 & (1 << sampler))) {
+ fs_reg tmp = vgrf(glsl_type::uint_type);
+ bld.OR(tmp, mcs, offset(mcs, bld, 1));
+ bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
+ } else {
+ bld.CMP(dst, mcs, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
+ }
+
+ this->result = dst;
+ return;
}
/* Writemasking doesn't eliminate channels on SIMD8 texture
@@ -276,7 +153,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
const fs_reg srcs[] = {
coordinate, shadow_c, lod, lod2,
sample_index, mcs, surface_reg, sampler_reg, offset_value,
- fs_reg(coord_components), fs_reg(grad_components)
+ brw_imm_d(coord_components), brw_imm_d(grad_components)
};
enum opcode opcode;
@@ -327,8 +204,15 @@ fs_visitor::emit_texture(ir_texture_opcode op,
inst->offset = offset_value.ud;
if (op == ir_tg4) {
- inst->offset |=
- gather_channel(gather_component, surface, sampler) << 16; /* M0.2:16-17 */
+ if (gather_component == 1 &&
+ key_tex->gather_channel_quirk_mask & (1 << surface)) {
+ /* gather4 sampler is broken for green channel on RG32F --
+ * we must ask for blue instead.
+ */
+ inst->offset |= 2 << 16;
+ } else {
+ inst->offset |= gather_component << 16;
+ }
if (devinfo->gen == 6)
emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], dst);
@@ -338,7 +222,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
if (op == ir_txs && is_cube_array) {
fs_reg depth = offset(dst, bld, 2);
fs_reg fixed_depth = vgrf(glsl_type::int_type);
- bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
+ bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, brw_imm_d(6));
fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
int components = inst->regs_written / (inst->exec_size / 8);
@@ -352,7 +236,12 @@ fs_visitor::emit_texture(ir_texture_opcode op,
bld.LOAD_PAYLOAD(dst, fixed_payload, components, 0);
}
- swizzle_result(op, dest_type->vector_elements, dst, sampler);
+ if (op == ir_query_levels) {
+ /* # levels is in .w */
+ dst = offset(dst, bld, 3);
+ }
+
+ this->result = dst;
}
/**
@@ -369,7 +258,7 @@ fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
for (int i = 0; i < 4; i++) {
fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
/* Convert from UNORM to UINT */
- bld.MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1)));
+ bld.MUL(dst_f, dst_f, brw_imm_f((1 << width) - 1));
bld.MOV(dst, dst_f);
if (wa & WA_SIGN) {
@@ -377,83 +266,14 @@ fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
* shifting the sign bit into place, then shifting back
* preserving sign.
*/
- bld.SHL(dst, dst, fs_reg(32 - width));
- bld.ASR(dst, dst, fs_reg(32 - width));
+ bld.SHL(dst, dst, brw_imm_d(32 - width));
+ bld.ASR(dst, dst, brw_imm_d(32 - width));
}
dst = offset(dst, bld, 1);
}
}
-/**
- * Set up the gather channel based on the swizzle, for gather4.
- */
-uint32_t
-fs_visitor::gather_channel(int orig_chan, uint32_t surface, uint32_t sampler)
-{
- int swiz = GET_SWZ(key_tex->swizzles[sampler], orig_chan);
- switch (swiz) {
- case SWIZZLE_X: return 0;
- case SWIZZLE_Y:
- /* gather4 sampler is broken for green channel on RG32F --
- * we must ask for blue instead.
- */
- if (key_tex->gather_channel_quirk_mask & (1 << surface))
- return 2;
- return 1;
- case SWIZZLE_Z: return 2;
- case SWIZZLE_W: return 3;
- default:
- unreachable("Not reached"); /* zero, one swizzles handled already */
- }
-}
-
-/**
- * Swizzle the result of a texture result. This is necessary for
- * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
- */
-void
-fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
- fs_reg orig_val, uint32_t sampler)
-{
- if (op == ir_query_levels) {
- /* # levels is in .w */
- this->result = offset(orig_val, bld, 3);
- return;
- }
-
- this->result = orig_val;
-
- /* txs,lod don't actually sample the texture, so swizzling the result
- * makes no sense.
- */
- if (op == ir_txs || op == ir_lod || op == ir_tg4)
- return;
-
- if (dest_components == 1) {
- /* Ignore DEPTH_TEXTURE_MODE swizzling. */
- } else if (key_tex->swizzles[sampler] != SWIZZLE_NOOP) {
- fs_reg swizzled_result = vgrf(glsl_type::vec4_type);
- swizzled_result.type = orig_val.type;
-
- for (int i = 0; i < 4; i++) {
- int swiz = GET_SWZ(key_tex->swizzles[sampler], i);
- fs_reg l = swizzled_result;
- l = offset(l, bld, i);
-
- if (swiz == SWIZZLE_ZERO) {
- bld.MOV(l, fs_reg(0.0f));
- } else if (swiz == SWIZZLE_ONE) {
- bld.MOV(l, fs_reg(1.0f));
- } else {
- bld.MOV(l, offset(orig_val, bld,
- GET_SWZ(key_tex->swizzles[sampler], i)));
- }
- }
- this->result = swizzled_result;
- }
-}
-
/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
void
fs_visitor::emit_dummy_fs()
@@ -464,7 +284,7 @@ fs_visitor::emit_dummy_fs()
const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
for (int i = 0; i < 4; i++) {
bld.MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F),
- fs_reg(color[i]));
+ brw_imm_f(color[i]));
}
fs_inst *write;
@@ -683,7 +503,7 @@ fs_visitor::emit_alpha_test()
fs_reg color = offset(outputs[0], bld, 3);
/* f0.1 &= func(color, ref) */
- cmp = abld.CMP(bld.null_reg_f(), color, fs_reg(key->alpha_test_ref),
+ cmp = abld.CMP(bld.null_reg_f(), color, brw_imm_f(key->alpha_test_ref),
cond_for_alpha_func(key->alpha_test_func));
}
cmp->predicate = BRW_PREDICATE_NORMAL;
@@ -716,7 +536,7 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
const fs_reg sources[] = {
color0, color1, src0_alpha, src_depth, dst_depth, src_stencil,
- sample_mask, fs_reg(components)
+ sample_mask, brw_imm_ud(components)
};
assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS);
fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(),
@@ -950,12 +770,12 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
fs_reg offset;
if (gs_vertex_count.file == IMM) {
- per_slot_offsets = fs_reg(output_vertex_size_owords *
- gs_vertex_count.ud);
+ per_slot_offsets = brw_imm_ud(output_vertex_size_owords *
+ gs_vertex_count.ud);
} else {
per_slot_offsets = vgrf(glsl_type::int_type);
bld.MUL(per_slot_offsets, gs_vertex_count,
- fs_reg(output_vertex_size_owords));
+ brw_imm_ud(output_vertex_size_owords));
}
}
@@ -978,7 +798,7 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
}
fs_reg zero(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
- bld.MOV(zero, fs_reg(0u));
+ bld.MOV(zero, brw_imm_ud(0u));
sources[length++] = zero;
if (vue_map->slots_valid & VARYING_BIT_LAYER)
@@ -1038,7 +858,7 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
for (unsigned i = 0; i < output_components[varying]; i++)
sources[length++] = offset(this->outputs[varying], bld, i);
for (unsigned i = output_components[varying]; i < 4; i++)
- sources[length++] = fs_reg(0);
+ sources[length++] = brw_imm_d(0);
}
break;
}
@@ -1115,11 +935,11 @@ fs_visitor::emit_barrier()
const fs_builder pbld = bld.exec_all().group(8, 0);
/* Clear the message payload */
- pbld.MOV(payload, fs_reg(0u));
+ pbld.MOV(payload, brw_imm_ud(0u));
/* Copy bits 27:24 of r0.2 (barrier id) to the message payload reg.2 */
fs_reg r0_2 = fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD));
- pbld.AND(component(payload, 2), r0_2, fs_reg(0x0f000000u));
+ pbld.AND(component(payload, 2), r0_2, brw_imm_ud(0x0f000000u));
/* Emit a gateway "barrier" message using the payload we set up, followed
* by a wait instruction.
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index ed0890f430f..149b43ba055 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -75,7 +75,9 @@ brw_codegen_gs_prog(struct brw_context *brw,
* every uniform is a float which gets padded to the size of a vec4.
*/
struct gl_shader *gs = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
- int param_count = gp->program.Base.nir->num_uniforms * 4;
+ int param_count = gp->program.Base.nir->num_uniforms;
+ if (!compiler->scalar_stage[MESA_SHADER_GEOMETRY])
+ param_count *= 4;
prog_data.base.base.param =
rzalloc_array(NULL, const gl_constant_value *, param_count);
@@ -87,7 +89,8 @@ brw_codegen_gs_prog(struct brw_context *brw,
prog_data.base.base.nr_image_params = gs->NumImages;
brw_nir_setup_glsl_uniforms(gp->program.Base.nir, prog, &gp->program.Base,
- &prog_data.base.base, compiler->scalar_gs);
+ &prog_data.base.base,
+ compiler->scalar_stage[MESA_SHADER_GEOMETRY]);
GLbitfield64 outputs_written = gp->program.Base.OutputsWritten;
diff --git a/src/mesa/drivers/dri/i965/brw_inst.h b/src/mesa/drivers/dri/i965/brw_inst.h
index 4ed95c473cd..cd9f6ef591d 100644
--- a/src/mesa/drivers/dri/i965/brw_inst.h
+++ b/src/mesa/drivers/dri/i965/brw_inst.h
@@ -694,7 +694,7 @@ brw_inst_bits(const brw_inst *inst, unsigned high, unsigned low)
high %= 64;
low %= 64;
- const uint64_t mask = (1ull << (high - low + 1)) - 1;
+ const uint64_t mask = (~0ull >> (64 - (high - low + 1)));
return (inst->data[word] >> low) & mask;
}
@@ -713,7 +713,7 @@ brw_inst_set_bits(brw_inst *inst, unsigned high, unsigned low, uint64_t value)
high %= 64;
low %= 64;
- const uint64_t mask = ((1ull << (high - low + 1)) - 1) << low;
+ const uint64_t mask = (~0ull >> (64 - (high - low + 1))) << low;
/* Make sure the supplied value actually fits in the given bitfield. */
assert((value & (mask >> low)) == value);
diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h
index 7e977e9e727..0410053ce27 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h
@@ -36,11 +36,6 @@ public:
void init();
fs_reg();
- explicit fs_reg(float f);
- explicit fs_reg(int32_t i);
- explicit fs_reg(uint32_t u);
- explicit fs_reg(uint8_t vf[4]);
- explicit fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3);
fs_reg(struct brw_reg reg);
fs_reg(enum brw_reg_file file, int nr);
fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type);
diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
index 110e64b979e..e2e66044d3a 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
@@ -41,11 +41,6 @@ public:
src_reg(enum brw_reg_file file, int nr, const glsl_type *type);
src_reg();
- src_reg(float f);
- src_reg(uint32_t u);
- src_reg(int32_t i);
- src_reg(uint8_t vf[4]);
- src_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3);
src_reg(struct brw_reg reg);
bool equals(const src_reg &r) const;
diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp
index 29911732761..14421d421b6 100644
--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -66,12 +66,14 @@ brw_lower_packing_builtins(struct brw_context *brw,
gl_shader_stage shader_type,
exec_list *ir)
{
+ const struct brw_compiler *compiler = brw->intelScreen->compiler;
+
int ops = LOWER_PACK_SNORM_2x16
| LOWER_UNPACK_SNORM_2x16
| LOWER_PACK_UNORM_2x16
| LOWER_UNPACK_UNORM_2x16;
- if (is_scalar_shader_stage(brw->intelScreen->compiler, shader_type)) {
+ if (compiler->scalar_stage[shader_type]) {
ops |= LOWER_UNPACK_UNORM_4x8
| LOWER_UNPACK_SNORM_4x8
| LOWER_PACK_UNORM_4x8
@@ -84,7 +86,7 @@ brw_lower_packing_builtins(struct brw_context *brw,
* lowering is needed. For SOA code, the Half2x16 ops must be
* scalarized.
*/
- if (is_scalar_shader_stage(brw->intelScreen->compiler, shader_type)) {
+ if (compiler->scalar_stage[shader_type]) {
ops |= LOWER_PACK_HALF_2x16_TO_SPLIT
| LOWER_UNPACK_HALF_2x16_TO_SPLIT;
}
@@ -103,6 +105,7 @@ process_glsl_ir(gl_shader_stage stage,
struct gl_shader *shader)
{
struct gl_context *ctx = &brw->ctx;
+ const struct brw_compiler *compiler = brw->intelScreen->compiler;
const struct gl_shader_compiler_options *options =
&ctx->Const.ShaderCompilerOptions[shader->Stage];
@@ -161,7 +164,7 @@ process_glsl_ir(gl_shader_stage stage,
do {
progress = false;
- if (is_scalar_shader_stage(brw->intelScreen->compiler, shader->Stage)) {
+ if (compiler->scalar_stage[shader->Stage]) {
brw_do_channel_expressions(shader->ir);
brw_do_vector_splitting(shader->ir);
}
@@ -252,7 +255,7 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
brw_add_texrect_params(prog);
prog->nir = brw_create_nir(brw, shProg, prog, (gl_shader_stage) stage,
- is_scalar_shader_stage(compiler, stage));
+ compiler->scalar_stage[stage]);
_mesa_reference_program(ctx, &prog, NULL);
}
diff --git a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
index 12e7c32e424..1f8bfdfa492 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
@@ -204,7 +204,7 @@ brw_draw_rectlist(struct gl_context *ctx, struct rect *rect, int num_instances)
}
static void
-get_fast_clear_rect(struct gl_framebuffer *fb,
+get_fast_clear_rect(struct brw_context *brw, struct gl_framebuffer *fb,
struct intel_renderbuffer *irb, struct rect *rect)
{
unsigned int x_align, y_align;
@@ -228,7 +228,14 @@ get_fast_clear_rect(struct gl_framebuffer *fb,
*/
intel_get_non_msrt_mcs_alignment(irb->mt, &x_align, &y_align);
x_align *= 16;
- y_align *= 32;
+
+ /* SKL+ line alignment requirement for Y-tiled are half those of the prior
+ * generations.
+ */
+ if (brw->gen >= 9)
+ y_align *= 16;
+ else
+ y_align *= 32;
/* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
* Target(s)", beneath the "Fast Color Clear" bullet (p327):
@@ -265,8 +272,10 @@ get_fast_clear_rect(struct gl_framebuffer *fb,
* terms of (width,height) of the RT.
*
* MSAA Width of Clear Rect Height of Clear Rect
+ * 2X Ceil(1/8*width) Ceil(1/2*height)
* 4X Ceil(1/8*width) Ceil(1/2*height)
* 8X Ceil(1/2*width) Ceil(1/2*height)
+ * 16X width Ceil(1/2*height)
*
* The text "with upper left co-ordinate to coincide with actual
* rectangle being cleared" is a little confusing--it seems to imply
@@ -289,6 +298,9 @@ get_fast_clear_rect(struct gl_framebuffer *fb,
case 8:
x_scaledown = 2;
break;
+ case 16:
+ x_scaledown = 1;
+ break;
default:
unreachable("Unexpected sample count for fast clear");
}
@@ -347,8 +359,12 @@ is_color_fast_clear_compatible(struct brw_context *brw,
}
for (int i = 0; i < 4; i++) {
- if (color->f[i] != 0.0f && color->f[i] != 1.0f &&
- _mesa_format_has_color_component(format, i)) {
+ if (!_mesa_format_has_color_component(format, i)) {
+ continue;
+ }
+
+ if (brw->gen < 9 &&
+ color->f[i] != 0.0f && color->f[i] != 1.0f) {
return false;
}
}
@@ -357,18 +373,55 @@ is_color_fast_clear_compatible(struct brw_context *brw,
/**
* Convert the given color to a bitfield suitable for ORing into DWORD 7 of
- * SURFACE_STATE.
+ * SURFACE_STATE (DWORD 12-15 on SKL+).
*/
-static uint32_t
-compute_fast_clear_color_bits(const union gl_color_union *color)
+static void
+set_fast_clear_color(struct brw_context *brw,
+ struct intel_mipmap_tree *mt,
+ const union gl_color_union *color)
{
- uint32_t bits = 0;
- for (int i = 0; i < 4; i++) {
- /* Testing for non-0 works for integer and float colors */
- if (color->f[i] != 0.0f)
- bits |= 1 << (GEN7_SURFACE_CLEAR_COLOR_SHIFT + (3 - i));
+ union gl_color_union override_color = *color;
+
+ /* The sampler doesn't look at the format of the surface when the fast
+ * clear color is used so we need to implement luminance, intensity and
+ * missing components manually.
+ */
+ switch (_mesa_get_format_base_format(mt->format)) {
+ case GL_INTENSITY:
+ override_color.ui[3] = override_color.ui[0];
+ /* flow through */
+ case GL_LUMINANCE:
+ case GL_LUMINANCE_ALPHA:
+ override_color.ui[1] = override_color.ui[0];
+ override_color.ui[2] = override_color.ui[0];
+ break;
+ default:
+ for (int i = 0; i < 3; i++) {
+ if (!_mesa_format_has_color_component(mt->format, i))
+ override_color.ui[i] = 0;
+ }
+ break;
+ }
+
+ if (!_mesa_format_has_color_component(mt->format, 3)) {
+ if (_mesa_is_format_integer_color(mt->format))
+ override_color.ui[3] = 1;
+ else
+ override_color.f[3] = 1.0f;
+ }
+
+ if (brw->gen >= 9) {
+ mt->gen9_fast_clear_color = override_color;
+ } else {
+ mt->fast_clear_color_value = 0;
+ for (int i = 0; i < 4; i++) {
+ /* Testing for non-0 works for integer and float colors */
+ if (override_color.f[i] != 0.0f) {
+ mt->fast_clear_color_value |=
+ 1 << (GEN7_SURFACE_CLEAR_COLOR_SHIFT + (3 - i));
+ }
+ }
}
- return bits;
}
static const uint32_t fast_clear_color[4] = { ~0, ~0, ~0, ~0 };
@@ -408,6 +461,55 @@ use_rectlist(struct brw_context *brw, bool enable)
brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
}
+/**
+ * Individually fast clear each color buffer attachment. On previous gens this
+ * isn't required. The motivation for this comes from one line (which seems to
+ * be specific to SKL+). The list item is in section titled _MCS Buffer for
+ * Render Target(s)_
+ *
+ * "Since only one RT is bound with a clear pass, only one RT can be cleared
+ * at a time. To clear multiple RTs, multiple clear passes are required."
+ *
+ * The code follows the same idea as the resolve code which creates a fake FBO
+ * to avoid interfering with too much of the GL state.
+ */
+static void
+fast_clear_attachments(struct brw_context *brw,
+ struct gl_framebuffer *fb,
+ uint32_t fast_clear_buffers,
+ struct rect fast_clear_rect)
+{
+ assert(brw->gen >= 9);
+ struct gl_context *ctx = &brw->ctx;
+
+ brw_bind_rep_write_shader(brw, (float *) fast_clear_color);
+
+ /* SKL+ also has a resolve mode for compressed render targets and thus more
+ * bits to let us select the type of resolve. For fast clear resolves, it
+ * turns out we can use the same value as pre-SKL though.
+ */
+ set_fast_clear_op(brw, GEN7_PS_RENDER_TARGET_FAST_CLEAR_ENABLE);
+
+ while (fast_clear_buffers) {
+ int index = ffs(fast_clear_buffers) - 1;
+
+ fast_clear_buffers &= ~(1 << index);
+
+ _mesa_meta_drawbuffers_from_bitfield(1 << index);
+
+ brw_draw_rectlist(ctx, &fast_clear_rect, MAX2(1, fb->MaxNumLayers));
+
+ /* Now set the mcs we cleared to INTEL_FAST_CLEAR_STATE_CLEAR so we'll
+ * resolve them eventually.
+ */
+ struct gl_renderbuffer *rb = fb->_ColorDrawBuffers[0];
+ struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+ irb->mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_CLEAR;
+ }
+
+ set_fast_clear_op(brw, 0);
+}
+
bool
brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
GLbitfield buffers, bool partial_clear)
@@ -447,13 +549,15 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
if (brw->gen < 7)
clear_type = REP_CLEAR;
- if (irb->mt->fast_clear_state == INTEL_FAST_CLEAR_STATE_NO_MCS)
+ /* Certain formats have unresolved issues with sampling from the MCS
+ * buffer on Gen9. This disables fast clears altogether for MSRTs until
+ * we can figure out what's going on.
+ */
+ if (brw->gen >= 9 && irb->mt->num_samples > 1)
clear_type = REP_CLEAR;
- if (brw->gen >= 9 && clear_type == FAST_CLEAR) {
- perf_debug("fast MCS clears are disabled on gen9");
+ if (irb->mt->fast_clear_state == INTEL_FAST_CLEAR_STATE_NO_MCS)
clear_type = REP_CLEAR;
- }
/* We can't do scissored fast clears because of the restrictions on the
* fast clear rectangle size.
@@ -503,8 +607,7 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
switch (clear_type) {
case FAST_CLEAR:
- irb->mt->fast_clear_color_value =
- compute_fast_clear_color_bits(&ctx->Color.ClearColor);
+ set_fast_clear_color(brw, irb->mt, &ctx->Color.ClearColor);
irb->need_downsample = true;
/* If the buffer is already in INTEL_FAST_CLEAR_STATE_CLEAR, the
@@ -520,7 +623,7 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
irb->mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_RESOLVED;
irb->need_downsample = true;
fast_clear_buffers |= 1 << index;
- get_fast_clear_rect(fb, irb, &fast_clear_rect);
+ get_fast_clear_rect(brw, fb, irb, &fast_clear_rect);
break;
case REP_CLEAR:
@@ -584,12 +687,27 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
use_rectlist(brw, true);
layers = MAX2(1, fb->MaxNumLayers);
- if (fast_clear_buffers) {
+
+ if (brw->gen >= 9 && fast_clear_buffers) {
+ fast_clear_attachments(brw, fb, fast_clear_buffers, fast_clear_rect);
+ } else if (fast_clear_buffers) {
_mesa_meta_drawbuffers_from_bitfield(fast_clear_buffers);
brw_bind_rep_write_shader(brw, (float *) fast_clear_color);
set_fast_clear_op(brw, GEN7_PS_RENDER_TARGET_FAST_CLEAR_ENABLE);
brw_draw_rectlist(ctx, &fast_clear_rect, layers);
set_fast_clear_op(brw, 0);
+
+ /* Now set the mcs we cleared to INTEL_FAST_CLEAR_STATE_CLEAR so we'll
+ * resolve them eventually.
+ */
+ for (unsigned buf = 0; buf < fb->_NumColorDrawBuffers; buf++) {
+ struct gl_renderbuffer *rb = fb->_ColorDrawBuffers[buf];
+ struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+ int index = fb->_ColorDrawBufferIndexes[buf];
+
+ if ((1 << index) & fast_clear_buffers)
+ irb->mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_CLEAR;
+ }
}
if (rep_clear_buffers) {
@@ -598,18 +716,6 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
brw_draw_rectlist(ctx, &clear_rect, layers);
}
- /* Now set the mts we cleared to INTEL_FAST_CLEAR_STATE_CLEAR so we'll
- * resolve them eventually.
- */
- for (unsigned buf = 0; buf < fb->_NumColorDrawBuffers; buf++) {
- struct gl_renderbuffer *rb = fb->_ColorDrawBuffers[buf];
- struct intel_renderbuffer *irb = intel_renderbuffer(rb);
- int index = fb->_ColorDrawBufferIndexes[buf];
-
- if ((1 << index) & fast_clear_buffers)
- irb->mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_CLEAR;
- }
-
bail_to_meta:
/* Dirty _NEW_BUFFERS so we reemit SURFACE_STATE which sets the fast clear
* color before resolve and sets irb->mt->fast_clear_state to UNRESOLVED if
@@ -655,8 +761,9 @@ get_resolve_rect(struct brw_context *brw,
*
* The scaledown factors in the table that follows are related to the
* alignment size returned by intel_get_non_msrt_mcs_alignment() by a
- * multiplier. For IVB and HSW, we divide by two, for BDW we multiply
- * by 8 and 16 and 8 and 8 for SKL.
+ * multiplier. For IVB and HSW, we divide by two, for BDW we multiply
+ * by 8 and 16. Similar to the fast clear, SKL eases the BDW vertical scaling
+ * by a factor of 2.
*/
intel_get_non_msrt_mcs_alignment(mt, &x_align, &y_align);
@@ -702,6 +809,10 @@ brw_meta_resolve_color(struct brw_context *brw,
brw_bind_rep_write_shader(brw, (float *) fast_clear_color);
+ /* SKL+ also has a resolve mode for compressed render targets and thus more
+ * bits to let us select the type of resolve. For fast clear resolves, it
+ * turns out we can use the same value as pre-SKL though.
+ */
set_fast_clear_op(brw, GEN7_PS_RENDER_TARGET_RESOLVE_ENABLE);
mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_RESOLVED;
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 58754adc887..91358d8f389 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -56,8 +56,9 @@ remap_vs_attrs(nir_block *block, void *closure)
}
static void
-brw_nir_lower_inputs(const struct brw_device_info *devinfo,
- nir_shader *nir, bool is_scalar)
+brw_nir_lower_inputs(nir_shader *nir,
+ const struct brw_device_info *devinfo,
+ bool is_scalar)
{
switch (nir->stage) {
case MESA_SHADER_VERTEX:
@@ -170,131 +171,159 @@ brw_nir_lower_outputs(nir_shader *nir, bool is_scalar)
}
}
-static void
+static bool
+should_clone_nir()
+{
+ static int should_clone = -1;
+ if (should_clone < 1)
+ should_clone = brw_env_var_as_boolean("NIR_TEST_CLONE", false);
+
+ return should_clone;
+}
+
+#define _OPT(do_pass) (({ \
+ bool this_progress = true; \
+ do_pass \
+ nir_validate_shader(nir); \
+ if (should_clone_nir()) { \
+ nir_shader *clone = nir_shader_clone(ralloc_parent(nir), nir); \
+ ralloc_free(nir); \
+ nir = clone; \
+ } \
+ this_progress; \
+}))
+
+#define OPT(pass, ...) _OPT( \
+ nir_metadata_set_validation_flag(nir); \
+ this_progress = pass(nir ,##__VA_ARGS__); \
+ if (this_progress) { \
+ progress = true; \
+ nir_metadata_check_validation_flag(nir); \
+ } \
+)
+
+#define OPT_V(pass, ...) _OPT( \
+ pass(nir, ##__VA_ARGS__); \
+)
+
+static nir_shader *
nir_optimize(nir_shader *nir, bool is_scalar)
{
bool progress;
do {
progress = false;
- nir_lower_vars_to_ssa(nir);
- nir_validate_shader(nir);
+ OPT_V(nir_lower_vars_to_ssa);
if (is_scalar) {
- nir_lower_alu_to_scalar(nir);
- nir_validate_shader(nir);
+ OPT_V(nir_lower_alu_to_scalar);
}
- progress |= nir_copy_prop(nir);
- nir_validate_shader(nir);
+ OPT(nir_copy_prop);
if (is_scalar) {
- nir_lower_phis_to_scalar(nir);
- nir_validate_shader(nir);
+ OPT_V(nir_lower_phis_to_scalar);
}
- progress |= nir_copy_prop(nir);
- nir_validate_shader(nir);
- progress |= nir_opt_dce(nir);
- nir_validate_shader(nir);
- progress |= nir_opt_cse(nir);
- nir_validate_shader(nir);
- progress |= nir_opt_peephole_select(nir);
- nir_validate_shader(nir);
- progress |= nir_opt_algebraic(nir);
- nir_validate_shader(nir);
- progress |= nir_opt_constant_folding(nir);
- nir_validate_shader(nir);
- progress |= nir_opt_dead_cf(nir);
- nir_validate_shader(nir);
- progress |= nir_opt_remove_phis(nir);
- nir_validate_shader(nir);
- progress |= nir_opt_undef(nir);
- nir_validate_shader(nir);
+ OPT(nir_copy_prop);
+ OPT(nir_opt_dce);
+ OPT(nir_opt_cse);
+ OPT(nir_opt_peephole_select);
+ OPT(nir_opt_algebraic);
+ OPT(nir_opt_constant_folding);
+ OPT(nir_opt_dead_cf);
+ OPT(nir_opt_remove_phis);
+ OPT(nir_opt_undef);
} while (progress);
+
+ return nir;
}
+/* Does some simple lowering and runs the standard suite of optimizations
+ *
+ * This is intended to be called more-or-less directly after you get the
+ * shader out of GLSL or some other source. While it is geared towards i965,
+ * it is not at all generator-specific except for the is_scalar flag. Even
+ * there, it is safe to call with is_scalar = false for a shader that is
+ * intended for the FS backend as long as nir_optimize is called again with
+ * is_scalar = true to scalarize everything prior to code gen.
+ */
nir_shader *
-brw_create_nir(struct brw_context *brw,
- const struct gl_shader_program *shader_prog,
- const struct gl_program *prog,
- gl_shader_stage stage,
- bool is_scalar)
+brw_preprocess_nir(nir_shader *nir, bool is_scalar)
{
- struct gl_context *ctx = &brw->ctx;
- const struct brw_device_info *devinfo = brw->intelScreen->devinfo;
- const nir_shader_compiler_options *options =
- ctx->Const.ShaderCompilerOptions[stage].NirOptions;
- nir_shader *nir;
+ bool progress; /* Written by OPT and OPT_V */
+ (void)progress;
- /* First, lower the GLSL IR or Mesa IR to NIR */
- if (shader_prog) {
- nir = glsl_to_nir(shader_prog, stage, options);
- } else {
- nir = prog_to_nir(prog, options);
- nir_convert_to_ssa(nir); /* turn registers into SSA */
- }
- nir_validate_shader(nir);
+ if (nir->stage == MESA_SHADER_GEOMETRY)
+ OPT(nir_lower_gs_intrinsics);
- brw_preprocess_nir(nir, brw->intelScreen->devinfo, is_scalar);
+ static const nir_lower_tex_options tex_options = {
+ .lower_txp = ~0,
+ };
- if (shader_prog) {
- nir_lower_samplers(nir, shader_prog);
- nir_validate_shader(nir);
+ OPT(nir_lower_tex, &tex_options);
+ OPT(nir_normalize_cubemap_coords);
- nir_lower_atomics(nir, shader_prog);
- nir_validate_shader(nir);
- }
+ OPT(nir_lower_global_vars_to_local);
- brw_postprocess_nir(nir, brw->intelScreen->devinfo, is_scalar);
+ OPT(nir_split_var_copies);
- static GLuint msg_id = 0;
- _mesa_gl_debug(&brw->ctx, &msg_id,
- MESA_DEBUG_SOURCE_SHADER_COMPILER,
- MESA_DEBUG_TYPE_OTHER,
- MESA_DEBUG_SEVERITY_NOTIFICATION,
- "%s NIR shader:\n",
- _mesa_shader_stage_to_abbrev(nir->stage));
+ nir = nir_optimize(nir, is_scalar);
- return nir;
-}
+ /* Lower a bunch of stuff */
+ OPT_V(nir_lower_var_copies);
-void
-brw_preprocess_nir(nir_shader *nir,
- const struct brw_device_info *devinfo,
- bool is_scalar)
-{
- static const nir_lower_tex_options tex_options = {
- .lower_txp = ~0,
- };
+ /* Get rid of split copies */
+ nir = nir_optimize(nir, is_scalar);
- if (nir->stage == MESA_SHADER_GEOMETRY) {
- nir_lower_gs_intrinsics(nir);
- nir_validate_shader(nir);
- }
+ OPT(nir_remove_dead_variables);
- nir_lower_global_vars_to_local(nir);
- nir_validate_shader(nir);
+ return nir;
+}
- nir_lower_tex(nir, &tex_options);
- nir_validate_shader(nir);
+/* Lowers inputs, outputs, uniforms, and samplers for i965
+ *
+ * This function does all of the standard lowering prior to post-processing.
+ * The lowering done is highly gen, stage, and backend-specific. The
+ * shader_prog parameter is optional and is used only for lowering sampler
+ * derefs and atomics for GLSL shaders.
+ */
+nir_shader *
+brw_lower_nir(nir_shader *nir,
+ const struct brw_device_info *devinfo,
+ const struct gl_shader_program *shader_prog,
+ bool is_scalar)
+{
+ bool progress; /* Written by OPT and OPT_V */
+ (void)progress;
- nir_normalize_cubemap_coords(nir);
- nir_validate_shader(nir);
+ OPT_V(brw_nir_lower_inputs, devinfo, is_scalar);
+ OPT_V(brw_nir_lower_outputs, is_scalar);
+ nir_assign_var_locations(&nir->uniforms,
+ &nir->num_uniforms,
+ is_scalar ? type_size_scalar : type_size_vec4);
+ OPT_V(nir_lower_io, nir_var_all, is_scalar ? type_size_scalar : type_size_vec4);
- nir_split_var_copies(nir);
- nir_validate_shader(nir);
+ if (shader_prog) {
+ OPT_V(nir_lower_samplers, shader_prog);
+ }
- nir_optimize(nir, is_scalar);
+ OPT(nir_lower_system_values);
- /* Lower a bunch of stuff */
- nir_lower_var_copies(nir);
- nir_validate_shader(nir);
+ if (shader_prog) {
+ OPT_V(nir_lower_atomics, shader_prog);
+ }
- /* Get rid of split copies */
- nir_optimize(nir, is_scalar);
+ return nir_optimize(nir, is_scalar);
}
-void
+/* Prepare the given shader for codegen
+ *
+ * This function is intended to be called right before going into the actual
+ * backend and is highly backend-specific. Also, once this function has been
+ * called on a shader, it will no longer be in SSA form so most optimizations
+ * will not work.
+ */
+nir_shader *
brw_postprocess_nir(nir_shader *nir,
const struct brw_device_info *devinfo,
bool is_scalar)
@@ -302,40 +331,21 @@ brw_postprocess_nir(nir_shader *nir,
bool debug_enabled =
(INTEL_DEBUG & intel_debug_flag_for_shader_stage(nir->stage));
- brw_nir_lower_inputs(devinfo, nir, is_scalar);
- brw_nir_lower_outputs(nir, is_scalar);
- nir_assign_var_locations(&nir->uniforms,
- &nir->num_uniforms,
- is_scalar ? type_size_scalar : type_size_vec4);
- nir_lower_io(nir, -1, is_scalar ? type_size_scalar : type_size_vec4);
- nir_validate_shader(nir);
-
- nir_remove_dead_variables(nir);
- nir_validate_shader(nir);
-
- nir_lower_system_values(nir);
- nir_validate_shader(nir);
-
- nir_optimize(nir, is_scalar);
+ bool progress; /* Written by OPT and OPT_V */
+ (void)progress;
if (devinfo->gen >= 6) {
/* Try and fuse multiply-adds */
- brw_nir_opt_peephole_ffma(nir);
- nir_validate_shader(nir);
+ OPT(brw_nir_opt_peephole_ffma);
}
- nir_opt_algebraic_late(nir);
- nir_validate_shader(nir);
+ OPT(nir_opt_algebraic_late);
- nir_lower_locals_to_regs(nir);
- nir_validate_shader(nir);
+ OPT(nir_lower_locals_to_regs);
- nir_lower_to_source_mods(nir);
- nir_validate_shader(nir);
- nir_copy_prop(nir);
- nir_validate_shader(nir);
- nir_opt_dce(nir);
- nir_validate_shader(nir);
+ OPT_V(nir_lower_to_source_mods);
+ OPT(nir_copy_prop);
+ OPT(nir_opt_dce);
if (unlikely(debug_enabled)) {
/* Re-index SSA defs so we print more sensible numbers. */
@@ -349,15 +359,11 @@ brw_postprocess_nir(nir_shader *nir,
nir_print_shader(nir, stderr);
}
- nir_convert_from_ssa(nir, true);
- nir_validate_shader(nir);
+ OPT_V(nir_convert_from_ssa, true);
if (!is_scalar) {
- nir_move_vec_src_uses_to_dest(nir);
- nir_validate_shader(nir);
-
- nir_lower_vec_to_movs(nir);
- nir_validate_shader(nir);
+ OPT_V(nir_move_vec_src_uses_to_dest);
+ OPT(nir_lower_vec_to_movs);
}
/* This is the last pass we run before we start emitting stuff. It
@@ -375,13 +381,83 @@ brw_postprocess_nir(nir_shader *nir,
_mesa_shader_stage_to_string(nir->stage));
nir_print_shader(nir, stderr);
}
+
+ return nir;
+}
+
+nir_shader *
+brw_create_nir(struct brw_context *brw,
+ const struct gl_shader_program *shader_prog,
+ const struct gl_program *prog,
+ gl_shader_stage stage,
+ bool is_scalar)
+{
+ struct gl_context *ctx = &brw->ctx;
+ const struct brw_device_info *devinfo = brw->intelScreen->devinfo;
+ const nir_shader_compiler_options *options =
+ ctx->Const.ShaderCompilerOptions[stage].NirOptions;
+ bool progress;
+ nir_shader *nir;
+
+ /* First, lower the GLSL IR or Mesa IR to NIR */
+ if (shader_prog) {
+ nir = glsl_to_nir(shader_prog, stage, options);
+ } else {
+ nir = prog_to_nir(prog, options);
+ OPT_V(nir_convert_to_ssa); /* turn registers into SSA */
+ }
+ nir_validate_shader(nir);
+
+ (void)progress;
+
+ nir = brw_preprocess_nir(nir, is_scalar);
+ nir = brw_lower_nir(nir, devinfo, shader_prog, is_scalar);
+
+ return nir;
+}
+
+nir_shader *
+brw_nir_apply_sampler_key(nir_shader *nir,
+ const struct brw_device_info *devinfo,
+ const struct brw_sampler_prog_key_data *key_tex,
+ bool is_scalar)
+{
+ nir_lower_tex_options tex_options = { 0 };
+
+ /* Iron Lake and prior require lowering of all rectangle textures */
+ if (devinfo->gen < 6)
+ tex_options.lower_rect = true;
+
+ /* Prior to Broadwell, our hardware can't actually do GL_CLAMP */
+ if (devinfo->gen < 8) {
+ tex_options.saturate_s = key_tex->gl_clamp_mask[0];
+ tex_options.saturate_t = key_tex->gl_clamp_mask[1];
+ tex_options.saturate_r = key_tex->gl_clamp_mask[2];
+ }
+
+ /* Prior to Haswell, we have to fake texture swizzle */
+ for (unsigned s = 0; s < MAX_SAMPLERS; s++) {
+ if (key_tex->swizzles[s] == SWIZZLE_NOOP)
+ continue;
+
+ tex_options.swizzle_result |= (1 << s);
+ for (unsigned c = 0; c < 4; c++)
+ tex_options.swizzles[s][c] = GET_SWZ(key_tex->swizzles[s], c);
+ }
+
+ if (nir_lower_tex(nir, &tex_options)) {
+ nir_validate_shader(nir);
+ nir = nir_optimize(nir, is_scalar);
+ }
+
+ return nir;
}
enum brw_reg_type
brw_type_for_nir_type(nir_alu_type type)
{
switch (type) {
- case nir_type_unsigned:
+ case nir_type_uint:
return BRW_REGISTER_TYPE_UD;
case nir_type_bool:
case nir_type_int:
@@ -408,7 +484,7 @@ brw_glsl_base_type_for_nir_type(nir_alu_type type)
case nir_type_int:
return GLSL_TYPE_INT;
- case nir_type_unsigned:
+ case nir_type_uint:
return GLSL_TYPE_UINT;
default:
diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h
index d259777e1c9..0a8a5a280b1 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.h
+++ b/src/mesa/drivers/dri/i965/brw_nir.h
@@ -81,19 +81,25 @@ nir_shader *brw_create_nir(struct brw_context *brw,
gl_shader_stage stage,
bool is_scalar);
+nir_shader *brw_preprocess_nir(nir_shader *nir, bool is_scalar);
+nir_shader *brw_lower_nir(nir_shader *nir,
+ const struct brw_device_info *devinfo,
+ const struct gl_shader_program *shader_prog,
+ bool is_scalar);
+nir_shader *brw_postprocess_nir(nir_shader *nir,
+ const struct brw_device_info *devinfo,
+ bool is_scalar);
+
+
+nir_shader *brw_nir_apply_sampler_key(nir_shader *nir,
+ const struct brw_device_info *devinfo,
+ const struct brw_sampler_prog_key_data *key,
+ bool is_scalar);
+
enum brw_reg_type brw_type_for_nir_type(nir_alu_type type);
enum glsl_base_type brw_glsl_base_type_for_nir_type(nir_alu_type type);
-void
-brw_preprocess_nir(nir_shader *nir,
- const struct brw_device_info *devinfo,
- bool is_scalar);
-void
-brw_postprocess_nir(nir_shader *nir,
- const struct brw_device_info *devinfo,
- bool is_scalar);
-
void brw_nir_setup_glsl_uniforms(nir_shader *shader,
struct gl_shader_program *shader_prog,
const struct gl_program *prog,
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 6433dec9041..3da8e9e8a97 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -126,6 +126,7 @@ brwProgramStringNotify(struct gl_context *ctx,
struct gl_program *prog)
{
struct brw_context *brw = brw_context(ctx);
+ const struct brw_compiler *compiler = brw->intelScreen->compiler;
switch (target) {
case GL_FRAGMENT_PROGRAM_ARB: {
@@ -165,7 +166,7 @@ brwProgramStringNotify(struct gl_context *ctx,
brw_add_texrect_params(prog);
prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
- brw->intelScreen->compiler->scalar_vs);
+ compiler->scalar_stage[MESA_SHADER_VERTEX]);
brw_vs_precompile(ctx, NULL, prog);
break;
@@ -343,6 +344,8 @@ brw_report_shader_time(struct brw_context *brw)
switch (type) {
case ST_VS:
+ case ST_TCS:
+ case ST_TES:
case ST_GS:
case ST_FS8:
case ST_FS16:
@@ -369,6 +372,8 @@ brw_report_shader_time(struct brw_context *brw)
switch (type) {
case ST_VS:
+ case ST_TCS:
+ case ST_TES:
case ST_GS:
case ST_FS8:
case ST_FS16:
@@ -406,6 +411,12 @@ brw_report_shader_time(struct brw_context *brw)
case ST_VS:
stage = "vs";
break;
+ case ST_TCS:
+ stage = "tcs";
+ break;
+ case ST_TES:
+ stage = "tes";
+ break;
case ST_GS:
stage = "gs";
break;
@@ -429,6 +440,8 @@ brw_report_shader_time(struct brw_context *brw)
fprintf(stderr, "\n");
print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
+ print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total);
+ print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total);
print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h
index 3da83b43b5d..fa912c96c36 100644
--- a/src/mesa/drivers/dri/i965/brw_reg.h
+++ b/src/mesa/drivers/dri/i965/brw_reg.h
@@ -43,7 +43,6 @@
#define BRW_REG_H
#include <stdbool.h>
-#include "main/imports.h"
#include "main/compiler.h"
#include "main/macros.h"
#include "program/prog_instruction.h"
@@ -619,57 +618,37 @@ static inline struct brw_reg
brw_imm_v(unsigned v)
{
struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_V);
- imm.vstride = BRW_VERTICAL_STRIDE_0;
- imm.width = BRW_WIDTH_8;
- imm.hstride = BRW_HORIZONTAL_STRIDE_1;
imm.ud = v;
return imm;
}
+/** Construct vector of eight unsigned half-byte values */
+static inline struct brw_reg
+brw_imm_uv(unsigned uv)
+{
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UV);
+ imm.ud = uv;
+ return imm;
+}
+
/** Construct vector of four 8-bit float values */
static inline struct brw_reg
brw_imm_vf(unsigned v)
{
struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF);
- imm.vstride = BRW_VERTICAL_STRIDE_0;
- imm.width = BRW_WIDTH_4;
- imm.hstride = BRW_HORIZONTAL_STRIDE_1;
imm.ud = v;
return imm;
}
-/**
- * Convert an integer into a "restricted" 8-bit float, used in vector
- * immediates. The 8-bit floating point format has a sign bit, an
- * excess-3 3-bit exponent, and a 4-bit mantissa. All integer values
- * from -31 to 31 can be represented exactly.
- */
-static inline uint8_t
-int_to_float8(int x)
-{
- if (x == 0) {
- return 0;
- } else if (x < 0) {
- return 1 << 7 | int_to_float8(-x);
- } else {
- const unsigned exponent = _mesa_logbase2(x);
- const unsigned mantissa = (x - (1 << exponent)) << (4 - exponent);
- assert(exponent <= 4);
- return (exponent + 3) << 4 | mantissa;
- }
-}
-
-/**
- * Construct a floating-point packed vector immediate from its integer
- * values. \sa int_to_float8()
- */
static inline struct brw_reg
-brw_imm_vf4(int v0, int v1, int v2, int v3)
+brw_imm_vf4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
{
- return brw_imm_vf((int_to_float8(v0) << 0) |
- (int_to_float8(v1) << 8) |
- (int_to_float8(v2) << 16) |
- (int_to_float8(v3) << 24));
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF);
+ imm.vstride = BRW_VERTICAL_STRIDE_0;
+ imm.width = BRW_WIDTH_4;
+ imm.hstride = BRW_HORIZONTAL_STRIDE_1;
+ imm.ud = ((v0 << 0) | (v1 << 8) | (v2 << 16) | (v3 << 24));
+ return imm;
}
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 1f3ae7ab5e6..2f0e8b680ab 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -72,22 +72,6 @@ shader_perf_log_mesa(void *data, const char *fmt, ...)
va_end(args);
}
-bool
-is_scalar_shader_stage(const struct brw_compiler *compiler, int stage)
-{
- switch (stage) {
- case MESA_SHADER_FRAGMENT:
- case MESA_SHADER_COMPUTE:
- return true;
- case MESA_SHADER_GEOMETRY:
- return compiler->scalar_gs;
- case MESA_SHADER_VERTEX:
- return compiler->scalar_vs;
- default:
- return false;
- }
-}
-
struct brw_compiler *
brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
{
@@ -100,11 +84,12 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
brw_fs_alloc_reg_sets(compiler);
brw_vec4_alloc_reg_set(compiler);
- if (devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS))
- compiler->scalar_vs = true;
-
- if (devinfo->gen >= 8 && brw_env_var_as_boolean("INTEL_SCALAR_GS", false))
- compiler->scalar_gs = true;
+ compiler->scalar_stage[MESA_SHADER_VERTEX] =
+ devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS);
+ compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
+ devinfo->gen >= 8 && brw_env_var_as_boolean("INTEL_SCALAR_GS", false);
+ compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true;
+ compiler->scalar_stage[MESA_SHADER_COMPUTE] = true;
nir_shader_compiler_options *nir_options =
rzalloc(compiler, nir_shader_compiler_options);
@@ -139,7 +124,7 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false;
compiler->glsl_compiler_options[i].LowerClipDistance = true;
- bool is_scalar = is_scalar_shader_stage(compiler, i);
+ bool is_scalar = compiler->scalar_stage[i];
compiler->glsl_compiler_options[i].EmitNoIndirectOutput = is_scalar;
compiler->glsl_compiler_options[i].EmitNoIndirectTemp = is_scalar;
@@ -154,6 +139,9 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true;
}
+ if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
+ compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false;
+
return compiler;
}
@@ -557,6 +545,8 @@ brw_instruction_name(enum opcode op)
return "barrier";
case SHADER_OPCODE_MULH:
return "mulh";
+ case SHADER_OPCODE_MOV_INDIRECT:
+ return "mov_indirect";
}
unreachable("not reached");
@@ -574,16 +564,12 @@ brw_saturate_immediate(enum brw_reg_type type, struct brw_reg *reg)
switch (type) {
case BRW_REGISTER_TYPE_UD:
case BRW_REGISTER_TYPE_D:
+ case BRW_REGISTER_TYPE_UW:
+ case BRW_REGISTER_TYPE_W:
case BRW_REGISTER_TYPE_UQ:
case BRW_REGISTER_TYPE_Q:
/* Nothing to do. */
return false;
- case BRW_REGISTER_TYPE_UW:
- sat_imm.ud = CLAMP(imm.ud, 0, USHRT_MAX);
- break;
- case BRW_REGISTER_TYPE_W:
- sat_imm.d = CLAMP(imm.d, SHRT_MIN, SHRT_MAX);
- break;
case BRW_REGISTER_TYPE_F:
sat_imm.f = CLAMP(imm.f, 0.0f, 1.0f);
break;
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index c4a37187ce2..9555406c777 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -252,8 +252,6 @@ int type_size_scalar(const struct glsl_type *type);
int type_size_vec4(const struct glsl_type *type);
int type_size_vec4_times_4(const struct glsl_type *type);
-bool is_scalar_shader_stage(const struct brw_compiler *compiler, int stage);
-
#ifdef __cplusplus
}
#endif
diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.c b/src/mesa/drivers/dri/i965/brw_surface_formats.c
index 0d49ab7b431..69eed4bc629 100644
--- a/src/mesa/drivers/dri/i965/brw_surface_formats.c
+++ b/src/mesa/drivers/dri/i965/brw_surface_formats.c
@@ -32,8 +32,8 @@
/* This macro allows us to write the table almost as it appears in the PRM,
* while restructuring it to turn it into the C code we want.
*/
-#define SF(sampl, filt, shad, ck, rt, ab, vb, so, color, sf) \
- [BRW_SURFACEFORMAT_##sf] = { true, sampl, filt, shad, ck, rt, ab, vb, so, color, #sf},
+#define SF(sampl, filt, shad, ck, rt, ab, vb, so, color, ccs_e, sf) \
+ [BRW_SURFACEFORMAT_##sf] = { true, sampl, filt, shad, ck, rt, ab, vb, so, color, ccs_e, #sf},
#define Y 0
#define x 999
@@ -61,6 +61,7 @@
* VB - Input Vertex Buffer
* SO - Steamed Output Vertex Buffers (transform feedback)
* color - Color Processing
+ * ccs_e - Lossless Compression Support (gen9+ only)
* sf - Surface Format
*
* See page 88 of the Sandybridge PRM VOL4_Part1 PDF.
@@ -71,257 +72,258 @@
* - VOL2_Part1 section 2.5.11 Format Conversion (vertex fetch).
* - VOL4_Part1 section 2.12.2.1.2 Sampler Output Channel Mapping.
* - VOL4_Part1 section 3.9.11 Render Target Write.
+ * - Render Target Surface Types [SKL+]
*/
const struct brw_surface_format_info surface_formats[] = {
-/* smpl filt shad CK RT AB VB SO color */
- SF( Y, 50, x, x, Y, Y, Y, Y, x, R32G32B32A32_FLOAT)
- SF( Y, x, x, x, Y, x, Y, Y, x, R32G32B32A32_SINT)
- SF( Y, x, x, x, Y, x, Y, Y, x, R32G32B32A32_UINT)
- SF( x, x, x, x, x, x, Y, x, x, R32G32B32A32_UNORM)
- SF( x, x, x, x, x, x, Y, x, x, R32G32B32A32_SNORM)
- SF( x, x, x, x, x, x, Y, x, x, R64G64_FLOAT)
- SF( Y, 50, x, x, x, x, x, x, x, R32G32B32X32_FLOAT)
- SF( x, x, x, x, x, x, Y, x, x, R32G32B32A32_SSCALED)
- SF( x, x, x, x, x, x, Y, x, x, R32G32B32A32_USCALED)
- SF( x, x, x, x, x, x, x, x, x, R32G32B32A32_SFIXED)
- SF( x, x, x, x, x, x, x, x, x, R64G64_PASSTHRU)
- SF( Y, 50, x, x, x, x, Y, Y, x, R32G32B32_FLOAT)
- SF( Y, x, x, x, x, x, Y, Y, x, R32G32B32_SINT)
- SF( Y, x, x, x, x, x, Y, Y, x, R32G32B32_UINT)
- SF( x, x, x, x, x, x, Y, x, x, R32G32B32_UNORM)
- SF( x, x, x, x, x, x, Y, x, x, R32G32B32_SNORM)
- SF( x, x, x, x, x, x, Y, x, x, R32G32B32_SSCALED)
- SF( x, x, x, x, x, x, Y, x, x, R32G32B32_USCALED)
- SF( x, x, x, x, x, x, x, x, x, R32G32B32_SFIXED)
- SF( Y, Y, x, x, Y, 45, Y, x, 60, R16G16B16A16_UNORM)
- SF( Y, Y, x, x, Y, 60, Y, x, x, R16G16B16A16_SNORM)
- SF( Y, x, x, x, Y, x, Y, x, x, R16G16B16A16_SINT)
- SF( Y, x, x, x, Y, x, Y, x, x, R16G16B16A16_UINT)
- SF( Y, Y, x, x, Y, Y, Y, x, x, R16G16B16A16_FLOAT)
- SF( Y, 50, x, x, Y, Y, Y, Y, x, R32G32_FLOAT)
- SF( Y, 70, x, x, Y, Y, Y, Y, x, R32G32_FLOAT_LD)
- SF( Y, x, x, x, Y, x, Y, Y, x, R32G32_SINT)
- SF( Y, x, x, x, Y, x, Y, Y, x, R32G32_UINT)
- SF( Y, 50, Y, x, x, x, x, x, x, R32_FLOAT_X8X24_TYPELESS)
- SF( Y, x, x, x, x, x, x, x, x, X32_TYPELESS_G8X24_UINT)
- SF( Y, 50, x, x, x, x, x, x, x, L32A32_FLOAT)
- SF( x, x, x, x, x, x, Y, x, x, R32G32_UNORM)
- SF( x, x, x, x, x, x, Y, x, x, R32G32_SNORM)
- SF( x, x, x, x, x, x, Y, x, x, R64_FLOAT)
- SF( Y, Y, x, x, x, x, x, x, x, R16G16B16X16_UNORM)
- SF( Y, Y, x, x, x, x, x, x, x, R16G16B16X16_FLOAT)
- SF( Y, 50, x, x, x, x, x, x, x, A32X32_FLOAT)
- SF( Y, 50, x, x, x, x, x, x, x, L32X32_FLOAT)
- SF( Y, 50, x, x, x, x, x, x, x, I32X32_FLOAT)
- SF( x, x, x, x, x, x, Y, x, x, R16G16B16A16_SSCALED)
- SF( x, x, x, x, x, x, Y, x, x, R16G16B16A16_USCALED)
- SF( x, x, x, x, x, x, Y, x, x, R32G32_SSCALED)
- SF( x, x, x, x, x, x, Y, x, x, R32G32_USCALED)
- SF( x, x, x, x, x, x, x, x, x, R32G32_SFIXED)
- SF( x, x, x, x, x, x, x, x, x, R64_PASSTHRU)
- SF( Y, Y, x, Y, Y, Y, Y, x, 60, B8G8R8A8_UNORM)
- SF( Y, Y, x, x, Y, Y, x, x, x, B8G8R8A8_UNORM_SRGB)
-/* smpl filt shad CK RT AB VB SO color */
- SF( Y, Y, x, x, Y, Y, Y, x, 60, R10G10B10A2_UNORM)
- SF( Y, Y, x, x, x, x, x, x, 60, R10G10B10A2_UNORM_SRGB)
- SF( Y, x, x, x, Y, x, Y, x, x, R10G10B10A2_UINT)
- SF( Y, Y, x, x, x, Y, Y, x, x, R10G10B10_SNORM_A2_UNORM)
- SF( Y, Y, x, x, Y, Y, Y, x, 60, R8G8B8A8_UNORM)
- SF( Y, Y, x, x, Y, Y, x, x, 60, R8G8B8A8_UNORM_SRGB)
- SF( Y, Y, x, x, Y, 60, Y, x, x, R8G8B8A8_SNORM)
- SF( Y, x, x, x, Y, x, Y, x, x, R8G8B8A8_SINT)
- SF( Y, x, x, x, Y, x, Y, x, x, R8G8B8A8_UINT)
- SF( Y, Y, x, x, Y, 45, Y, x, x, R16G16_UNORM)
- SF( Y, Y, x, x, Y, 60, Y, x, x, R16G16_SNORM)
- SF( Y, x, x, x, Y, x, Y, x, x, R16G16_SINT)
- SF( Y, x, x, x, Y, x, Y, x, x, R16G16_UINT)
- SF( Y, Y, x, x, Y, Y, Y, x, x, R16G16_FLOAT)
- SF( Y, Y, x, x, Y, Y, x, x, 60, B10G10R10A2_UNORM)
- SF( Y, Y, x, x, Y, Y, x, x, 60, B10G10R10A2_UNORM_SRGB)
- SF( Y, Y, x, x, Y, Y, Y, x, x, R11G11B10_FLOAT)
- SF( Y, x, x, x, Y, x, Y, Y, x, R32_SINT)
- SF( Y, x, x, x, Y, x, Y, Y, x, R32_UINT)
- SF( Y, 50, Y, x, Y, Y, Y, Y, x, R32_FLOAT)
- SF( Y, 50, Y, x, x, x, x, x, x, R24_UNORM_X8_TYPELESS)
- SF( Y, x, x, x, x, x, x, x, x, X24_TYPELESS_G8_UINT)
- SF( Y, Y, x, x, x, x, x, x, x, L16A16_UNORM)
- SF( Y, 50, Y, x, x, x, x, x, x, I24X8_UNORM)
- SF( Y, 50, Y, x, x, x, x, x, x, L24X8_UNORM)
- SF( Y, 50, Y, x, x, x, x, x, x, A24X8_UNORM)
- SF( Y, 50, Y, x, x, x, x, x, x, I32_FLOAT)
- SF( Y, 50, Y, x, x, x, x, x, x, L32_FLOAT)
- SF( Y, 50, Y, x, x, x, x, x, x, A32_FLOAT)
- SF( Y, Y, x, Y, x, x, x, x, 60, B8G8R8X8_UNORM)
- SF( Y, Y, x, x, x, x, x, x, x, B8G8R8X8_UNORM_SRGB)
- SF( Y, Y, x, x, x, x, x, x, x, R8G8B8X8_UNORM)
- SF( Y, Y, x, x, x, x, x, x, x, R8G8B8X8_UNORM_SRGB)
- SF( Y, Y, x, x, x, x, x, x, x, R9G9B9E5_SHAREDEXP)
- SF( Y, Y, x, x, x, x, x, x, x, B10G10R10X2_UNORM)
- SF( Y, Y, x, x, x, x, x, x, x, L16A16_FLOAT)
- SF( x, x, x, x, x, x, Y, x, x, R32_UNORM)
- SF( x, x, x, x, x, x, Y, x, x, R32_SNORM)
-/* smpl filt shad CK RT AB VB SO color */
- SF( x, x, x, x, x, x, Y, x, x, R10G10B10X2_USCALED)
- SF( x, x, x, x, x, x, Y, x, x, R8G8B8A8_SSCALED)
- SF( x, x, x, x, x, x, Y, x, x, R8G8B8A8_USCALED)
- SF( x, x, x, x, x, x, Y, x, x, R16G16_SSCALED)
- SF( x, x, x, x, x, x, Y, x, x, R16G16_USCALED)
- SF( x, x, x, x, x, x, Y, x, x, R32_SSCALED)
- SF( x, x, x, x, x, x, Y, x, x, R32_USCALED)
- SF( Y, Y, x, Y, Y, Y, x, x, x, B5G6R5_UNORM)
- SF( Y, Y, x, x, Y, Y, x, x, x, B5G6R5_UNORM_SRGB)
- SF( Y, Y, x, Y, Y, Y, x, x, x, B5G5R5A1_UNORM)
- SF( Y, Y, x, x, Y, Y, x, x, x, B5G5R5A1_UNORM_SRGB)
- SF( Y, Y, x, Y, Y, Y, x, x, x, B4G4R4A4_UNORM)
- SF( Y, Y, x, x, Y, Y, x, x, x, B4G4R4A4_UNORM_SRGB)
- SF( Y, Y, x, x, Y, Y, Y, x, x, R8G8_UNORM)
- SF( Y, Y, x, Y, Y, 60, Y, x, x, R8G8_SNORM)
- SF( Y, x, x, x, Y, x, Y, x, x, R8G8_SINT)
- SF( Y, x, x, x, Y, x, Y, x, x, R8G8_UINT)
- SF( Y, Y, Y, x, Y, 45, Y, x, 70, R16_UNORM)
- SF( Y, Y, x, x, Y, 60, Y, x, x, R16_SNORM)
- SF( Y, x, x, x, Y, x, Y, x, x, R16_SINT)
- SF( Y, x, x, x, Y, x, Y, x, x, R16_UINT)
- SF( Y, Y, x, x, Y, Y, Y, x, x, R16_FLOAT)
- SF(50, 50, x, x, x, x, x, x, x, A8P8_UNORM_PALETTE0)
- SF(50, 50, x, x, x, x, x, x, x, A8P8_UNORM_PALETTE1)
- SF( Y, Y, Y, x, x, x, x, x, x, I16_UNORM)
- SF( Y, Y, Y, x, x, x, x, x, x, L16_UNORM)
- SF( Y, Y, Y, x, x, x, x, x, x, A16_UNORM)
- SF( Y, Y, x, Y, x, x, x, x, x, L8A8_UNORM)
- SF( Y, Y, Y, x, x, x, x, x, x, I16_FLOAT)
- SF( Y, Y, Y, x, x, x, x, x, x, L16_FLOAT)
- SF( Y, Y, Y, x, x, x, x, x, x, A16_FLOAT)
- SF(45, 45, x, x, x, x, x, x, x, L8A8_UNORM_SRGB)
- SF( Y, Y, x, Y, x, x, x, x, x, R5G5_SNORM_B6_UNORM)
- SF( x, x, x, x, Y, Y, x, x, x, B5G5R5X1_UNORM)
- SF( x, x, x, x, Y, Y, x, x, x, B5G5R5X1_UNORM_SRGB)
- SF( x, x, x, x, x, x, Y, x, x, R8G8_SSCALED)
- SF( x, x, x, x, x, x, Y, x, x, R8G8_USCALED)
-/* smpl filt shad CK RT AB VB SO color */
- SF( x, x, x, x, x, x, Y, x, x, R16_SSCALED)
- SF( x, x, x, x, x, x, Y, x, x, R16_USCALED)
- SF(50, 50, x, x, x, x, x, x, x, P8A8_UNORM_PALETTE0)
- SF(50, 50, x, x, x, x, x, x, x, P8A8_UNORM_PALETTE1)
- SF( x, x, x, x, x, x, x, x, x, A1B5G5R5_UNORM)
- SF( x, x, x, x, x, x, x, x, x, A4B4G4R4_UNORM)
- SF( x, x, x, x, x, x, x, x, x, L8A8_UINT)
- SF( x, x, x, x, x, x, x, x, x, L8A8_SINT)
- SF( Y, Y, x, 45, Y, Y, Y, x, x, R8_UNORM)
- SF( Y, Y, x, x, Y, 60, Y, x, x, R8_SNORM)
- SF( Y, x, x, x, Y, x, Y, x, x, R8_SINT)
- SF( Y, x, x, x, Y, x, Y, x, x, R8_UINT)
- SF( Y, Y, x, Y, Y, Y, x, x, x, A8_UNORM)
- SF( Y, Y, x, x, x, x, x, x, x, I8_UNORM)
- SF( Y, Y, x, Y, x, x, x, x, x, L8_UNORM)
- SF( Y, Y, x, x, x, x, x, x, x, P4A4_UNORM)
- SF( Y, Y, x, x, x, x, x, x, x, A4P4_UNORM)
- SF( x, x, x, x, x, x, Y, x, x, R8_SSCALED)
- SF( x, x, x, x, x, x, Y, x, x, R8_USCALED)
- SF(45, 45, x, x, x, x, x, x, x, P8_UNORM_PALETTE0)
- SF(45, 45, x, x, x, x, x, x, x, L8_UNORM_SRGB)
- SF(45, 45, x, x, x, x, x, x, x, P8_UNORM_PALETTE1)
- SF(45, 45, x, x, x, x, x, x, x, P4A4_UNORM_PALETTE1)
- SF(45, 45, x, x, x, x, x, x, x, A4P4_UNORM_PALETTE1)
- SF( x, x, x, x, x, x, x, x, x, Y8_SNORM)
- SF( x, x, x, x, x, x, x, x, x, L8_UINT)
- SF( x, x, x, x, x, x, x, x, x, L8_SINT)
- SF( x, x, x, x, x, x, x, x, x, I8_UINT)
- SF( x, x, x, x, x, x, x, x, x, I8_SINT)
- SF(45, 45, x, x, x, x, x, x, x, DXT1_RGB_SRGB)
- SF( Y, Y, x, x, x, x, x, x, x, R1_UINT)
- SF( Y, Y, x, Y, Y, x, x, x, 60, YCRCB_NORMAL)
- SF( Y, Y, x, Y, Y, x, x, x, 60, YCRCB_SWAPUVY)
- SF(45, 45, x, x, x, x, x, x, x, P2_UNORM_PALETTE0)
- SF(45, 45, x, x, x, x, x, x, x, P2_UNORM_PALETTE1)
- SF( Y, Y, x, Y, x, x, x, x, x, BC1_UNORM)
- SF( Y, Y, x, Y, x, x, x, x, x, BC2_UNORM)
- SF( Y, Y, x, Y, x, x, x, x, x, BC3_UNORM)
- SF( Y, Y, x, x, x, x, x, x, x, BC4_UNORM)
- SF( Y, Y, x, x, x, x, x, x, x, BC5_UNORM)
- SF( Y, Y, x, x, x, x, x, x, x, BC1_UNORM_SRGB)
- SF( Y, Y, x, x, x, x, x, x, x, BC2_UNORM_SRGB)
- SF( Y, Y, x, x, x, x, x, x, x, BC3_UNORM_SRGB)
- SF( Y, x, x, x, x, x, x, x, x, MONO8)
- SF( Y, Y, x, x, Y, x, x, x, 60, YCRCB_SWAPUV)
- SF( Y, Y, x, x, Y, x, x, x, 60, YCRCB_SWAPY)
- SF( Y, Y, x, x, x, x, x, x, x, DXT1_RGB)
-/* smpl filt shad CK RT AB VB SO color */
- SF( Y, Y, x, x, x, x, x, x, x, FXT1)
- SF( x, x, x, x, x, x, Y, x, x, R8G8B8_UNORM)
- SF( x, x, x, x, x, x, Y, x, x, R8G8B8_SNORM)
- SF( x, x, x, x, x, x, Y, x, x, R8G8B8_SSCALED)
- SF( x, x, x, x, x, x, Y, x, x, R8G8B8_USCALED)
- SF( x, x, x, x, x, x, Y, x, x, R64G64B64A64_FLOAT)
- SF( x, x, x, x, x, x, Y, x, x, R64G64B64_FLOAT)
- SF( Y, Y, x, x, x, x, x, x, x, BC4_SNORM)
- SF( Y, Y, x, x, x, x, x, x, x, BC5_SNORM)
- SF(50, 50, x, x, x, x, 60, x, x, R16G16B16_FLOAT)
- SF( x, x, x, x, x, x, Y, x, x, R16G16B16_UNORM)
- SF( x, x, x, x, x, x, Y, x, x, R16G16B16_SNORM)
- SF( x, x, x, x, x, x, Y, x, x, R16G16B16_SSCALED)
- SF( x, x, x, x, x, x, Y, x, x, R16G16B16_USCALED)
- SF(70, 70, x, x, x, x, x, x, x, BC6H_SF16)
- SF(70, 70, x, x, x, x, x, x, x, BC7_UNORM)
- SF(70, 70, x, x, x, x, x, x, x, BC7_UNORM_SRGB)
- SF(70, 70, x, x, x, x, x, x, x, BC6H_UF16)
- SF( x, x, x, x, x, x, x, x, x, PLANAR_420_8)
- SF( x, x, x, x, x, x, x, x, x, R8G8B8_UNORM_SRGB)
- SF( x, x, x, x, x, x, x, x, x, ETC1_RGB8)
- SF( x, x, x, x, x, x, x, x, x, ETC2_RGB8)
- SF( x, x, x, x, x, x, x, x, x, EAC_R11)
- SF( x, x, x, x, x, x, x, x, x, EAC_RG11)
- SF( x, x, x, x, x, x, x, x, x, EAC_SIGNED_R11)
- SF( x, x, x, x, x, x, x, x, x, EAC_SIGNED_RG11)
- SF( x, x, x, x, x, x, x, x, x, ETC2_SRGB8)
- SF( x, x, x, x, x, x, x, x, x, R16G16B16_UINT)
- SF( x, x, x, x, x, x, x, x, x, R16G16B16_SINT)
- SF( x, x, x, x, x, x, x, x, x, R32_SFIXED)
- SF( x, x, x, x, x, x, x, x, x, R10G10B10A2_SNORM)
- SF( x, x, x, x, x, x, x, x, x, R10G10B10A2_USCALED)
- SF( x, x, x, x, x, x, x, x, x, R10G10B10A2_SSCALED)
- SF( x, x, x, x, x, x, x, x, x, R10G10B10A2_SINT)
- SF( x, x, x, x, x, x, x, x, x, B10G10R10A2_SNORM)
- SF( x, x, x, x, x, x, x, x, x, B10G10R10A2_USCALED)
- SF( x, x, x, x, x, x, x, x, x, B10G10R10A2_SSCALED)
- SF( x, x, x, x, x, x, x, x, x, B10G10R10A2_UINT)
- SF( x, x, x, x, x, x, x, x, x, B10G10R10A2_SINT)
- SF( x, x, x, x, x, x, x, x, x, R64G64B64A64_PASSTHRU)
- SF( x, x, x, x, x, x, x, x, x, R64G64B64_PASSTHRU)
- SF( x, x, x, x, x, x, x, x, x, ETC2_RGB8_PTA)
- SF( x, x, x, x, x, x, x, x, x, ETC2_SRGB8_PTA)
- SF( x, x, x, x, x, x, x, x, x, ETC2_EAC_RGBA8)
- SF( x, x, x, x, x, x, x, x, x, ETC2_EAC_SRGB8_A8)
- SF( x, x, x, x, x, x, x, x, x, R8G8B8_UINT)
- SF( x, x, x, x, x, x, x, x, x, R8G8B8_SINT)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_4x4_FLT16)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_5x4_FLT16)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_5x5_FLT16)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_6x5_FLT16)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_6x6_FLT16)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_8x5_FLT16)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_8x6_FLT16)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_8x8_FLT16)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_10x5_FLT16)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_10x6_FLT16)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_10x8_FLT16)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_10x10_FLT16)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_12x10_FLT16)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_12x12_FLT16)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_4x4_U8sRGB)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_5x4_U8sRGB)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_5x5_U8sRGB)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_6x5_U8sRGB)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_6x6_U8sRGB)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_8x5_U8sRGB)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_8x6_U8sRGB)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_8x8_U8sRGB)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_10x5_U8sRGB)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_10x6_U8sRGB)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_10x8_U8sRGB)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_10x10_U8sRGB)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_12x10_U8sRGB)
- SF(80, 80, x, x, x, x, x, x, x, ASTC_LDR_2D_12x12_U8sRGB)
+/* smpl filt shad CK RT AB VB SO color ccs_e */
+ SF( Y, 50, x, x, Y, Y, Y, Y, x, 90, R32G32B32A32_FLOAT)
+ SF( Y, x, x, x, Y, x, Y, Y, x, 90, R32G32B32A32_SINT)
+ SF( Y, x, x, x, Y, x, Y, Y, x, 90, R32G32B32A32_UINT)
+ SF( x, x, x, x, x, x, Y, x, x, x, R32G32B32A32_UNORM)
+ SF( x, x, x, x, x, x, Y, x, x, x, R32G32B32A32_SNORM)
+ SF( x, x, x, x, x, x, Y, x, x, x, R64G64_FLOAT)
+ SF( Y, 50, x, x, x, x, x, x, x, x, R32G32B32X32_FLOAT)
+ SF( x, x, x, x, x, x, Y, x, x, x, R32G32B32A32_SSCALED)
+ SF( x, x, x, x, x, x, Y, x, x, x, R32G32B32A32_USCALED)
+ SF( x, x, x, x, x, x, x, x, x, x, R32G32B32A32_SFIXED)
+ SF( x, x, x, x, x, x, x, x, x, x, R64G64_PASSTHRU)
+ SF( Y, 50, x, x, x, x, Y, Y, x, x, R32G32B32_FLOAT)
+ SF( Y, x, x, x, x, x, Y, Y, x, x, R32G32B32_SINT)
+ SF( Y, x, x, x, x, x, Y, Y, x, x, R32G32B32_UINT)
+ SF( x, x, x, x, x, x, Y, x, x, x, R32G32B32_UNORM)
+ SF( x, x, x, x, x, x, Y, x, x, x, R32G32B32_SNORM)
+ SF( x, x, x, x, x, x, Y, x, x, x, R32G32B32_SSCALED)
+ SF( x, x, x, x, x, x, Y, x, x, x, R32G32B32_USCALED)
+ SF( x, x, x, x, x, x, x, x, x, x, R32G32B32_SFIXED)
+ SF( Y, Y, x, x, Y, 45, Y, x, 60, 90, R16G16B16A16_UNORM)
+ SF( Y, Y, x, x, Y, 60, Y, x, x, 90, R16G16B16A16_SNORM)
+ SF( Y, x, x, x, Y, x, Y, x, x, 90, R16G16B16A16_SINT)
+ SF( Y, x, x, x, Y, x, Y, x, x, 90, R16G16B16A16_UINT)
+ SF( Y, Y, x, x, Y, Y, Y, x, x, 90, R16G16B16A16_FLOAT)
+ SF( Y, 50, x, x, Y, Y, Y, Y, x, 90, R32G32_FLOAT)
+ SF( Y, 70, x, x, Y, Y, Y, Y, x, x, R32G32_FLOAT_LD)
+ SF( Y, x, x, x, Y, x, Y, Y, x, 90, R32G32_SINT)
+ SF( Y, x, x, x, Y, x, Y, Y, x, 90, R32G32_UINT)
+ SF( Y, 50, Y, x, x, x, x, x, x, x, R32_FLOAT_X8X24_TYPELESS)
+ SF( Y, x, x, x, x, x, x, x, x, x, X32_TYPELESS_G8X24_UINT)
+ SF( Y, 50, x, x, x, x, x, x, x, x, L32A32_FLOAT)
+ SF( x, x, x, x, x, x, Y, x, x, x, R32G32_UNORM)
+ SF( x, x, x, x, x, x, Y, x, x, x, R32G32_SNORM)
+ SF( x, x, x, x, x, x, Y, x, x, x, R64_FLOAT)
+ SF( Y, Y, x, x, x, x, x, x, x, x, R16G16B16X16_UNORM)
+ SF( Y, Y, x, x, x, x, x, x, x, 90, R16G16B16X16_FLOAT)
+ SF( Y, 50, x, x, x, x, x, x, x, x, A32X32_FLOAT)
+ SF( Y, 50, x, x, x, x, x, x, x, x, L32X32_FLOAT)
+ SF( Y, 50, x, x, x, x, x, x, x, x, I32X32_FLOAT)
+ SF( x, x, x, x, x, x, Y, x, x, x, R16G16B16A16_SSCALED)
+ SF( x, x, x, x, x, x, Y, x, x, x, R16G16B16A16_USCALED)
+ SF( x, x, x, x, x, x, Y, x, x, x, R32G32_SSCALED)
+ SF( x, x, x, x, x, x, Y, x, x, x, R32G32_USCALED)
+ SF( x, x, x, x, x, x, x, x, x, x, R32G32_SFIXED)
+ SF( x, x, x, x, x, x, x, x, x, x, R64_PASSTHRU)
+ SF( Y, Y, x, Y, Y, Y, Y, x, 60, 90, B8G8R8A8_UNORM)
+ SF( Y, Y, x, x, Y, Y, x, x, x, x, B8G8R8A8_UNORM_SRGB)
+/* smpl filt shad CK RT AB VB SO color ccs_e */
+ SF( Y, Y, x, x, Y, Y, Y, x, 60, x, R10G10B10A2_UNORM)
+ SF( Y, Y, x, x, x, x, x, x, 60, x, R10G10B10A2_UNORM_SRGB)
+ SF( Y, x, x, x, Y, x, Y, x, x, x, R10G10B10A2_UINT)
+ SF( Y, Y, x, x, x, Y, Y, x, x, x, R10G10B10_SNORM_A2_UNORM)
+ SF( Y, Y, x, x, Y, Y, Y, x, 60, 90, R8G8B8A8_UNORM)
+ SF( Y, Y, x, x, Y, Y, x, x, 60, x, R8G8B8A8_UNORM_SRGB)
+ SF( Y, Y, x, x, Y, 60, Y, x, x, 90, R8G8B8A8_SNORM)
+ SF( Y, x, x, x, Y, x, Y, x, x, 90, R8G8B8A8_SINT)
+ SF( Y, x, x, x, Y, x, Y, x, x, 90, R8G8B8A8_UINT)
+ SF( Y, Y, x, x, Y, 45, Y, x, x, 90, R16G16_UNORM)
+ SF( Y, Y, x, x, Y, 60, Y, x, x, 90, R16G16_SNORM)
+ SF( Y, x, x, x, Y, x, Y, x, x, 90, R16G16_SINT)
+ SF( Y, x, x, x, Y, x, Y, x, x, 90, R16G16_UINT)
+ SF( Y, Y, x, x, Y, Y, Y, x, x, 90, R16G16_FLOAT)
+ SF( Y, Y, x, x, Y, Y, x, x, 60, x, B10G10R10A2_UNORM)
+ SF( Y, Y, x, x, Y, Y, x, x, 60, x, B10G10R10A2_UNORM_SRGB)
+ SF( Y, Y, x, x, Y, Y, Y, x, x, x, R11G11B10_FLOAT)
+ SF( Y, x, x, x, Y, x, Y, Y, x, 90, R32_SINT)
+ SF( Y, x, x, x, Y, x, Y, Y, x, 90, R32_UINT)
+ SF( Y, 50, Y, x, Y, Y, Y, Y, x, 90, R32_FLOAT)
+ SF( Y, 50, Y, x, x, x, x, x, x, x, R24_UNORM_X8_TYPELESS)
+ SF( Y, x, x, x, x, x, x, x, x, x, X24_TYPELESS_G8_UINT)
+ SF( Y, Y, x, x, x, x, x, x, x, x, L16A16_UNORM)
+ SF( Y, 50, Y, x, x, x, x, x, x, x, I24X8_UNORM)
+ SF( Y, 50, Y, x, x, x, x, x, x, x, L24X8_UNORM)
+ SF( Y, 50, Y, x, x, x, x, x, x, x, A24X8_UNORM)
+ SF( Y, 50, Y, x, x, x, x, x, x, x, I32_FLOAT)
+ SF( Y, 50, Y, x, x, x, x, x, x, x, L32_FLOAT)
+ SF( Y, 50, Y, x, x, x, x, x, x, x, A32_FLOAT)
+ SF( Y, Y, x, Y, x, x, x, x, 60, 90, B8G8R8X8_UNORM)
+ SF( Y, Y, x, x, x, x, x, x, x, x, B8G8R8X8_UNORM_SRGB)
+ SF( Y, Y, x, x, x, x, x, x, x, x, R8G8B8X8_UNORM)
+ SF( Y, Y, x, x, x, x, x, x, x, x, R8G8B8X8_UNORM_SRGB)
+ SF( Y, Y, x, x, x, x, x, x, x, x, R9G9B9E5_SHAREDEXP)
+ SF( Y, Y, x, x, x, x, x, x, x, x, B10G10R10X2_UNORM)
+ SF( Y, Y, x, x, x, x, x, x, x, x, L16A16_FLOAT)
+ SF( x, x, x, x, x, x, Y, x, x, x, R32_UNORM)
+ SF( x, x, x, x, x, x, Y, x, x, x, R32_SNORM)
+/* smpl filt shad CK RT AB VB SO color ccs_e */
+ SF( x, x, x, x, x, x, Y, x, x, x, R10G10B10X2_USCALED)
+ SF( x, x, x, x, x, x, Y, x, x, x, R8G8B8A8_SSCALED)
+ SF( x, x, x, x, x, x, Y, x, x, x, R8G8B8A8_USCALED)
+ SF( x, x, x, x, x, x, Y, x, x, x, R16G16_SSCALED)
+ SF( x, x, x, x, x, x, Y, x, x, x, R16G16_USCALED)
+ SF( x, x, x, x, x, x, Y, x, x, x, R32_SSCALED)
+ SF( x, x, x, x, x, x, Y, x, x, x, R32_USCALED)
+ SF( Y, Y, x, Y, Y, Y, x, x, x, x, B5G6R5_UNORM)
+ SF( Y, Y, x, x, Y, Y, x, x, x, x, B5G6R5_UNORM_SRGB)
+ SF( Y, Y, x, Y, Y, Y, x, x, x, x, B5G5R5A1_UNORM)
+ SF( Y, Y, x, x, Y, Y, x, x, x, x, B5G5R5A1_UNORM_SRGB)
+ SF( Y, Y, x, Y, Y, Y, x, x, x, x, B4G4R4A4_UNORM)
+ SF( Y, Y, x, x, Y, Y, x, x, x, x, B4G4R4A4_UNORM_SRGB)
+ SF( Y, Y, x, x, Y, Y, Y, x, x, x, R8G8_UNORM)
+ SF( Y, Y, x, Y, Y, 60, Y, x, x, x, R8G8_SNORM)
+ SF( Y, x, x, x, Y, x, Y, x, x, x, R8G8_SINT)
+ SF( Y, x, x, x, Y, x, Y, x, x, x, R8G8_UINT)
+ SF( Y, Y, Y, x, Y, 45, Y, x, 70, x, R16_UNORM)
+ SF( Y, Y, x, x, Y, 60, Y, x, x, x, R16_SNORM)
+ SF( Y, x, x, x, Y, x, Y, x, x, x, R16_SINT)
+ SF( Y, x, x, x, Y, x, Y, x, x, x, R16_UINT)
+ SF( Y, Y, x, x, Y, Y, Y, x, x, x, R16_FLOAT)
+ SF(50, 50, x, x, x, x, x, x, x, x, A8P8_UNORM_PALETTE0)
+ SF(50, 50, x, x, x, x, x, x, x, x, A8P8_UNORM_PALETTE1)
+ SF( Y, Y, Y, x, x, x, x, x, x, x, I16_UNORM)
+ SF( Y, Y, Y, x, x, x, x, x, x, x, L16_UNORM)
+ SF( Y, Y, Y, x, x, x, x, x, x, x, A16_UNORM)
+ SF( Y, Y, x, Y, x, x, x, x, x, x, L8A8_UNORM)
+ SF( Y, Y, Y, x, x, x, x, x, x, x, I16_FLOAT)
+ SF( Y, Y, Y, x, x, x, x, x, x, x, L16_FLOAT)
+ SF( Y, Y, Y, x, x, x, x, x, x, x, A16_FLOAT)
+ SF(45, 45, x, x, x, x, x, x, x, x, L8A8_UNORM_SRGB)
+ SF( Y, Y, x, Y, x, x, x, x, x, x, R5G5_SNORM_B6_UNORM)
+ SF( x, x, x, x, Y, Y, x, x, x, x, B5G5R5X1_UNORM)
+ SF( x, x, x, x, Y, Y, x, x, x, x, B5G5R5X1_UNORM_SRGB)
+ SF( x, x, x, x, x, x, Y, x, x, x, R8G8_SSCALED)
+ SF( x, x, x, x, x, x, Y, x, x, x, R8G8_USCALED)
+/* smpl filt shad CK RT AB VB SO color ccs_e */
+ SF( x, x, x, x, x, x, Y, x, x, x, R16_SSCALED)
+ SF( x, x, x, x, x, x, Y, x, x, x, R16_USCALED)
+ SF(50, 50, x, x, x, x, x, x, x, x, P8A8_UNORM_PALETTE0)
+ SF(50, 50, x, x, x, x, x, x, x, x, P8A8_UNORM_PALETTE1)
+ SF( x, x, x, x, x, x, x, x, x, x, A1B5G5R5_UNORM)
+ SF( x, x, x, x, x, x, x, x, x, x, A4B4G4R4_UNORM)
+ SF( x, x, x, x, x, x, x, x, x, x, L8A8_UINT)
+ SF( x, x, x, x, x, x, x, x, x, x, L8A8_SINT)
+ SF( Y, Y, x, 45, Y, Y, Y, x, x, x, R8_UNORM)
+ SF( Y, Y, x, x, Y, 60, Y, x, x, x, R8_SNORM)
+ SF( Y, x, x, x, Y, x, Y, x, x, x, R8_SINT)
+ SF( Y, x, x, x, Y, x, Y, x, x, x, R8_UINT)
+ SF( Y, Y, x, Y, Y, Y, x, x, x, x, A8_UNORM)
+ SF( Y, Y, x, x, x, x, x, x, x, x, I8_UNORM)
+ SF( Y, Y, x, Y, x, x, x, x, x, x, L8_UNORM)
+ SF( Y, Y, x, x, x, x, x, x, x, x, P4A4_UNORM)
+ SF( Y, Y, x, x, x, x, x, x, x, x, A4P4_UNORM)
+ SF( x, x, x, x, x, x, Y, x, x, x, R8_SSCALED)
+ SF( x, x, x, x, x, x, Y, x, x, x, R8_USCALED)
+ SF(45, 45, x, x, x, x, x, x, x, x, P8_UNORM_PALETTE0)
+ SF(45, 45, x, x, x, x, x, x, x, x, L8_UNORM_SRGB)
+ SF(45, 45, x, x, x, x, x, x, x, x, P8_UNORM_PALETTE1)
+ SF(45, 45, x, x, x, x, x, x, x, x, P4A4_UNORM_PALETTE1)
+ SF(45, 45, x, x, x, x, x, x, x, x, A4P4_UNORM_PALETTE1)
+ SF( x, x, x, x, x, x, x, x, x, x, Y8_SNORM)
+ SF( x, x, x, x, x, x, x, x, x, x, L8_UINT)
+ SF( x, x, x, x, x, x, x, x, x, x, L8_SINT)
+ SF( x, x, x, x, x, x, x, x, x, x, I8_UINT)
+ SF( x, x, x, x, x, x, x, x, x, x, I8_SINT)
+ SF(45, 45, x, x, x, x, x, x, x, x, DXT1_RGB_SRGB)
+ SF( Y, Y, x, x, x, x, x, x, x, x, R1_UINT)
+ SF( Y, Y, x, Y, Y, x, x, x, 60, x, YCRCB_NORMAL)
+ SF( Y, Y, x, Y, Y, x, x, x, 60, x, YCRCB_SWAPUVY)
+ SF(45, 45, x, x, x, x, x, x, x, x, P2_UNORM_PALETTE0)
+ SF(45, 45, x, x, x, x, x, x, x, x, P2_UNORM_PALETTE1)
+ SF( Y, Y, x, Y, x, x, x, x, x, x, BC1_UNORM)
+ SF( Y, Y, x, Y, x, x, x, x, x, x, BC2_UNORM)
+ SF( Y, Y, x, Y, x, x, x, x, x, x, BC3_UNORM)
+ SF( Y, Y, x, x, x, x, x, x, x, x, BC4_UNORM)
+ SF( Y, Y, x, x, x, x, x, x, x, x, BC5_UNORM)
+ SF( Y, Y, x, x, x, x, x, x, x, x, BC1_UNORM_SRGB)
+ SF( Y, Y, x, x, x, x, x, x, x, x, BC2_UNORM_SRGB)
+ SF( Y, Y, x, x, x, x, x, x, x, x, BC3_UNORM_SRGB)
+ SF( Y, x, x, x, x, x, x, x, x, x, MONO8)
+ SF( Y, Y, x, x, Y, x, x, x, 60, x, YCRCB_SWAPUV)
+ SF( Y, Y, x, x, Y, x, x, x, 60, x, YCRCB_SWAPY)
+ SF( Y, Y, x, x, x, x, x, x, x, x, DXT1_RGB)
+/* smpl filt shad CK RT AB VB SO color ccs_e */
+ SF( Y, Y, x, x, x, x, x, x, x, x, FXT1)
+ SF( x, x, x, x, x, x, Y, x, x, x, R8G8B8_UNORM)
+ SF( x, x, x, x, x, x, Y, x, x, x, R8G8B8_SNORM)
+ SF( x, x, x, x, x, x, Y, x, x, x, R8G8B8_SSCALED)
+ SF( x, x, x, x, x, x, Y, x, x, x, R8G8B8_USCALED)
+ SF( x, x, x, x, x, x, Y, x, x, x, R64G64B64A64_FLOAT)
+ SF( x, x, x, x, x, x, Y, x, x, x, R64G64B64_FLOAT)
+ SF( Y, Y, x, x, x, x, x, x, x, x, BC4_SNORM)
+ SF( Y, Y, x, x, x, x, x, x, x, x, BC5_SNORM)
+ SF(50, 50, x, x, x, x, 60, x, x, x, R16G16B16_FLOAT)
+ SF( x, x, x, x, x, x, Y, x, x, x, R16G16B16_UNORM)
+ SF( x, x, x, x, x, x, Y, x, x, x, R16G16B16_SNORM)
+ SF( x, x, x, x, x, x, Y, x, x, x, R16G16B16_SSCALED)
+ SF( x, x, x, x, x, x, Y, x, x, x, R16G16B16_USCALED)
+ SF(70, 70, x, x, x, x, x, x, x, x, BC6H_SF16)
+ SF(70, 70, x, x, x, x, x, x, x, x, BC7_UNORM)
+ SF(70, 70, x, x, x, x, x, x, x, x, BC7_UNORM_SRGB)
+ SF(70, 70, x, x, x, x, x, x, x, x, BC6H_UF16)
+ SF( x, x, x, x, x, x, x, x, x, x, PLANAR_420_8)
+ SF( x, x, x, x, x, x, x, x, x, x, R8G8B8_UNORM_SRGB)
+ SF( x, x, x, x, x, x, x, x, x, x, ETC1_RGB8)
+ SF( x, x, x, x, x, x, x, x, x, x, ETC2_RGB8)
+ SF( x, x, x, x, x, x, x, x, x, x, EAC_R11)
+ SF( x, x, x, x, x, x, x, x, x, x, EAC_RG11)
+ SF( x, x, x, x, x, x, x, x, x, x, EAC_SIGNED_R11)
+ SF( x, x, x, x, x, x, x, x, x, x, EAC_SIGNED_RG11)
+ SF( x, x, x, x, x, x, x, x, x, x, ETC2_SRGB8)
+ SF( x, x, x, x, x, x, x, x, x, x, R16G16B16_UINT)
+ SF( x, x, x, x, x, x, x, x, x, x, R16G16B16_SINT)
+ SF( x, x, x, x, x, x, x, x, x, x, R32_SFIXED)
+ SF( x, x, x, x, x, x, x, x, x, x, R10G10B10A2_SNORM)
+ SF( x, x, x, x, x, x, x, x, x, x, R10G10B10A2_USCALED)
+ SF( x, x, x, x, x, x, x, x, x, x, R10G10B10A2_SSCALED)
+ SF( x, x, x, x, x, x, x, x, x, x, R10G10B10A2_SINT)
+ SF( x, x, x, x, x, x, x, x, x, x, B10G10R10A2_SNORM)
+ SF( x, x, x, x, x, x, x, x, x, x, B10G10R10A2_USCALED)
+ SF( x, x, x, x, x, x, x, x, x, x, B10G10R10A2_SSCALED)
+ SF( x, x, x, x, x, x, x, x, x, x, B10G10R10A2_UINT)
+ SF( x, x, x, x, x, x, x, x, x, x, B10G10R10A2_SINT)
+ SF( x, x, x, x, x, x, x, x, x, x, R64G64B64A64_PASSTHRU)
+ SF( x, x, x, x, x, x, x, x, x, x, R64G64B64_PASSTHRU)
+ SF( x, x, x, x, x, x, x, x, x, x, ETC2_RGB8_PTA)
+ SF( x, x, x, x, x, x, x, x, x, x, ETC2_SRGB8_PTA)
+ SF( x, x, x, x, x, x, x, x, x, x, ETC2_EAC_RGBA8)
+ SF( x, x, x, x, x, x, x, x, x, x, ETC2_EAC_SRGB8_A8)
+ SF( x, x, x, x, x, x, x, x, x, x, R8G8B8_UINT)
+ SF( x, x, x, x, x, x, x, x, x, x, R8G8B8_SINT)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_4x4_FLT16)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_5x4_FLT16)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_5x5_FLT16)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_6x5_FLT16)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_6x6_FLT16)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_8x5_FLT16)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_8x6_FLT16)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_8x8_FLT16)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_10x5_FLT16)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_10x6_FLT16)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_10x8_FLT16)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_10x10_FLT16)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_12x10_FLT16)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_12x12_FLT16)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_4x4_U8sRGB)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_5x4_U8sRGB)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_5x5_U8sRGB)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_6x5_U8sRGB)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_6x6_U8sRGB)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_8x5_U8sRGB)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_8x6_U8sRGB)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_8x8_U8sRGB)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_10x5_U8sRGB)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_10x6_U8sRGB)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_10x8_U8sRGB)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_10x10_U8sRGB)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_12x10_U8sRGB)
+ SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_12x12_U8sRGB)
};
#undef x
#undef Y
@@ -771,6 +773,26 @@ brw_render_target_supported(struct brw_context *brw,
return brw->format_supported_as_render_target[format];
}
+/*
+ * True if the underlying hardware format can support lossless color
+ * compression.
+ */
+bool
+brw_losslessly_compressible_format(struct brw_context *brw,
+ uint32_t brw_format)
+{
+ const struct brw_surface_format_info * const sinfo =
+ &surface_formats[brw_format];
+ const int gen = brw->gen * 10;
+
+ assert(brw->gen >= 9);
+
+ if (gen >= sinfo->lossless_compression)
+ return true;
+
+ return false;
+}
+
GLuint
translate_tex_format(struct brw_context *brw,
mesa_format mesa_format,
diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.h b/src/mesa/drivers/dri/i965/brw_surface_formats.h
index 5c7b60e680b..a5cd49f5260 100644
--- a/src/mesa/drivers/dri/i965/brw_surface_formats.h
+++ b/src/mesa/drivers/dri/i965/brw_surface_formats.h
@@ -34,6 +34,7 @@ struct brw_surface_format_info {
int input_vb;
int streamed_output_vb;
int color_processing;
+ int lossless_compression;
const char *name;
};
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index a086b43e11a..ae3cf728443 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -71,51 +71,6 @@ src_reg::src_reg()
init();
}
-src_reg::src_reg(float f)
-{
- init();
-
- this->file = IMM;
- this->type = BRW_REGISTER_TYPE_F;
- this->f = f;
-}
-
-src_reg::src_reg(uint32_t u)
-{
- init();
-
- this->file = IMM;
- this->type = BRW_REGISTER_TYPE_UD;
- this->ud = u;
-}
-
-src_reg::src_reg(int32_t i)
-{
- init();
-
- this->file = IMM;
- this->type = BRW_REGISTER_TYPE_D;
- this->d = i;
-}
-
-src_reg::src_reg(uint8_t vf[4])
-{
- init();
-
- this->file = IMM;
- this->type = BRW_REGISTER_TYPE_VF;
- memcpy(&this->ud, vf, sizeof(unsigned));
-}
-
-src_reg::src_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
-{
- init();
-
- this->file = IMM;
- this->type = BRW_REGISTER_TYPE_VF;
- this->ud = (vf0 << 0) | (vf1 << 8) | (vf2 << 16) | (vf3 << 24);
-}
-
src_reg::src_reg(struct brw_reg reg) :
backend_reg(reg)
{
@@ -382,7 +337,9 @@ vec4_visitor::opt_vector_float()
remaining_channels &= ~inst->dst.writemask;
if (remaining_channels == 0) {
- vec4_instruction *mov = MOV(inst->dst, imm);
+ unsigned vf;
+ memcpy(&vf, imm, sizeof(vf));
+ vec4_instruction *mov = MOV(inst->dst, brw_imm_vf(vf));
mov->dst.type = BRW_REGISTER_TYPE_F;
mov->dst.writemask = WRITEMASK_XYZW;
inst->insert_after(block, mov);
@@ -657,13 +614,13 @@ vec4_visitor::opt_algebraic()
inst->opcode = BRW_OPCODE_MOV;
switch (inst->src[0].type) {
case BRW_REGISTER_TYPE_F:
- inst->src[0] = src_reg(0.0f);
+ inst->src[0] = brw_imm_f(0.0f);
break;
case BRW_REGISTER_TYPE_D:
- inst->src[0] = src_reg(0);
+ inst->src[0] = brw_imm_d(0);
break;
case BRW_REGISTER_TYPE_UD:
- inst->src[0] = src_reg(0u);
+ inst->src[0] = brw_imm_ud(0u);
break;
default:
unreachable("not reached");
@@ -1232,7 +1189,7 @@ vec4_visitor::eliminate_find_live_channel()
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
if (depth == 0) {
inst->opcode = BRW_OPCODE_MOV;
- inst->src[0] = src_reg(0);
+ inst->src[0] = brw_imm_d(0);
inst->force_writemask_all = true;
progress = true;
}
@@ -1701,7 +1658,7 @@ vec4_visitor::emit_shader_time_end()
*/
src_reg reset_end = shader_end_time;
reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
- vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u)));
+ vec4_instruction *test = emit(AND(dst_null_ud(), reset_end, brw_imm_ud(1u)));
test->conditional_mod = BRW_CONDITIONAL_Z;
emit(IF(BRW_PREDICATE_NORMAL));
@@ -1715,12 +1672,12 @@ vec4_visitor::emit_shader_time_end()
* is 2 cycles. Remove that overhead, so I can forget about that when
* trying to determine the time taken for single instructions.
*/
- emit(ADD(diff, src_reg(diff), src_reg(-2u)));
+ emit(ADD(diff, src_reg(diff), brw_imm_ud(-2u)));
emit_shader_time_write(0, src_reg(diff));
- emit_shader_time_write(1, src_reg(1u));
+ emit_shader_time_write(1, brw_imm_ud(1u));
emit(BRW_OPCODE_ELSE);
- emit_shader_time_write(2, src_reg(1u));
+ emit_shader_time_write(2, brw_imm_ud(1u));
emit(BRW_OPCODE_ENDIF);
}
@@ -1736,7 +1693,7 @@ vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
offset.type = BRW_REGISTER_TYPE_UD;
int index = shader_time_index * 3 + shader_time_subindex;
- emit(MOV(offset, src_reg(index * SHADER_TIME_STRIDE)));
+ emit(MOV(offset, brw_imm_d(index * SHADER_TIME_STRIDE)));
time.type = BRW_REGISTER_TYPE_UD;
emit(MOV(time, value));
@@ -1762,11 +1719,6 @@ vec4_visitor::convert_to_hw_regs()
reg.negate = src.negate;
break;
- case IMM:
- reg = brw_imm_reg(src.type);
- reg.ud = src.ud;
- break;
-
case UNIFORM:
reg = stride(brw_vec4_grf(prog_data->base.dispatch_grf_start_reg +
(src.nr + src.reg_offset) / 2,
@@ -1783,6 +1735,7 @@ vec4_visitor::convert_to_hw_regs()
case ARF:
case FIXED_GRF:
+ case IMM:
continue;
case BAD_FILE:
@@ -1978,13 +1931,19 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
void *mem_ctx,
const struct brw_vs_prog_key *key,
struct brw_vs_prog_data *prog_data,
- const nir_shader *shader,
+ const nir_shader *src_shader,
gl_clip_plane *clip_planes,
bool use_legacy_snorm_formula,
int shader_time_index,
unsigned *final_assembly_size,
char **error_str)
{
+ nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+ shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
+ compiler->scalar_stage[MESA_SHADER_VERTEX]);
+ shader = brw_postprocess_nir(shader, compiler->devinfo,
+ compiler->scalar_stage[MESA_SHADER_VERTEX]);
+
const unsigned *assembly = NULL;
unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read);
@@ -2002,7 +1961,7 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
* Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in
* vec4 mode, the hardware appears to wedge unless we read something.
*/
- if (compiler->scalar_vs)
+ if (compiler->scalar_stage[MESA_SHADER_VERTEX])
prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2);
else
prog_data->base.urb_read_length = DIV_ROUND_UP(MAX2(nr_attributes, 1), 2);
@@ -2021,7 +1980,7 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
else
prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
- if (compiler->scalar_vs) {
+ if (compiler->scalar_stage[MESA_SHADER_VERTEX]) {
prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 52d68c5a33d..f94f7128a07 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -276,14 +276,9 @@ public:
uint32_t surface, src_reg surface_reg,
uint32_t sampler, src_reg sampler_reg);
- uint32_t gather_channel(unsigned gather_component,
- uint32_t surface, uint32_t sampler);
src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate,
src_reg sampler);
void emit_gen6_gather_wa(uint8_t wa, dst_reg dst);
- void swizzle_result(ir_texture_opcode op, dst_reg dest,
- src_reg orig_val, uint32_t sampler,
- const glsl_type *dest_type);
void emit_ndc_computation();
void emit_psiz_and_flags(dst_reg reg);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_builder.h b/src/mesa/drivers/dri/i965/brw_vec4_builder.h
index a76a4ce4639..be1427c7db7 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4_builder.h
@@ -484,7 +484,7 @@ namespace brw {
const dst_reg x_times_one_minus_a = vgrf(dst.type);
MUL(y_times_a, y, a);
- ADD(one_minus_a, negate(a), src_reg(1.0f));
+ ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
}
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 1a09f76a20c..b13d36e2c7d 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -30,6 +30,7 @@
#include "brw_vec4_gs_visitor.h"
#include "gen6_gs_visitor.h"
#include "brw_fs.h"
+#include "brw_nir.h"
namespace brw {
@@ -153,7 +154,7 @@ vec4_gs_visitor::emit_prolog()
*/
this->current_annotation = "clear r0.2";
dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD));
- vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2, r0, 0u);
+ vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2, r0, brw_imm_ud(0u));
inst->force_writemask_all = true;
/* Create a virtual register to hold the vertex count */
@@ -161,7 +162,7 @@ vec4_gs_visitor::emit_prolog()
/* Initialize the vertex_count register to 0 */
this->current_annotation = "initialize vertex_count";
- inst = emit(MOV(dst_reg(this->vertex_count), 0u));
+ inst = emit(MOV(dst_reg(this->vertex_count), brw_imm_ud(0u)));
inst->force_writemask_all = true;
if (c->control_data_header_size_bits > 0) {
@@ -176,7 +177,7 @@ vec4_gs_visitor::emit_prolog()
*/
if (c->control_data_header_size_bits <= 32) {
this->current_annotation = "initialize control data bits";
- inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
+ inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u)));
inst->force_writemask_all = true;
}
}
@@ -274,7 +275,7 @@ vec4_gs_visitor::emit_urb_write_header(int mrf)
vec4_instruction *inst = emit(MOV(mrf_reg, r0));
inst->force_writemask_all = true;
emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, this->vertex_count,
- (uint32_t) gs_prog_data->output_vertex_size_hwords);
+ brw_imm_ud(gs_prog_data->output_vertex_size_hwords));
}
@@ -354,11 +355,12 @@ vec4_gs_visitor::emit_control_data_bits()
src_reg dword_index(this, glsl_type::uint_type);
if (urb_write_flags) {
src_reg prev_count(this, glsl_type::uint_type);
- emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
+ emit(ADD(dst_reg(prev_count), this->vertex_count,
+ brw_imm_ud(0xffffffffu)));
unsigned log2_bits_per_vertex =
_mesa_fls(c->control_data_bits_per_vertex);
emit(SHR(dst_reg(dword_index), prev_count,
- (uint32_t) (6 - log2_bits_per_vertex)));
+ brw_imm_ud(6 - log2_bits_per_vertex)));
}
/* Start building the URB write message. The first MRF gets a copy of
@@ -375,8 +377,9 @@ vec4_gs_visitor::emit_control_data_bits()
* the appropriate OWORD within the control data header.
*/
src_reg per_slot_offset(this, glsl_type::uint_type);
- emit(SHR(dst_reg(per_slot_offset), dword_index, 2u));
- emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u);
+ emit(SHR(dst_reg(per_slot_offset), dword_index, brw_imm_ud(2u)));
+ emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset,
+ brw_imm_ud(1u));
}
if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
@@ -388,10 +391,10 @@ vec4_gs_visitor::emit_control_data_bits()
* together.
*/
src_reg channel(this, glsl_type::uint_type);
- inst = emit(AND(dst_reg(channel), dword_index, 3u));
+ inst = emit(AND(dst_reg(channel), dword_index, brw_imm_ud(3u)));
inst->force_writemask_all = true;
src_reg one(this, glsl_type::uint_type);
- inst = emit(MOV(dst_reg(one), 1u));
+ inst = emit(MOV(dst_reg(one), brw_imm_ud(1u)));
inst->force_writemask_all = true;
src_reg channel_mask(this, glsl_type::uint_type);
inst = emit(SHL(dst_reg(channel_mask), one, channel));
@@ -441,11 +444,11 @@ vec4_gs_visitor::set_stream_control_data_bits(unsigned stream_id)
/* reg::sid = stream_id */
src_reg sid(this, glsl_type::uint_type);
- emit(MOV(dst_reg(sid), stream_id));
+ emit(MOV(dst_reg(sid), brw_imm_ud(stream_id)));
/* reg:shift_count = 2 * (vertex_count - 1) */
src_reg shift_count(this, glsl_type::uint_type);
- emit(SHL(dst_reg(shift_count), this->vertex_count, 1u));
+ emit(SHL(dst_reg(shift_count), this->vertex_count, brw_imm_ud(1u)));
/* Note: we're relying on the fact that the GEN SHL instruction only pays
* attention to the lower 5 bits of its second source argument, so on this
@@ -503,8 +506,8 @@ vec4_gs_visitor::gs_emit_vertex(int stream_id)
* vertex_count & (32 / bits_per_vertex - 1) == 0
*/
vec4_instruction *inst =
- emit(AND(dst_null_d(), this->vertex_count,
- (uint32_t) (32 / c->control_data_bits_per_vertex - 1)));
+ emit(AND(dst_null_ud(), this->vertex_count,
+ brw_imm_ud(32 / c->control_data_bits_per_vertex - 1)));
inst->conditional_mod = BRW_CONDITIONAL_Z;
emit(IF(BRW_PREDICATE_NORMAL));
@@ -512,7 +515,7 @@ vec4_gs_visitor::gs_emit_vertex(int stream_id)
/* If vertex_count is 0, then no control data bits have been
* accumulated yet, so we skip emitting them.
*/
- emit(CMP(dst_null_d(), this->vertex_count, 0u,
+ emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u),
BRW_CONDITIONAL_NEQ));
emit(IF(BRW_PREDICATE_NORMAL));
emit_control_data_bits();
@@ -525,7 +528,7 @@ vec4_gs_visitor::gs_emit_vertex(int stream_id)
* effect of any call to EndPrimitive() that the shader may have
* made before outputting its first vertex.
*/
- inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
+ inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u)));
inst->force_writemask_all = true;
}
emit(BRW_OPCODE_ENDIF);
@@ -586,9 +589,9 @@ vec4_gs_visitor::gs_end_primitive()
/* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
src_reg one(this, glsl_type::uint_type);
- emit(MOV(dst_reg(one), 1u));
+ emit(MOV(dst_reg(one), brw_imm_ud(1u)));
src_reg prev_count(this, glsl_type::uint_type);
- emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
+ emit(ADD(dst_reg(prev_count), this->vertex_count, brw_imm_ud(0xffffffffu)));
src_reg mask(this, glsl_type::uint_type);
/* Note: we're relying on the fact that the GEN SHL instruction only pays
* attention to the lower 5 bits of its second source argument, so on this
@@ -604,7 +607,7 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
void *mem_ctx,
const struct brw_gs_prog_key *key,
struct brw_gs_prog_data *prog_data,
- const nir_shader *shader,
+ const nir_shader *src_shader,
struct gl_shader_program *shader_prog,
int shader_time_index,
unsigned *final_assembly_size,
@@ -614,6 +617,12 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
memset(&c, 0, sizeof(c));
c.key = *key;
+ nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+ shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
+ compiler->scalar_stage[MESA_SHADER_GEOMETRY]);
+ shader = brw_postprocess_nir(shader, compiler->devinfo,
+ compiler->scalar_stage[MESA_SHADER_GEOMETRY]);
+
prog_data->include_primitive_id =
(shader->info.inputs_read & VARYING_BIT_PRIMITIVE_ID) != 0;
@@ -773,7 +782,7 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
if (compiler->devinfo->gen == 6)
max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES;
if (output_size_bytes > max_output_size_bytes)
- return false;
+ return NULL;
/* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and
@@ -819,7 +828,7 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
brw_print_vue_map(stderr, &prog_data->base.vue_map);
}
- if (compiler->scalar_gs) {
+ if (compiler->scalar_stage[MESA_SHADER_GEOMETRY]) {
/* TODO: Support instanced GS. We have basically no tests... */
assert(prog_data->invocations == 1);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index bf098b41590..260b515ad42 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -122,7 +122,7 @@ vec4_visitor::nir_setup_inputs()
{
nir_inputs = ralloc_array(mem_ctx, src_reg, nir->num_inputs);
for (unsigned i = 0; i < nir->num_inputs; i++) {
- nir_inputs[i] = dst_reg();
+ nir_inputs[i] = src_reg();
}
nir_foreach_variable(var, &nir->inputs) {
@@ -373,7 +373,7 @@ vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
}
reg.writemask = writemask;
- emit(MOV(reg, src_reg(instr->value.i[i])));
+ emit(MOV(reg, brw_imm_d(instr->value.i[i])));
remaining &= ~writemask;
}
@@ -444,10 +444,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
inst->base_mrf = 2;
inst->mlen = 1; /* always at least one */
- inst->src[1] = src_reg(index);
+ inst->src[1] = brw_imm_ud(index);
/* MRF for the first parameter */
- src_reg lod = src_reg(0);
+ src_reg lod = brw_imm_d(0);
int param_base = inst->base_mrf;
int writemask = WRITEMASK_X;
emit(MOV(dst_reg(MRF, param_base, glsl_type::int_type, writemask), lod));
@@ -471,12 +471,12 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
if (const_uniform_block) {
unsigned index = prog_data->base.binding_table.ssbo_start +
const_uniform_block->u[0];
- surf_index = src_reg(index);
+ surf_index = brw_imm_ud(index);
brw_mark_surface_used(&prog_data->base, index);
} else {
surf_index = src_reg(this, glsl_type::uint_type);
emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[1], 1),
- src_reg(prog_data->base.binding_table.ssbo_start)));
+ brw_imm_ud(prog_data->base.binding_table.ssbo_start)));
surf_index = emit_uniformize(surf_index);
brw_mark_surface_used(&prog_data->base,
@@ -491,7 +491,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
emit(MOV(dst_reg(offset_reg), get_nir_src(instr->src[2], 1)));
} else {
const_offset_bytes = instr->const_index[0];
- emit(MOV(dst_reg(offset_reg), src_reg(const_offset_bytes)));
+ emit(MOV(dst_reg(offset_reg), brw_imm_ud(const_offset_bytes)));
}
/* Value */
@@ -566,7 +566,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
if (skipped_channels > 0) {
if (!has_indirect) {
const_offset_bytes += 4 * skipped_channels;
- offset_reg = src_reg(const_offset_bytes);
+ offset_reg = brw_imm_ud(const_offset_bytes);
} else {
emit(ADD(dst_reg(offset_reg), offset_reg,
brw_imm_ud(4 * skipped_channels)));
@@ -614,13 +614,13 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
if (const_uniform_block) {
unsigned index = prog_data->base.binding_table.ssbo_start +
const_uniform_block->u[0];
- surf_index = src_reg(index);
+ surf_index = brw_imm_ud(index);
brw_mark_surface_used(&prog_data->base, index);
} else {
surf_index = src_reg(this, glsl_type::uint_type);
emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], 1),
- src_reg(prog_data->base.binding_table.ssbo_start)));
+ brw_imm_ud(prog_data->base.binding_table.ssbo_start)));
surf_index = emit_uniformize(surf_index);
/* Assume this may touch any UBO. It would be nice to provide
@@ -637,7 +637,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
emit(MOV(dst_reg(offset_reg), get_nir_src(instr->src[1], 1)));
} else {
const_offset_bytes = instr->const_index[0];
- emit(MOV(dst_reg(offset_reg), src_reg(const_offset_bytes)));
+ emit(MOV(dst_reg(offset_reg), brw_imm_ud((const_offset_bytes))));
}
/* Read the vector */
@@ -762,7 +762,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
*/
const unsigned index = prog_data->base.binding_table.ubo_start +
const_block_index->u[0];
- surf_index = src_reg(index);
+ surf_index = brw_imm_ud(index);
brw_mark_surface_used(&prog_data->base, index);
} else {
/* The block index is not a constant. Evaluate the index expression
@@ -772,7 +772,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
surf_index = src_reg(this, glsl_type::uint_type);
emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], nir_type_int,
instr->num_components),
- src_reg(prog_data->base.binding_table.ubo_start)));
+ brw_imm_ud(prog_data->base.binding_table.ubo_start)));
surf_index = emit_uniformize(surf_index);
/* Assume this may touch any UBO. It would be nice to provide
@@ -787,11 +787,11 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
src_reg offset;
if (!has_indirect) {
- offset = src_reg(const_offset / 16);
+ offset = brw_imm_ud(const_offset / 16);
} else {
offset = src_reg(this, glsl_type::uint_type);
emit(SHR(dst_reg(offset), get_nir_src(instr->src[1], nir_type_int, 1),
- src_reg(4u)));
+ brw_imm_ud(4u)));
}
src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
@@ -848,12 +848,12 @@ vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr)
if (const_surface) {
unsigned surf_index = prog_data->base.binding_table.ssbo_start +
const_surface->u[0];
- surface = src_reg(surf_index);
+ surface = brw_imm_ud(surf_index);
brw_mark_surface_used(&prog_data->base, surf_index);
} else {
surface = src_reg(this, glsl_type::uint_type);
emit(ADD(dst_reg(surface), get_nir_src(instr->src[0]),
- src_reg(prog_data->base.binding_table.ssbo_start)));
+ brw_imm_ud(prog_data->base.binding_table.ssbo_start)));
/* Assume this may touch any UBO. This is the same we do for other
* UBO/SSBO accesses with non-constant surface.
@@ -1174,8 +1174,8 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
brw_conditional_for_nir_comparison(instr->op)));
- emit(MOV(dst, src_reg(0)));
- inst = emit(MOV(dst, src_reg(~0)));
+ emit(MOV(dst, brw_imm_d(0)));
+ inst = emit(MOV(dst, brw_imm_d(~0)));
inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
break;
}
@@ -1192,8 +1192,8 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
brw_conditional_for_nir_comparison(instr->op)));
- emit(MOV(dst, src_reg(0)));
- inst = emit(MOV(dst, src_reg(~0)));
+ emit(MOV(dst, brw_imm_d(0)));
+ inst = emit(MOV(dst, brw_imm_d(~0)));
inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
break;
}
@@ -1235,11 +1235,11 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
break;
case nir_op_f2b:
- emit(CMP(dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
+ emit(CMP(dst, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ));
break;
case nir_op_i2b:
- emit(CMP(dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
+ emit(CMP(dst, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ));
break;
case nir_op_fnoise1_1:
@@ -1321,9 +1321,9 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
* subtract the result from 31 to convert the MSB count into an LSB count.
*/
src_reg src(dst);
- emit(CMP(dst_null_d(), src, src_reg(-1), BRW_CONDITIONAL_NZ));
+ emit(CMP(dst_null_d(), src, brw_imm_d(-1), BRW_CONDITIONAL_NZ));
- inst = emit(ADD(dst, src, src_reg(31)));
+ inst = emit(ADD(dst, src, brw_imm_d(31)));
inst->predicate = BRW_PREDICATE_NORMAL;
inst->src[0].negate = true;
break;
@@ -1364,13 +1364,13 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
* Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
* zero.
*/
- emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
+ emit(CMP(dst_null_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ));
op[0].type = BRW_REGISTER_TYPE_UD;
dst.type = BRW_REGISTER_TYPE_UD;
- emit(AND(dst, op[0], src_reg(0x80000000u)));
+ emit(AND(dst, op[0], brw_imm_ud(0x80000000u)));
- inst = emit(OR(dst, src_reg(dst), src_reg(0x3f800000u)));
+ inst = emit(OR(dst, src_reg(dst), brw_imm_ud(0x3f800000u)));
inst->predicate = BRW_PREDICATE_NORMAL;
dst.type = BRW_REGISTER_TYPE_F;
@@ -1385,9 +1385,9 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
* -> non-negative val generates 0x00000000.
* Predicated OR sets 1 if val is positive.
*/
- emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
- emit(ASR(dst, op[0], src_reg(31)));
- inst = emit(OR(dst, src_reg(dst), src_reg(1)));
+ emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G));
+ emit(ASR(dst, op[0], brw_imm_d(31)));
+ inst = emit(OR(dst, src_reg(dst), brw_imm_d(1)));
inst->predicate = BRW_PREDICATE_NORMAL;
break;
@@ -1418,7 +1418,7 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
break;
case nir_op_bcsel:
- emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
+ emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ));
inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]);
switch (dst.writemask) {
case WRITEMASK_X:
@@ -1465,10 +1465,10 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
unsigned swiz =
brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
- emit(CMP(dst_null_d(), swizzle(op[0], swiz), src_reg(0),
+ emit(CMP(dst_null_d(), swizzle(op[0], swiz), brw_imm_d(0),
BRW_CONDITIONAL_NZ));
- emit(MOV(dst, src_reg(0)));
- inst = emit(MOV(dst, src_reg(~0)));
+ emit(MOV(dst, brw_imm_d(0)));
+ inst = emit(MOV(dst, brw_imm_d(~0)));
inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
break;
}
@@ -1502,7 +1502,7 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
dst_reg masked = dst_reg(this, glsl_type::int_type);
masked.writemask = dst.writemask;
- emit(AND(masked, src_reg(dst), src_reg(1)));
+ emit(AND(masked, src_reg(dst), brw_imm_d(1)));
src_reg masked_neg = src_reg(masked);
masked_neg.negate = true;
emit(MOV(retype(dst, BRW_REGISTER_TYPE_D), masked_neg));
@@ -1551,6 +1551,7 @@ ir_texture_opcode_for_nir_texop(nir_texop texop)
case nir_texop_txf_ms: op = ir_txf_ms; break;
case nir_texop_txl: op = ir_txl; break;
case nir_texop_txs: op = ir_txs; break;
+ case nir_texop_samples_identical: op = ir_samples_identical; break;
default:
unreachable("unknown texture opcode");
}
@@ -1566,7 +1567,7 @@ glsl_type_for_nir_alu_type(nir_alu_type alu_type,
return glsl_type::vec(components);
case nir_type_int:
return glsl_type::ivec(components);
- case nir_type_unsigned:
+ case nir_type_uint:
return glsl_type::uvec(components);
case nir_type_bool:
return glsl_type::bvec(components);
@@ -1582,8 +1583,8 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
{
unsigned texture = instr->texture_index;
unsigned sampler = instr->sampler_index;
- src_reg texture_reg = src_reg(texture);
- src_reg sampler_reg = src_reg(sampler);
+ src_reg texture_reg = brw_imm_ud(texture);
+ src_reg sampler_reg = brw_imm_ud(sampler);
src_reg coordinate;
const glsl_type *coord_type = NULL;
src_reg shadow_comparitor;
@@ -1597,17 +1598,6 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
nir_tex_instr_dest_size(instr));
dst_reg dest = get_nir_dest(instr->dest, instr->dest_type);
- /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
- * emitting anything other than setting up the constant result.
- */
- if (instr->op == nir_texop_tg4) {
- int swiz = GET_SWZ(key_tex->swizzles[sampler], instr->component);
- if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
- emit(MOV(dest, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
- return;
- }
- }
-
/* Load the texture operation sources */
for (unsigned i = 0; i < instr->num_srcs; i++) {
switch (instr->src[i].src_type) {
@@ -1622,6 +1612,7 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
switch (instr->op) {
case nir_texop_txf:
case nir_texop_txf_ms:
+ case nir_texop_samples_identical:
coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D,
src_size);
coord_type = glsl_type::ivec(src_size);
@@ -1661,14 +1652,6 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
case nir_tex_src_ms_index: {
sample_index = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1);
- assert(coord_type != NULL);
- if (devinfo->gen >= 7 &&
- key_tex->compressed_multisample_layout_mask & (1 << texture)) {
- mcs = emit_mcs_fetch(coord_type, coordinate, texture_reg);
- } else {
- mcs = src_reg(0u);
- }
- mcs = retype(mcs, BRW_REGISTER_TYPE_UD);
break;
}
@@ -1693,7 +1676,7 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
/* Emit code to evaluate the actual indexing expression */
src_reg src = get_nir_src(instr->src[i].src, 1);
src_reg temp(this, glsl_type::uint_type);
- emit(ADD(dst_reg(temp), src, src_reg(texture)));
+ emit(ADD(dst_reg(temp), src, brw_imm_ud(texture)));
texture_reg = emit_uniformize(temp);
break;
}
@@ -1702,7 +1685,7 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
/* Emit code to evaluate the actual indexing expression */
src_reg src = get_nir_src(instr->src[i].src, 1);
src_reg temp(this, glsl_type::uint_type);
- emit(ADD(dst_reg(temp), src, src_reg(sampler)));
+ emit(ADD(dst_reg(temp), src, brw_imm_ud(sampler)));
sampler_reg = emit_uniformize(temp);
break;
}
@@ -1718,6 +1701,17 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
}
}
+ if (instr->op == nir_texop_txf_ms ||
+ instr->op == nir_texop_samples_identical) {
+ assert(coord_type != NULL);
+ if (devinfo->gen >= 7 &&
+ key_tex->compressed_multisample_layout_mask & (1 << sampler)) {
+ mcs = emit_mcs_fetch(coord_type, coordinate, sampler_reg);
+ } else {
+ mcs = brw_imm_ud(0u);
+ }
+ }
+
uint32_t constant_offset = 0;
for (unsigned i = 0; i < 3; i++) {
if (instr->const_offset[i] != 0) {
@@ -1727,8 +1721,17 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
}
/* Stuff the channel select bits in the top of the texture offset */
- if (instr->op == nir_texop_tg4)
- constant_offset |= gather_channel(instr->component, texture, sampler) << 16;
+ if (instr->op == nir_texop_tg4) {
+ if (instr->component == 1 &&
+ (key_tex->gather_channel_quirk_mask & (1 << texture))) {
+ /* gather4 sampler is broken for green channel on RG32F --
+ * we must ask for blue instead.
+ */
+ constant_offset |= 2 << 16;
+ } else {
+ constant_offset |= instr->component << 16;
+ }
+ }
ir_texture_opcode op = ir_texture_opcode_for_nir_texop(instr->op);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp
index a7c286d3ac1..28002c56cdc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp
@@ -71,7 +71,7 @@ namespace {
bld.MOV(writemask(tmp, mask), src);
if (n < 4)
- bld.MOV(writemask(tmp, ~mask), 0);
+ bld.MOV(writemask(tmp, ~mask), brw_imm_d(0));
return emit_stride(bld, src_reg(tmp), n, has_simd4x2 ? 1 : 4, 1);
}
@@ -143,7 +143,7 @@ namespace brw {
/* Emit the message send instruction. */
const dst_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, ret_sz);
vec4_instruction *inst =
- bld.emit(op, dst, src_reg(payload), usurface, arg);
+ bld.emit(op, dst, src_reg(payload), usurface, brw_imm_ud(arg));
inst->mlen = sz;
inst->regs_written = ret_sz;
inst->header_size = header_sz;
@@ -235,7 +235,7 @@ namespace brw {
const vec4_builder ubld = bld.exec_all();
const dst_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
- ubld.MOV(dst, src_reg(0));
+ ubld.MOV(dst, brw_imm_d(0));
if (bld.shader->devinfo->gen == 7 &&
!bld.shader->devinfo->is_haswell) {
@@ -243,7 +243,7 @@ namespace brw {
* have no SIMD4x2 variant. We only use the two X channels
* in that case, mask everything else out.
*/
- ubld.MOV(writemask(dst, WRITEMASK_W), src_reg(0x11));
+ ubld.MOV(writemask(dst, WRITEMASK_W), brw_imm_d(0x11));
}
return src_reg(dst);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 6b8798da71c..caf1ee02bf0 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -408,7 +408,7 @@ vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
* You should inspect the disasm output in order to verify that the MOV is
* not optimized away.
*/
- emit(MOV(tmp_dst, src_reg(0x12345678u)));
+ emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
#endif
/* Give tmp the form below, where "." means untouched.
@@ -427,7 +427,7 @@ vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
* 0xhhhh0000
*/
tmp_src.swizzle = BRW_SWIZZLE_YYYY;
- emit(SHL(dst, tmp_src, src_reg(16u)));
+ emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
/* Finally, give the write-channels of dst the form of packHalf2x16's
* output:
@@ -466,10 +466,10 @@ vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
src_reg tmp_src(tmp_dst);
tmp_dst.writemask = WRITEMASK_X;
- emit(AND(tmp_dst, src0, src_reg(0xffffu)));
+ emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
tmp_dst.writemask = WRITEMASK_Y;
- emit(SHR(tmp_dst, src0, src_reg(16u)));
+ emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
dst.writemask = WRITEMASK_XY;
emit(F16TO32(dst, tmp_src));
@@ -484,7 +484,7 @@ vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
* vector float and a type-converting MOV.
*/
dst_reg shift(this, glsl_type::uvec4_type);
- emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
+ emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
dst_reg shifted(this, glsl_type::uvec4_type);
src0.swizzle = BRW_SWIZZLE_XXXX;
@@ -494,7 +494,7 @@ vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
dst_reg f(this, glsl_type::vec4_type);
emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
- emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
+ emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
}
void
@@ -506,7 +506,7 @@ vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
* vector float and a type-converting MOV.
*/
dst_reg shift(this, glsl_type::uvec4_type);
- emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
+ emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
dst_reg shifted(this, glsl_type::uvec4_type);
src0.swizzle = BRW_SWIZZLE_XXXX;
@@ -517,11 +517,11 @@ vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
dst_reg scaled(this, glsl_type::vec4_type);
- emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
+ emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
dst_reg max(this, glsl_type::vec4_type);
- emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
- emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
+ emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
+ emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
}
void
@@ -532,7 +532,7 @@ vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
inst->saturate = true;
dst_reg scaled(this, glsl_type::vec4_type);
- emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
+ emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
dst_reg rounded(this, glsl_type::vec4_type);
emit(RNDE(rounded, src_reg(scaled)));
@@ -548,13 +548,13 @@ void
vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
{
dst_reg max(this, glsl_type::vec4_type);
- emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
+ emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
dst_reg min(this, glsl_type::vec4_type);
- emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
+ emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
dst_reg scaled(this, glsl_type::vec4_type);
- emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
+ emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
dst_reg rounded(this, glsl_type::vec4_type);
emit(RNDE(rounded, src_reg(scaled)));
@@ -716,7 +716,7 @@ vec4_visitor::emit_lrp(const dst_reg &dst,
x_times_one_minus_a.writemask = dst.writemask;
emit(MUL(y_times_a, y, a));
- emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
+ emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
}
@@ -850,7 +850,7 @@ vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
coordinate));
emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
- src_reg(0)));
+ brw_imm_d(0)));
emit(inst);
return src_reg(inst->dst);
@@ -892,7 +892,7 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
*/
if (op == ir_tex || op == ir_query_levels) {
assert(lod.file == BAD_FILE);
- lod = src_reg(0.0f);
+ lod = brw_imm_f(0.0f);
}
enum opcode opcode;
@@ -912,12 +912,18 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
unreachable("TXB is not valid for vertex shaders.");
case ir_lod:
unreachable("LOD is not valid for vertex shaders.");
+ case ir_samples_identical: {
+ /* There are some challenges implementing this for vec4, and it seems
+ * unlikely to be used anyway. For now, just return false ways.
+ */
+ emit(MOV(dest, brw_imm_ud(0u)));
+ return;
+ }
default:
unreachable("Unrecognized tex op");
}
- vec4_instruction *inst = new(mem_ctx) vec4_instruction(
- opcode, dst_reg(this, dest_type));
+ vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
inst->offset = constant_offset;
@@ -963,7 +969,7 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
if (zero_mask != 0) {
emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
- src_reg(0)));
+ brw_imm_d(0)));
}
/* Load the shadow comparitor */
if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
@@ -1062,15 +1068,20 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
if (op == ir_txs && is_cube_array) {
emit_math(SHADER_OPCODE_INT_QUOTIENT,
writemask(inst->dst, WRITEMASK_Z),
- src_reg(inst->dst), src_reg(6));
+ src_reg(inst->dst), brw_imm_d(6));
}
if (devinfo->gen == 6 && op == ir_tg4) {
emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
}
- swizzle_result(op, dest,
- src_reg(inst->dst), sampler, dest_type);
+ if (op == ir_query_levels) {
+ /* # levels is in .w */
+ src_reg swizzled(dest);
+ swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
+ SWIZZLE_W, SWIZZLE_W);
+ emit(MOV(dest, swizzled));
+ }
}
/**
@@ -1087,7 +1098,7 @@ vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
dst_f.type = BRW_REGISTER_TYPE_F;
/* Convert from UNORM to UINT */
- emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
+ emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
emit(MOV(dst, src_reg(dst_f)));
if (wa & WA_SIGN) {
@@ -1095,90 +1106,8 @@ vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
* shifting the sign bit into place, then shifting back
* preserving sign.
*/
- emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
- emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
- }
-}
-
-/**
- * Set up the gather channel based on the swizzle, for gather4.
- */
-uint32_t
-vec4_visitor::gather_channel(unsigned gather_component,
- uint32_t surface, uint32_t sampler)
-{
- int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
- switch (swiz) {
- case SWIZZLE_X: return 0;
- case SWIZZLE_Y:
- /* gather4 sampler is broken for green channel on RG32F --
- * we must ask for blue instead.
- */
- if (key_tex->gather_channel_quirk_mask & (1 << surface))
- return 2;
- return 1;
- case SWIZZLE_Z: return 2;
- case SWIZZLE_W: return 3;
- default:
- unreachable("Not reached"); /* zero, one swizzles handled already */
- }
-}
-
-void
-vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
- src_reg orig_val, uint32_t sampler,
- const glsl_type *dest_type)
-{
- int s = key_tex->swizzles[sampler];
-
- dst_reg swizzled_result = dest;
-
- if (op == ir_query_levels) {
- /* # levels is in .w */
- orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
- emit(MOV(swizzled_result, orig_val));
- return;
- }
-
- if (op == ir_txs || dest_type == glsl_type::float_type
- || s == SWIZZLE_NOOP || op == ir_tg4) {
- emit(MOV(swizzled_result, orig_val));
- return;
- }
-
-
- int zero_mask = 0, one_mask = 0, copy_mask = 0;
- int swizzle[4] = {0};
-
- for (int i = 0; i < 4; i++) {
- switch (GET_SWZ(s, i)) {
- case SWIZZLE_ZERO:
- zero_mask |= (1 << i);
- break;
- case SWIZZLE_ONE:
- one_mask |= (1 << i);
- break;
- default:
- copy_mask |= (1 << i);
- swizzle[i] = GET_SWZ(s, i);
- break;
- }
- }
-
- if (copy_mask) {
- orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
- swizzled_result.writemask = copy_mask;
- emit(MOV(swizzled_result, orig_val));
- }
-
- if (zero_mask) {
- swizzled_result.writemask = zero_mask;
- emit(MOV(swizzled_result, src_reg(0.0f)));
- }
-
- if (one_mask) {
- swizzled_result.writemask = one_mask;
- emit(MOV(swizzled_result, src_reg(1.0f)));
+ emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
+ emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
}
}
@@ -1225,7 +1154,7 @@ vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
*/
vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
src_payload,
- src_reg(surf_index), src_reg(atomic_op));
+ brw_imm_ud(surf_index), brw_imm_ud(atomic_op));
inst->mlen = mlen;
}
@@ -1245,7 +1174,7 @@ vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
*/
vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
src_reg(offset),
- src_reg(surf_index), src_reg(1));
+ brw_imm_ud(surf_index), brw_imm_d(1));
inst->mlen = 1;
}
@@ -1286,14 +1215,14 @@ vec4_visitor::emit_psiz_and_flags(dst_reg reg)
dst_reg header1_w = header1;
header1_w.writemask = WRITEMASK_W;
- emit(MOV(header1, 0u));
+ emit(MOV(header1, brw_imm_ud(0u)));
if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
current_annotation = "Point size";
- emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
- emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
+ emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
+ emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
}
if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
@@ -1301,13 +1230,13 @@ vec4_visitor::emit_psiz_and_flags(dst_reg reg)
dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
- emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
- emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
+ emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
+ emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
- emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
- emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
- emit(SHL(flags1, src_reg(flags1), src_reg(4)));
+ emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
+ emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
+ emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
}
@@ -1324,20 +1253,20 @@ vec4_visitor::emit_psiz_and_flags(dst_reg reg)
output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
ndc_w.swizzle = BRW_SWIZZLE_WWWW;
- emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
+ emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
vec4_instruction *inst;
- inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
+ inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
inst->predicate = BRW_PREDICATE_NORMAL;
output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
- inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
+ inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], brw_imm_f(0.0f)));
inst->predicate = BRW_PREDICATE_NORMAL;
}
emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
} else if (devinfo->gen < 6) {
- emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
+ emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
} else {
- emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
+ emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
dst_reg reg_w = reg;
reg_w.writemask = WRITEMASK_W;
@@ -1529,13 +1458,13 @@ vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
src_reg index = src_reg(this, glsl_type::int_type);
emit_before(block, inst, ADD(dst_reg(index), *reladdr,
- src_reg(reg_offset)));
+ brw_imm_d(reg_offset)));
emit_before(block, inst, MUL(dst_reg(index), index,
- src_reg(message_header_scale)));
+ brw_imm_d(message_header_scale)));
return index;
} else {
- return src_reg(reg_offset * message_header_scale);
+ return brw_imm_d(reg_offset * message_header_scale);
}
}
@@ -1547,24 +1476,24 @@ vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
src_reg index = src_reg(this, glsl_type::int_type);
emit_before(block, inst, ADD(dst_reg(index), *reladdr,
- src_reg(reg_offset)));
+ brw_imm_d(reg_offset)));
/* Pre-gen6, the message header uses byte offsets instead of vec4
* (16-byte) offset units.
*/
if (devinfo->gen < 6) {
- emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
+ emit_before(block, inst, MUL(dst_reg(index), index, brw_imm_d(16)));
}
return index;
} else if (devinfo->gen >= 8) {
/* Store the offset in a GRF so we can send-from-GRF. */
src_reg offset = src_reg(this, glsl_type::int_type);
- emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
+ emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset)));
return offset;
} else {
int message_header_scale = devinfo->gen < 6 ? 16 : 1;
- return src_reg(reg_offset * message_header_scale);
+ return brw_imm_d(reg_offset * message_header_scale);
}
}
@@ -1753,7 +1682,7 @@ vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
reg_offset);
emit_pull_constant_load_reg(temp,
- src_reg(index),
+ brw_imm_ud(index),
offset,
block, inst);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
index 5dd4f98cecc..fd8be7d972c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
@@ -50,7 +50,7 @@ vec4_vs_visitor::emit_prolog()
dst_reg dst = reg;
dst.type = brw_type_for_base_type(glsl_type::vec4_type);
dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
- emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
+ emit(MUL(dst, src_reg(dst), brw_imm_f(1.0f / 65536.0f)));
}
/* Do sign recovery for 2101010 formats if required. */
@@ -58,8 +58,8 @@ vec4_vs_visitor::emit_prolog()
if (sign_recovery_shift.file == BAD_FILE) {
/* shift constant: <22,22,22,30> */
sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
- emit(MOV(writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
- emit(MOV(writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
+ emit(MOV(writemask(sign_recovery_shift, WRITEMASK_XYZ), brw_imm_ud(22u)));
+ emit(MOV(writemask(sign_recovery_shift, WRITEMASK_W), brw_imm_ud(30u)));
}
emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
@@ -87,16 +87,16 @@ vec4_vs_visitor::emit_prolog()
/* mul constant: 1 / (2^(b-1) - 1) */
es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
emit(MOV(writemask(es3_normalize_factor, WRITEMASK_XYZ),
- src_reg(1.0f / ((1<<9) - 1))));
+ brw_imm_f(1.0f / ((1<<9) - 1))));
emit(MOV(writemask(es3_normalize_factor, WRITEMASK_W),
- src_reg(1.0f / ((1<<1) - 1))));
+ brw_imm_f(1.0f / ((1<<1) - 1))));
}
dst_reg dst = reg;
dst.type = brw_type_for_base_type(glsl_type::vec4_type);
emit(MOV(dst, src_reg(reg_d)));
emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
- emit_minmax(BRW_CONDITIONAL_GE, dst, src_reg(dst), src_reg(-1.0f));
+ emit_minmax(BRW_CONDITIONAL_GE, dst, src_reg(dst), brw_imm_f(-1.0f));
} else {
/* The following equations are from the OpenGL 3.2 specification:
*
@@ -113,9 +113,9 @@ vec4_vs_visitor::emit_prolog()
/* 1 / (2^b - 1) for b=<10,10,10,2> */
normalize_factor = dst_reg(this, glsl_type::vec4_type);
emit(MOV(writemask(normalize_factor, WRITEMASK_XYZ),
- src_reg(1.0f / ((1<<10) - 1))));
+ brw_imm_f(1.0f / ((1<<10) - 1))));
emit(MOV(writemask(normalize_factor, WRITEMASK_W),
- src_reg(1.0f / ((1<<2) - 1))));
+ brw_imm_f(1.0f / ((1<<2) - 1))));
}
dst_reg dst = reg;
@@ -124,8 +124,8 @@ vec4_vs_visitor::emit_prolog()
/* For signed normalization, we want the numerator to be 2c+1. */
if (wa_flags & BRW_ATTRIB_WA_SIGN) {
- emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
- emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
+ emit(MUL(dst, src_reg(dst), brw_imm_f(2.0f)));
+ emit(ADD(dst, src_reg(dst), brw_imm_f(1.0f)));
}
emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 967448e0e41..7c783f66864 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -48,6 +48,7 @@ brw_codegen_vs_prog(struct brw_context *brw,
struct brw_vertex_program *vp,
struct brw_vs_prog_key *key)
{
+ const struct brw_compiler *compiler = brw->intelScreen->compiler;
GLuint program_size;
const GLuint *program;
struct brw_vs_prog_data prog_data;
@@ -79,7 +80,7 @@ brw_codegen_vs_prog(struct brw_context *brw,
* by the state cache.
*/
int param_count = vp->program.Base.nir->num_uniforms;
- if (!brw->intelScreen->compiler->scalar_vs)
+ if (!compiler->scalar_stage[MESA_SHADER_VERTEX])
param_count *= 4;
if (vs)
@@ -102,7 +103,7 @@ brw_codegen_vs_prog(struct brw_context *brw,
if (prog) {
brw_nir_setup_glsl_uniforms(vp->program.Base.nir, prog, &vp->program.Base,
&prog_data.base.base,
- brw->intelScreen->compiler->scalar_vs);
+ compiler->scalar_stage[MESA_SHADER_VERTEX]);
} else {
brw_nir_setup_arb_uniforms(vp->program.Base.nir, &vp->program.Base,
&prog_data.base.base);
@@ -173,7 +174,7 @@ brw_codegen_vs_prog(struct brw_context *brw,
/* Emit GEN4 code.
*/
char *error_str;
- program = brw_compile_vs(brw->intelScreen->compiler, brw, mem_ctx, key,
+ program = brw_compile_vs(compiler, brw, mem_ctx, key,
&prog_data, vp->program.Base.nir,
brw_select_clip_planes(&brw->ctx),
!_mesa_is_gles3(&brw->ctx),
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
index 2fef188c17e..3840ce0fe57 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
@@ -65,7 +65,7 @@ gen6_gs_visitor::emit_prolog()
(prog_data->vue_map.num_slots + 1) *
nir->info.gs.vertices_out);
this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
- emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));
+ emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
/* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
* so initialize it once to R0.
@@ -87,13 +87,13 @@ gen6_gs_visitor::emit_prolog()
* headers.
*/
this->first_vertex = src_reg(this, glsl_type::uint_type);
- emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
+ emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(URB_WRITE_PRIM_START)));
/* The FF_SYNC message requires to know the number of primitives generated,
* so keep a counter for this.
*/
this->prim_count = src_reg(this, glsl_type::uint_type);
- emit(MOV(dst_reg(this->prim_count), 0u));
+ emit(MOV(dst_reg(this->prim_count), brw_imm_ud(0u)));
if (gs_prog_data->gen6_xfb_enabled) {
/* Create a virtual register to hold destination indices in SOL */
@@ -170,7 +170,7 @@ gen6_gs_visitor::gs_emit_vertex(int stream_id)
}
emit(ADD(dst_reg(this->vertex_output_offset),
- this->vertex_output_offset, 1u));
+ this->vertex_output_offset, brw_imm_ud(1u)));
}
/* Now buffer flags for this vertex */
@@ -181,9 +181,9 @@ gen6_gs_visitor::gs_emit_vertex(int stream_id)
/* If we are outputting points, then every vertex has PrimStart and
* PrimEnd set.
*/
- emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
- URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
- emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
+ emit(MOV(dst, brw_imm_d((_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
+ URB_WRITE_PRIM_START | URB_WRITE_PRIM_END)));
+ emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
} else {
/* Otherwise, we can only set the PrimStart flag, which we have stored
* in the first_vertex register. We will have to wait until we execute
@@ -191,11 +191,12 @@ gen6_gs_visitor::gs_emit_vertex(int stream_id)
* vertex.
*/
emit(OR(dst, this->first_vertex,
- (gs_prog_data->output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
- emit(MOV(dst_reg(this->first_vertex), 0u));
+ brw_imm_ud(gs_prog_data->output_topology <<
+ URB_WRITE_PRIM_TYPE_SHIFT)));
+ emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(0u)));
}
emit(ADD(dst_reg(this->vertex_output_offset),
- this->vertex_output_offset, 1u));
+ this->vertex_output_offset, brw_imm_ud(1u)));
}
void
@@ -218,10 +219,10 @@ gen6_gs_visitor::gs_end_primitive()
* below).
*/
unsigned num_output_vertices = nir->info.gs.vertices_out;
- emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
- BRW_CONDITIONAL_L));
- vec4_instruction *inst = emit(CMP(dst_null_d(),
- this->vertex_count, 0u,
+ emit(CMP(dst_null_ud(), this->vertex_count,
+ brw_imm_ud(num_output_vertices + 1), BRW_CONDITIONAL_L));
+ vec4_instruction *inst = emit(CMP(dst_null_ud(),
+ this->vertex_count, brw_imm_ud(0u),
BRW_CONDITIONAL_NEQ));
inst->predicate = BRW_PREDICATE_NORMAL;
emit(IF(BRW_PREDICATE_NORMAL));
@@ -231,19 +232,19 @@ gen6_gs_visitor::gs_end_primitive()
* vertex.
*/
src_reg offset(this, glsl_type::uint_type);
- emit(ADD(dst_reg(offset), this->vertex_output_offset, src_reg(-1)));
+ emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1)));
src_reg dst(this->vertex_output);
dst.reladdr = ralloc(mem_ctx, src_reg);
memcpy(dst.reladdr, &offset, sizeof(src_reg));
- emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END));
- emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
+ emit(OR(dst_reg(dst), dst, brw_imm_d(URB_WRITE_PRIM_END)));
+ emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
/* Set the first vertex flag to indicate that the next vertex will start
* a primitive.
*/
- emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
+ emit(MOV(dst_reg(this->first_vertex), brw_imm_d(URB_WRITE_PRIM_START)));
}
emit(BRW_OPCODE_ENDIF);
}
@@ -262,7 +263,8 @@ gen6_gs_visitor::emit_urb_write_header(int mrf)
*/
src_reg flags_offset(this, glsl_type::uint_type);
emit(ADD(dst_reg(flags_offset),
- this->vertex_output_offset, src_reg(prog_data->vue_map.num_slots)));
+ this->vertex_output_offset,
+ brw_imm_d(prog_data->vue_map.num_slots)));
src_reg flags_data(this->vertex_output);
flags_data.reladdr = ralloc(mem_ctx, src_reg);
@@ -321,7 +323,7 @@ gen6_gs_visitor::emit_thread_end()
* points because in the point case we set PrimEnd on all vertices.
*/
if (nir->info.gs.output_primitive != GL_POINTS) {
- emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
+ emit(CMP(dst_null_ud(), this->first_vertex, brw_imm_ud(0u), BRW_CONDITIONAL_Z));
emit(IF(BRW_PREDICATE_NORMAL));
gs_end_primitive();
emit(BRW_OPCODE_ENDIF);
@@ -347,7 +349,7 @@ gen6_gs_visitor::emit_thread_end()
int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
/* Issue the FF_SYNC message and obtain the initial VUE handle. */
- emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G));
+ emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), BRW_CONDITIONAL_G));
emit(IF(BRW_PREDICATE_NORMAL));
{
this->current_annotation = "gen6 thread end: ff_sync";
@@ -364,15 +366,15 @@ gen6_gs_visitor::emit_thread_end()
dst_reg(this->temp), this->prim_count, this->svbi);
} else {
inst = emit(GS_OPCODE_FF_SYNC,
- dst_reg(this->temp), this->prim_count, src_reg(0u));
+ dst_reg(this->temp), this->prim_count, brw_imm_ud(0u));
}
inst->base_mrf = base_mrf;
/* Loop over all buffered vertices and emit URB write messages */
this->current_annotation = "gen6 thread end: urb writes init";
src_reg vertex(this, glsl_type::uint_type);
- emit(MOV(dst_reg(vertex), 0u));
- emit(MOV(dst_reg(this->vertex_output_offset), 0u));
+ emit(MOV(dst_reg(vertex), brw_imm_ud(0u)));
+ emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
this->current_annotation = "gen6 thread end: urb writes";
emit(BRW_OPCODE_DO);
@@ -416,7 +418,7 @@ gen6_gs_visitor::emit_thread_end()
mrf++;
emit(ADD(dst_reg(this->vertex_output_offset),
- this->vertex_output_offset, 1u));
+ this->vertex_output_offset, brw_imm_ud(1u)));
/* If this was max_usable_mrf, we can't fit anything more into
* this URB WRITE. Same if we reached the max. message length.
@@ -437,9 +439,9 @@ gen6_gs_visitor::emit_thread_end()
* writing the next vertex.
*/
emit(ADD(dst_reg(this->vertex_output_offset),
- this->vertex_output_offset, 1u));
+ this->vertex_output_offset, brw_imm_ud(1u)));
- emit(ADD(dst_reg(vertex), vertex, 1u));
+ emit(ADD(dst_reg(vertex), vertex, brw_imm_ud(1u)));
}
emit(BRW_OPCODE_WHILE);
@@ -468,8 +470,8 @@ gen6_gs_visitor::emit_thread_end()
if (gs_prog_data->gen6_xfb_enabled) {
/* When emitting EOT, set SONumPrimsWritten Increment Value. */
src_reg data(this, glsl_type::uint_type);
- emit(AND(dst_reg(data), this->sol_prim_written, src_reg(0xffffu)));
- emit(SHL(dst_reg(data), data, src_reg(16u)));
+ emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu)));
+ emit(SHL(dst_reg(data), data, brw_imm_ud(16u)));
emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
}
@@ -588,8 +590,8 @@ gen6_gs_visitor::xfb_write()
this->current_annotation = "gen6 thread end: svb writes init";
- emit(MOV(dst_reg(this->vertex_output_offset), 0u));
- emit(MOV(dst_reg(this->sol_prim_written), 0u));
+ emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
+ emit(MOV(dst_reg(this->sol_prim_written), brw_imm_ud(0u)));
/* Check that at least one primitive can be written
*
@@ -600,7 +602,7 @@ gen6_gs_visitor::xfb_write()
* transform feedback is in interleaved or separate attribs mode.
*/
src_reg sol_temp(this, glsl_type::uvec4_type);
- emit(ADD(dst_reg(sol_temp), this->svbi, src_reg(num_verts)));
+ emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts)));
/* Compare SVBI calculated number with the maximum value, which is
* in R1.4 (previously saved in this->max_svbi) for gen6.
@@ -623,7 +625,7 @@ gen6_gs_visitor::xfb_write()
/* Write transform feedback data for all processed vertices. */
for (int i = 0; i < (int)nir->info.gs.vertices_out; i++) {
- emit(MOV(dst_reg(sol_temp), i));
+ emit(MOV(dst_reg(sol_temp), brw_imm_d(i)));
emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
BRW_CONDITIONAL_L));
emit(IF(BRW_PREDICATE_NORMAL));
@@ -644,8 +646,8 @@ gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
/* Check for buffer overflow: we need room to write the complete primitive
* (all vertices). Otherwise, avoid writing any vertices for it
*/
- emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u));
- emit(MUL(dst_reg(sol_temp), sol_temp, src_reg(num_verts)));
+ emit(ADD(dst_reg(sol_temp), this->sol_prim_written, brw_imm_ud(1u)));
+ emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts)));
emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
emit(IF(BRW_PREDICATE_NORMAL));
@@ -683,7 +685,7 @@ gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
src_reg data(this->vertex_output);
data.reladdr = ralloc(mem_ctx, src_reg);
int offset = get_vertex_output_offset_for_varying(vertex, varying);
- emit(MOV(dst_reg(this->vertex_output_offset), offset));
+ emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_d(offset)));
memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
data.type = output_reg[varying].type;
@@ -710,9 +712,9 @@ gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
*/
emit(ADD(dst_reg(this->destination_indices),
this->destination_indices,
- src_reg(num_verts)));
+ brw_imm_ud(num_verts)));
emit(ADD(dst_reg(this->sol_prim_written),
- this->sol_prim_written, 1u));
+ this->sol_prim_written, brw_imm_ud(1u)));
}
}
diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
index 9f4a5db3592..d508c4c9278 100644
--- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
+++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
@@ -136,8 +136,8 @@ emit_pipeline_stat(struct brw_context *brw, drm_intel_bo *bo,
IA_VERTICES_COUNT, /* VERTICES_SUBMITTED */
IA_PRIMITIVES_COUNT, /* PRIMITIVES_SUBMITTED */
VS_INVOCATION_COUNT, /* VERTEX_SHADER_INVOCATIONS */
- 0, /* HS_INVOCATION_COUNT,*/ /* TESS_CONTROL_SHADER_PATCHES */
- 0, /* DS_INVOCATION_COUNT,*/ /* TESS_EVALUATION_SHADER_INVOCATIONS */
+ HS_INVOCATION_COUNT, /* TESS_CONTROL_SHADER_PATCHES */
+ DS_INVOCATION_COUNT, /* TESS_EVALUATION_SHADER_INVOCATIONS */
GS_PRIMITIVES_COUNT, /* GEOMETRY_SHADER_PRIMITIVES_EMITTED */
PS_INVOCATION_COUNT, /* FRAGMENT_SHADER_INVOCATIONS */
CS_INVOCATION_COUNT, /* COMPUTE_SHADER_INVOCATIONS */
@@ -231,6 +231,8 @@ gen6_queryobj_get_results(struct gl_context *ctx,
case GL_CLIPPING_INPUT_PRIMITIVES_ARB:
case GL_CLIPPING_OUTPUT_PRIMITIVES_ARB:
case GL_COMPUTE_SHADER_INVOCATIONS_ARB:
+ case GL_TESS_CONTROL_SHADER_PATCHES_ARB:
+ case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB:
query->Base.Result = results[1] - results[0];
break;
@@ -250,8 +252,6 @@ gen6_queryobj_get_results(struct gl_context *ctx,
query->Base.Result /= 4;
break;
- case GL_TESS_CONTROL_SHADER_PATCHES_ARB:
- case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB:
default:
unreachable("Unrecognized query target in brw_queryobj_get_results()");
}
@@ -329,11 +329,11 @@ gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q)
case GL_CLIPPING_INPUT_PRIMITIVES_ARB:
case GL_CLIPPING_OUTPUT_PRIMITIVES_ARB:
case GL_COMPUTE_SHADER_INVOCATIONS_ARB:
+ case GL_TESS_CONTROL_SHADER_PATCHES_ARB:
+ case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB:
emit_pipeline_stat(brw, query->bo, query->Base.Stream, query->Base.Target, 0);
break;
- case GL_TESS_CONTROL_SHADER_PATCHES_ARB:
- case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB:
default:
unreachable("Unrecognized query target in brw_begin_query()");
}
@@ -381,12 +381,12 @@ gen6_end_query(struct gl_context *ctx, struct gl_query_object *q)
case GL_CLIPPING_INPUT_PRIMITIVES_ARB:
case GL_CLIPPING_OUTPUT_PRIMITIVES_ARB:
case GL_GEOMETRY_SHADER_INVOCATIONS:
+ case GL_TESS_CONTROL_SHADER_PATCHES_ARB:
+ case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB:
emit_pipeline_stat(brw, query->bo,
query->Base.Stream, query->Base.Target, 1);
break;
- case GL_TESS_CONTROL_SHADER_PATCHES_ARB:
- case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB:
default:
unreachable("Unrecognized query target in brw_end_query()");
}
diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
index 69162171c4e..161de77e156 100644
--- a/src/mesa/drivers/dri/i965/gen7_urb.c
+++ b/src/mesa/drivers/dri/i965/gen7_urb.c
@@ -118,7 +118,7 @@ gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
/* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
*
- * A PIPE_CONTOL command with the CS Stall bit set must be programmed
+ * A PIPE_CONTROL command with the CS Stall bit set must be programmed
* in the ring after this instruction.
*
* No such restriction exists for Haswell or Baytrail.
diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c
index 140a6544983..9cdd1c71b4d 100644
--- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
@@ -187,7 +187,13 @@ gen8_emit_fast_clear_color(struct brw_context *brw,
struct intel_mipmap_tree *mt,
uint32_t *surf)
{
- surf[7] |= mt->fast_clear_color_value;
+ if (brw->gen >= 9) {
+ surf[12] = mt->gen9_fast_clear_color.ui[0];
+ surf[13] = mt->gen9_fast_clear_color.ui[1];
+ surf[14] = mt->gen9_fast_clear_color.ui[2];
+ surf[15] = mt->gen9_fast_clear_color.ui[3];
+ } else
+ surf[7] |= mt->fast_clear_color_value;
}
static void
@@ -208,6 +214,7 @@ gen8_emit_texture_surface_state(struct brw_context *brw,
int surf_index = surf_offset - &brw->wm.base.surf_offset[0];
unsigned tiling_mode, pitch;
const unsigned tr_mode = surface_tiling_resource_mode(mt->tr_mode);
+ const uint32_t surf_type = translate_tex_target(target);
if (mt->format == MESA_FORMAT_S_UINT8) {
tiling_mode = GEN8_SURFACE_TILING_W;
@@ -231,9 +238,14 @@ gen8_emit_texture_surface_state(struct brw_context *brw,
*/
if (brw->gen >= 9 || mt->num_samples == 1)
assert(mt->halign == 16);
+
+ if (brw->gen >= 9) {
+ assert(mt->num_samples > 1 ||
+ brw_losslessly_compressible_format(brw, surf_type));
+ }
+
}
- const uint32_t surf_type = translate_tex_target(target);
uint32_t *surf = allocate_surface_state(brw, surf_offset, surf_index);
surf[0] = SET_FIELD(surf_type, BRW_SURFACE_TYPE) |
diff --git a/src/mesa/drivers/dri/i965/intel_debug.c b/src/mesa/drivers/dri/i965/intel_debug.c
index c00d2e786f3..f53c4ab518a 100644
--- a/src/mesa/drivers/dri/i965/intel_debug.c
+++ b/src/mesa/drivers/dri/i965/intel_debug.c
@@ -75,6 +75,10 @@ static const struct debug_control debug_control[] = {
{ "cs", DEBUG_CS },
{ "hex", DEBUG_HEX },
{ "nocompact", DEBUG_NO_COMPACTION },
+ { "hs", DEBUG_TCS },
+ { "tcs", DEBUG_TCS },
+ { "ds", DEBUG_TES },
+ { "tes", DEBUG_TES },
{ NULL, 0 }
};
@@ -83,8 +87,8 @@ intel_debug_flag_for_shader_stage(gl_shader_stage stage)
{
uint64_t flags[] = {
[MESA_SHADER_VERTEX] = DEBUG_VS,
- [MESA_SHADER_TESS_CTRL] = 0,
- [MESA_SHADER_TESS_EVAL] = 0,
+ [MESA_SHADER_TESS_CTRL] = DEBUG_TCS,
+ [MESA_SHADER_TESS_EVAL] = DEBUG_TES,
[MESA_SHADER_GEOMETRY] = DEBUG_GS,
[MESA_SHADER_FRAGMENT] = DEBUG_WM,
[MESA_SHADER_COMPUTE] = DEBUG_CS,
diff --git a/src/mesa/drivers/dri/i965/intel_debug.h b/src/mesa/drivers/dri/i965/intel_debug.h
index 98bd7e93956..9c6030a6d7d 100644
--- a/src/mesa/drivers/dri/i965/intel_debug.h
+++ b/src/mesa/drivers/dri/i965/intel_debug.h
@@ -69,6 +69,8 @@ extern uint64_t INTEL_DEBUG;
#define DEBUG_CS (1ull << 33)
#define DEBUG_HEX (1ull << 34)
#define DEBUG_NO_COMPACTION (1ull << 35)
+#define DEBUG_TCS (1ull << 36)
+#define DEBUG_TES (1ull << 37)
#ifdef HAVE_ANDROID_PLATFORM
#define LOG_TAG "INTEL-MESA"
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index 386b63c123d..2e2459c125b 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -333,6 +333,7 @@ intelInitExtensions(struct gl_context *ctx)
ctx->Extensions.ARB_texture_compression_bptc = true;
ctx->Extensions.ARB_texture_view = true;
ctx->Extensions.ARB_shader_storage_buffer_object = true;
+ ctx->Extensions.EXT_shader_samples_identical = true;
if (can_do_pipelined_register_writes(brw)) {
ctx->Extensions.ARB_draw_indirect = true;
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index b1a7632d82f..87e01366932 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -35,6 +35,7 @@
#include "brw_blorp.h"
#include "brw_context.h"
+#include "brw_state.h"
#include "main/enums.h"
#include "main/fbobject.h"
@@ -192,6 +193,12 @@ intel_tiling_supports_non_msrt_mcs(struct brw_context *brw, unsigned tiling)
*
* - MCS buffer for non-MSRT is supported only for RT formats 32bpp,
* 64bpp, and 128bpp.
+ *
+ * From the Skylake documentation, it is made clear that X-tiling is no longer
+ * supported:
+ *
+ * - MCS and Lossless compression is supported for TiledY/TileYs/TileYf
+ * non-MSRTs only.
*/
static bool
intel_miptree_supports_non_msrt_fast_clear(struct brw_context *brw,
@@ -201,14 +208,6 @@ intel_miptree_supports_non_msrt_fast_clear(struct brw_context *brw,
if (brw->gen < 7)
return false;
- if (brw->gen >= 9) {
- /* FINISHME: Enable singlesample fast MCS clears on SKL after all GPU
- * FINISHME: hangs are resolved.
- */
- perf_debug("singlesample fast MCS clears disabled on gen9");
- return false;
- }
-
if (mt->disable_aux_buffers)
return false;
@@ -259,7 +258,11 @@ intel_miptree_supports_non_msrt_fast_clear(struct brw_context *brw,
if (!brw->format_supported_as_render_target[mt->format])
return false;
- return true;
+ if (brw->gen >= 9) {
+ const uint32_t brw_format = brw_format_for_mesa_format(mt->format);
+ return brw_losslessly_compressible_format(brw, brw_format);
+ } else
+ return true;
}
@@ -1495,6 +1498,17 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
intel_get_non_msrt_mcs_alignment(mt, &block_width_px, &block_height);
unsigned width_divisor = block_width_px * 4;
unsigned height_divisor = block_height * 8;
+
+ /* The Skylake MCS is twice as tall as the Broadwell MCS.
+ *
+ * In pre-Skylake, each bit in the MCS contained the state of 2 cachelines
+ * in the main surface. In Skylake, it's two bits. The extra bit
+ * doubles the MCS height, not width, because in Skylake the MCS is always
+ * Y-tiled.
+ */
+ if (brw->gen >= 9)
+ height_divisor /= 2;
+
unsigned mcs_width =
ALIGN(mt->logical_width0, width_divisor) / width_divisor;
unsigned mcs_height =
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
index 805cd714d88..64f73ea9ae5 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
@@ -633,15 +633,22 @@ struct intel_mipmap_tree
* The SURFACE_STATE bits associated with the last fast color clear to this
* color mipmap tree, if any.
*
- * This value will only ever contain ones in bits 28-31, so it is safe to
- * OR into dword 7 of SURFACE_STATE.
+ * Prior to GEN9 there is a single bit for RGBA clear values which gives you
+ * the option of 2^4 clear colors. Each bit determines if the color channel
+ * is fully saturated or unsaturated (Cherryview does add a 32b value per
+ * channel, but it is globally applied instead of being part of the render
+ * surface state). Starting with GEN9, the surface state accepts a 32b value
+ * for each color channel.
*
* @see RENDER_SURFACE_STATE.RedClearColor
* @see RENDER_SURFACE_STATE.GreenClearColor
* @see RENDER_SURFACE_STATE.BlueClearColor
* @see RENDER_SURFACE_STATE.AlphaClearColor
*/
- uint32_t fast_clear_color_value;
+ union {
+ uint32_t fast_clear_color_value;
+ union gl_color_union gen9_fast_clear_color;
+ };
/**
* Disable allocation of auxiliary buffers, such as the HiZ buffer and MCS
diff --git a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
index 62d39f70ec4..034d8a507fe 100644
--- a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
@@ -107,7 +107,7 @@ TEST_F(cmod_propagation_test, basic)
fs_reg dest = v->vgrf(glsl_type::float_type);
fs_reg src0 = v->vgrf(glsl_type::float_type);
fs_reg src1 = v->vgrf(glsl_type::float_type);
- fs_reg zero(0.0f);
+ fs_reg zero(brw_imm_f(0.0f));
bld.ADD(dest, src0, src1);
bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE);
@@ -139,7 +139,7 @@ TEST_F(cmod_propagation_test, cmp_nonzero)
fs_reg dest = v->vgrf(glsl_type::float_type);
fs_reg src0 = v->vgrf(glsl_type::float_type);
fs_reg src1 = v->vgrf(glsl_type::float_type);
- fs_reg nonzero(1.0f);
+ fs_reg nonzero(brw_imm_f(1.0f));
bld.ADD(dest, src0, src1);
bld.CMP(bld.null_reg_f(), dest, nonzero, BRW_CONDITIONAL_GE);
@@ -171,7 +171,7 @@ TEST_F(cmod_propagation_test, non_cmod_instruction)
const fs_builder &bld = v->bld;
fs_reg dest = v->vgrf(glsl_type::uint_type);
fs_reg src0 = v->vgrf(glsl_type::uint_type);
- fs_reg zero(0u);
+ fs_reg zero(brw_imm_ud(0u));
bld.FBL(dest, src0);
bld.CMP(bld.null_reg_ud(), dest, zero, BRW_CONDITIONAL_GE);
@@ -205,7 +205,7 @@ TEST_F(cmod_propagation_test, intervening_flag_write)
fs_reg src0 = v->vgrf(glsl_type::float_type);
fs_reg src1 = v->vgrf(glsl_type::float_type);
fs_reg src2 = v->vgrf(glsl_type::float_type);
- fs_reg zero(0.0f);
+ fs_reg zero(brw_imm_f(0.0f));
bld.ADD(dest, src0, src1);
bld.CMP(bld.null_reg_f(), src2, zero, BRW_CONDITIONAL_GE);
bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE);
@@ -244,7 +244,7 @@ TEST_F(cmod_propagation_test, intervening_flag_read)
fs_reg src0 = v->vgrf(glsl_type::float_type);
fs_reg src1 = v->vgrf(glsl_type::float_type);
fs_reg src2 = v->vgrf(glsl_type::float_type);
- fs_reg zero(0.0f);
+ fs_reg zero(brw_imm_f(0.0f));
bld.ADD(dest0, src0, src1);
set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero));
bld.CMP(bld.null_reg_f(), dest0, zero, BRW_CONDITIONAL_GE);
@@ -282,7 +282,7 @@ TEST_F(cmod_propagation_test, intervening_dest_write)
fs_reg src0 = v->vgrf(glsl_type::float_type);
fs_reg src1 = v->vgrf(glsl_type::float_type);
fs_reg src2 = v->vgrf(glsl_type::vec2_type);
- fs_reg zero(0.0f);
+ fs_reg zero(brw_imm_f(0.0f));
bld.ADD(offset(dest, bld, 2), src0, src1);
bld.emit(SHADER_OPCODE_TEX, dest, src2)
->regs_written = 4;
@@ -323,7 +323,7 @@ TEST_F(cmod_propagation_test, intervening_flag_read_same_value)
fs_reg src0 = v->vgrf(glsl_type::float_type);
fs_reg src1 = v->vgrf(glsl_type::float_type);
fs_reg src2 = v->vgrf(glsl_type::float_type);
- fs_reg zero(0.0f);
+ fs_reg zero(brw_imm_f(0.0f));
set_condmod(BRW_CONDITIONAL_GE, bld.ADD(dest0, src0, src1));
set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero));
bld.CMP(bld.null_reg_f(), dest0, zero, BRW_CONDITIONAL_GE);
@@ -360,7 +360,7 @@ TEST_F(cmod_propagation_test, negate)
fs_reg dest = v->vgrf(glsl_type::float_type);
fs_reg src0 = v->vgrf(glsl_type::float_type);
fs_reg src1 = v->vgrf(glsl_type::float_type);
- fs_reg zero(0.0f);
+ fs_reg zero(brw_imm_f(0.0f));
bld.ADD(dest, src0, src1);
dest.negate = true;
bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE);
@@ -425,7 +425,7 @@ TEST_F(cmod_propagation_test, different_types_cmod_with_zero)
fs_reg dest = v->vgrf(glsl_type::int_type);
fs_reg src0 = v->vgrf(glsl_type::int_type);
fs_reg src1 = v->vgrf(glsl_type::int_type);
- fs_reg zero(0.0f);
+ fs_reg zero(brw_imm_f(0.0f));
bld.ADD(dest, src0, src1);
bld.CMP(bld.null_reg_f(), retype(dest, BRW_REGISTER_TYPE_F), zero,
BRW_CONDITIONAL_GE);
@@ -458,8 +458,8 @@ TEST_F(cmod_propagation_test, andnz_one)
const fs_builder &bld = v->bld;
fs_reg dest = v->vgrf(glsl_type::int_type);
fs_reg src0 = v->vgrf(glsl_type::float_type);
- fs_reg zero(0.0f);
- fs_reg one(1);
+ fs_reg zero(brw_imm_f(0.0f));
+ fs_reg one(brw_imm_d(1));
bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
set_condmod(BRW_CONDITIONAL_NZ,
@@ -493,8 +493,8 @@ TEST_F(cmod_propagation_test, andnz_non_one)
const fs_builder &bld = v->bld;
fs_reg dest = v->vgrf(glsl_type::int_type);
fs_reg src0 = v->vgrf(glsl_type::float_type);
- fs_reg zero(0.0f);
- fs_reg nonone(38);
+ fs_reg zero(brw_imm_f(0.0f));
+ fs_reg nonone(brw_imm_d(38));
bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
set_condmod(BRW_CONDITIONAL_NZ,
@@ -528,8 +528,8 @@ TEST_F(cmod_propagation_test, andz_one)
const fs_builder &bld = v->bld;
fs_reg dest = v->vgrf(glsl_type::int_type);
fs_reg src0 = v->vgrf(glsl_type::float_type);
- fs_reg zero(0.0f);
- fs_reg one(1);
+ fs_reg zero(brw_imm_f(0.0f));
+ fs_reg one(brw_imm_d(1));
bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
set_condmod(BRW_CONDITIONAL_Z,
diff --git a/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp
index 9aa2fcc7907..e5e566c60bc 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp
@@ -145,7 +145,7 @@ TEST_F(cmod_propagation_test, basic)
dst_reg dest = dst_reg(v, glsl_type::float_type);
src_reg src0 = src_reg(v, glsl_type::float_type);
src_reg src1 = src_reg(v, glsl_type::float_type);
- src_reg zero(0.0f);
+ src_reg zero(brw_imm_f(0.0f));
dst_reg dest_null = bld.null_reg_f();
dest_null.writemask = WRITEMASK_X;
@@ -181,7 +181,7 @@ TEST_F(cmod_propagation_test, basic_different_dst_writemask)
dst_reg dest = dst_reg(v, glsl_type::float_type);
src_reg src0 = src_reg(v, glsl_type::float_type);
src_reg src1 = src_reg(v, glsl_type::float_type);
- src_reg zero(0.0f);
+ src_reg zero(brw_imm_f(0.0f));
dst_reg dest_null = bld.null_reg_f();
bld.ADD(dest, src0, src1);
@@ -217,8 +217,8 @@ TEST_F(cmod_propagation_test, andz_one)
const vec4_builder bld = vec4_builder(v).at_end();
dst_reg dest = dst_reg(v, glsl_type::int_type);
src_reg src0 = src_reg(v, glsl_type::float_type);
- src_reg zero(0.0f);
- src_reg one(1);
+ src_reg zero(brw_imm_f(0.0f));
+ src_reg one(brw_imm_d(1));
bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
set_condmod(BRW_CONDITIONAL_Z,
@@ -253,7 +253,7 @@ TEST_F(cmod_propagation_test, non_cmod_instruction)
const vec4_builder bld = vec4_builder(v).at_end();
dst_reg dest = dst_reg(v, glsl_type::uint_type);
src_reg src0 = src_reg(v, glsl_type::uint_type);
- src_reg zero(0u);
+ src_reg zero(brw_imm_ud(0u));
bld.FBL(dest, src0);
bld.CMP(bld.null_reg_ud(), src_reg(dest), zero, BRW_CONDITIONAL_GE);
@@ -288,7 +288,7 @@ TEST_F(cmod_propagation_test, intervening_flag_write)
src_reg src0 = src_reg(v, glsl_type::float_type);
src_reg src1 = src_reg(v, glsl_type::float_type);
src_reg src2 = src_reg(v, glsl_type::float_type);
- src_reg zero(0.0f);
+ src_reg zero(brw_imm_f(0.0f));
bld.ADD(dest, src0, src1);
bld.CMP(bld.null_reg_f(), src2, zero, BRW_CONDITIONAL_GE);
bld.CMP(bld.null_reg_f(), src_reg(dest), zero, BRW_CONDITIONAL_GE);
@@ -328,7 +328,7 @@ TEST_F(cmod_propagation_test, intervening_flag_read)
src_reg src0 = src_reg(v, glsl_type::float_type);
src_reg src1 = src_reg(v, glsl_type::float_type);
src_reg src2 = src_reg(v, glsl_type::float_type);
- src_reg zero(0.0f);
+ src_reg zero(brw_imm_f(0.0f));
bld.ADD(dest0, src0, src1);
set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero));
bld.CMP(bld.null_reg_f(), src_reg(dest0), zero, BRW_CONDITIONAL_GE);
@@ -367,7 +367,7 @@ TEST_F(cmod_propagation_test, intervening_dest_write)
src_reg src0 = src_reg(v, glsl_type::float_type);
src_reg src1 = src_reg(v, glsl_type::float_type);
src_reg src2 = src_reg(v, glsl_type::vec2_type);
- src_reg zero(0.0f);
+ src_reg zero(brw_imm_f(0.0f));
bld.ADD(offset(dest, 2), src0, src1);
bld.emit(SHADER_OPCODE_TEX, dest, src2)
->regs_written = 4;
@@ -409,7 +409,7 @@ TEST_F(cmod_propagation_test, intervening_flag_read_same_value)
src_reg src0 = src_reg(v, glsl_type::float_type);
src_reg src1 = src_reg(v, glsl_type::float_type);
src_reg src2 = src_reg(v, glsl_type::float_type);
- src_reg zero(0.0f);
+ src_reg zero(brw_imm_f(0.0f));
dst_reg dest_null = bld.null_reg_f();
dest_null.writemask = WRITEMASK_X;
@@ -449,7 +449,7 @@ TEST_F(cmod_propagation_test, negate)
dst_reg dest = dst_reg(v, glsl_type::float_type);
src_reg src0 = src_reg(v, glsl_type::float_type);
src_reg src1 = src_reg(v, glsl_type::float_type);
- src_reg zero(0.0f);
+ src_reg zero(brw_imm_f(0.0f));
bld.ADD(dest, src0, src1);
src_reg tmp_src = src_reg(dest);
tmp_src.negate = true;
@@ -521,7 +521,7 @@ TEST_F(cmod_propagation_test, different_types_cmod_with_zero)
dst_reg dest = dst_reg(v, glsl_type::int_type);
src_reg src0 = src_reg(v, glsl_type::int_type);
src_reg src1 = src_reg(v, glsl_type::int_type);
- src_reg zero(0.0f);
+ src_reg zero(brw_imm_f(0.0f));
bld.ADD(dest, src0, src1);
bld.CMP(bld.null_reg_f(), retype(src_reg(dest), BRW_REGISTER_TYPE_F), zero,
BRW_CONDITIONAL_GE);
@@ -555,8 +555,8 @@ TEST_F(cmod_propagation_test, andnz_non_one)
const vec4_builder bld = vec4_builder(v).at_end();
dst_reg dest = dst_reg(v, glsl_type::int_type);
src_reg src0 = src_reg(v, glsl_type::float_type);
- src_reg zero(0.0f);
- src_reg nonone(38);
+ src_reg zero(brw_imm_f(0.0f));
+ src_reg nonone(brw_imm_d(38));
bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
set_condmod(BRW_CONDITIONAL_NZ,
@@ -594,7 +594,7 @@ TEST_F(cmod_propagation_test, basic_vec4)
dst_reg dest = dst_reg(v, glsl_type::vec4_type);
src_reg src0 = src_reg(v, glsl_type::vec4_type);
src_reg src1 = src_reg(v, glsl_type::vec4_type);
- src_reg zero(0.0f);
+ src_reg zero(brw_imm_f(0.0f));
bld.MUL(dest, src0, src1);
bld.CMP(bld.null_reg_f(), src_reg(dest), zero, BRW_CONDITIONAL_NZ);
@@ -628,7 +628,7 @@ TEST_F(cmod_propagation_test, basic_vec4_different_dst_writemask)
dest.writemask = WRITEMASK_X;
src_reg src0 = src_reg(v, glsl_type::vec4_type);
src_reg src1 = src_reg(v, glsl_type::vec4_type);
- src_reg zero(0.0f);
+ src_reg zero(brw_imm_f(0.0f));
dst_reg dest_null = bld.null_reg_f();
bld.MUL(dest, src0, src1);
@@ -668,7 +668,7 @@ TEST_F(cmod_propagation_test, mad_one_component_vec4)
src_reg src2 = src_reg(v, glsl_type::vec4_type);
src0.swizzle = src1.swizzle = src2.swizzle = BRW_SWIZZLE_XXXX;
src2.negate = true;
- src_reg zero(0.0f);
+ src_reg zero(brw_imm_f(0.0f));
src_reg tmp(dest);
tmp.swizzle = BRW_SWIZZLE_XXXX;
dst_reg dest_null = bld.null_reg_f();
@@ -710,7 +710,7 @@ TEST_F(cmod_propagation_test, mad_more_one_component_vec4)
src_reg src2 = src_reg(v, glsl_type::vec4_type);
src0.swizzle = src1.swizzle = src2.swizzle = BRW_SWIZZLE_XXXX;
src2.negate = true;
- src_reg zero(0.0f);
+ src_reg zero(brw_imm_f(0.0f));
src_reg tmp(dest);
tmp.swizzle = BRW_SWIZZLE_XXXX;
dst_reg dest_null = bld.null_reg_f();
@@ -751,7 +751,7 @@ TEST_F(cmod_propagation_test, cmp_mov_vec4)
src_reg src0 = src_reg(v, glsl_type::ivec4_type);
src0.swizzle = BRW_SWIZZLE_XXXX;
src0.file = UNIFORM;
- src_reg nonone = retype(src_reg(16), BRW_REGISTER_TYPE_D);
+ src_reg nonone = retype(brw_imm_d(16), BRW_REGISTER_TYPE_D);
src_reg mov_src = src_reg(dest);
mov_src.swizzle = BRW_SWIZZLE_XXXX;
dst_reg dest_null = bld.null_reg_d();
@@ -790,7 +790,7 @@ TEST_F(cmod_propagation_test, mul_cmp_different_channels_vec4)
dst_reg dest = dst_reg(v, glsl_type::vec4_type);
src_reg src0 = src_reg(v, glsl_type::vec4_type);
src_reg src1 = src_reg(v, glsl_type::vec4_type);
- src_reg zero(0.0f);
+ src_reg zero(brw_imm_f(0.0f));
src_reg cmp_src = src_reg(dest);
cmp_src.swizzle = BRW_SWIZZLE4(0,1,3,2);
diff --git a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
index a1f91d9c56a..ede409b6919 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
@@ -162,7 +162,7 @@ TEST_F(copy_propagation_test, test_swizzle_writemask)
SWIZZLE_X,
SWIZZLE_Z))));
- v->emit(v->MOV(writemask(a, WRITEMASK_XYZ), src_reg(1.0f)));
+ v->emit(v->MOV(writemask(a, WRITEMASK_XYZ), brw_imm_f(1.0f)));
vec4_instruction *test_mov =
v->MOV(c, swizzle(src_reg(b), BRW_SWIZZLE4(SWIZZLE_W,
diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
index d84e2e98ec0..90a6bc3618f 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
@@ -135,7 +135,7 @@ TEST_F(register_coalesce_test, test_compute_to_mrf)
m0.writemask = WRITEMASK_X;
m0.type = BRW_REGISTER_TYPE_F;
- vec4_instruction *mul = v->emit(v->MUL(temp, something, src_reg(1.0f)));
+ vec4_instruction *mul = v->emit(v->MUL(temp, something, brw_imm_f(1.0f)));
v->emit(v->MOV(m0, src_reg(temp)));
register_coalesce(v);
@@ -159,7 +159,7 @@ TEST_F(register_coalesce_test, test_multiple_use)
m1.type = BRW_REGISTER_TYPE_F;
src_reg src = src_reg(temp);
- vec4_instruction *mul = v->emit(v->MUL(temp, something, src_reg(1.0f)));
+ vec4_instruction *mul = v->emit(v->MUL(temp, something, brw_imm_f(1.0f)));
src.swizzle = BRW_SWIZZLE_XXXX;
v->emit(v->MOV(m0, src));
src.swizzle = BRW_SWIZZLE_XYZW;
diff --git a/src/mesa/drivers/dri/i965/test_vf_float_conversions.cpp b/src/mesa/drivers/dri/i965/test_vf_float_conversions.cpp
index 6a8bceabf16..7af97d0d097 100644
--- a/src/mesa/drivers/dri/i965/test_vf_float_conversions.cpp
+++ b/src/mesa/drivers/dri/i965/test_vf_float_conversions.cpp
@@ -40,15 +40,10 @@ void vf_float_conversion_test::SetUp() {
int ebits = (vf >> 4) & 0x7;
int mbits = vf & 0xf;
- int e = ebits - 3;
+ float x = 1.0f + mbits / 16.0f;
+ int exp = ebits - 3;
- float value = 1.0f;
-
- value += mbits / 16.0f;
-
- value *= exp2f(e);
-
- vf_to_float[vf] = value;
+ vf_to_float[vf] = ldexpf(x, exp);
}
}
@@ -98,3 +93,18 @@ TEST_F(vf_float_conversion_test, test_special_case_0)
EXPECT_EQ(f2u(brw_vf_to_float(brw_float_to_vf(+0.0f))), f2u(+0.0f));
EXPECT_EQ(f2u(brw_vf_to_float(brw_float_to_vf(-0.0f))), f2u(-0.0f));
}
+
+TEST_F(vf_float_conversion_test, test_nonrepresentable_float_input)
+{
+ EXPECT_EQ(brw_float_to_vf(+32.0f), -1);
+ EXPECT_EQ(brw_float_to_vf(-32.0f), -1);
+
+ EXPECT_EQ(brw_float_to_vf(+16.5f), -1);
+ EXPECT_EQ(brw_float_to_vf(-16.5f), -1);
+
+ EXPECT_EQ(brw_float_to_vf(+8.25f), -1);
+ EXPECT_EQ(brw_float_to_vf(-8.25f), -1);
+
+ EXPECT_EQ(brw_float_to_vf(+4.125f), -1);
+ EXPECT_EQ(brw_float_to_vf(-4.125f), -1);
+}
diff --git a/src/mesa/drivers/dri/r200/r200_tex.h b/src/mesa/drivers/dri/r200/r200_tex.h
index a8c31b741ed..14f5e71fadf 100644
--- a/src/mesa/drivers/dri/r200/r200_tex.h
+++ b/src/mesa/drivers/dri/r200/r200_tex.h
@@ -63,7 +63,9 @@ static const struct tx_table tx_table_be[] =
[ MESA_FORMAT_A8B8G8R8_UNORM ] = { R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
[ MESA_FORMAT_R8G8B8A8_UNORM ] = { R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
[ MESA_FORMAT_B8G8R8A8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+ [ MESA_FORMAT_B8G8R8X8_UNORM ] = { R200_TXFORMAT_ARGB8888, 0 },
[ MESA_FORMAT_A8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+ [ MESA_FORMAT_X8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB8888, 0 },
[ MESA_FORMAT_BGR_UNORM8 ] = { 0xffffffff, 0 },
[ MESA_FORMAT_B5G6R5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
[ MESA_FORMAT_R5G6B5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
@@ -91,7 +93,9 @@ static const struct tx_table tx_table_le[] =
[ MESA_FORMAT_A8B8G8R8_UNORM ] = { R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
[ MESA_FORMAT_R8G8B8A8_UNORM ] = { R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
[ MESA_FORMAT_B8G8R8A8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+ [ MESA_FORMAT_B8G8R8X8_UNORM ] = { R200_TXFORMAT_ARGB8888, 0 },
[ MESA_FORMAT_A8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+ [ MESA_FORMAT_X8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB8888, 0 },
[ MESA_FORMAT_BGR_UNORM8 ] = { R200_TXFORMAT_ARGB8888, 0 },
[ MESA_FORMAT_B5G6R5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
[ MESA_FORMAT_R5G6B5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.h b/src/mesa/drivers/dri/radeon/radeon_tex.h
index f8ec432755a..37c2fa0dc2f 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex.h
+++ b/src/mesa/drivers/dri/radeon/radeon_tex.h
@@ -63,6 +63,8 @@ static const struct tx_table tx_table[] =
[ MESA_FORMAT_R8G8B8A8_UNORM ] = { RADEON_TXFORMAT_RGBA8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
[ MESA_FORMAT_B8G8R8A8_UNORM ] = { RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
[ MESA_FORMAT_A8R8G8B8_UNORM ] = { RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+ [ MESA_FORMAT_B8G8R8X8_UNORM ] = { RADEON_TXFORMAT_ARGB8888, 0 },
+ [ MESA_FORMAT_X8R8G8B8_UNORM ] = { RADEON_TXFORMAT_ARGB8888, 0 },
[ MESA_FORMAT_BGR_UNORM8 ] = { RADEON_TXFORMAT_ARGB8888, 0 },
[ MESA_FORMAT_B5G6R5_UNORM ] = { RADEON_TXFORMAT_RGB565, 0 },
[ MESA_FORMAT_R5G6B5_UNORM ] = { RADEON_TXFORMAT_RGB565, 0 },
diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c
index ddf7f497f1e..2ae22e9e691 100644
--- a/src/mesa/main/blend.c
+++ b/src/mesa/main/blend.c
@@ -67,7 +67,7 @@ legal_src_factor(const struct gl_context *ctx, GLenum factor)
case GL_SRC1_ALPHA:
case GL_ONE_MINUS_SRC1_COLOR:
case GL_ONE_MINUS_SRC1_ALPHA:
- return _mesa_is_desktop_gl(ctx)
+ return ctx->API != API_OPENGLES
&& ctx->Extensions.ARB_blend_func_extended;
default:
return GL_FALSE;
@@ -100,14 +100,14 @@ legal_dst_factor(const struct gl_context *ctx, GLenum factor)
case GL_ONE_MINUS_CONSTANT_ALPHA:
return _mesa_is_desktop_gl(ctx) || ctx->API == API_OPENGLES2;
case GL_SRC_ALPHA_SATURATE:
- return (_mesa_is_desktop_gl(ctx)
+ return (ctx->API != API_OPENGLES
&& ctx->Extensions.ARB_blend_func_extended)
|| _mesa_is_gles3(ctx);
case GL_SRC1_COLOR:
case GL_SRC1_ALPHA:
case GL_ONE_MINUS_SRC1_COLOR:
case GL_ONE_MINUS_SRC1_ALPHA:
- return _mesa_is_desktop_gl(ctx)
+ return ctx->API != API_OPENGLES
&& ctx->Extensions.ARB_blend_func_extended;
default:
return GL_FALSE;
@@ -404,7 +404,7 @@ _mesa_BlendEquation( GLenum mode )
ctx->Color._BlendEquationPerBuffer = GL_FALSE;
if (ctx->Driver.BlendEquationSeparate)
- (*ctx->Driver.BlendEquationSeparate)( ctx, mode, mode );
+ ctx->Driver.BlendEquationSeparate(ctx, mode, mode);
}
@@ -582,7 +582,7 @@ _mesa_BlendColor( GLclampf red, GLclampf green, GLclampf blue, GLclampf alpha )
ctx->Color.BlendColor[3] = CLAMP(tmp[3], 0.0F, 1.0F);
if (ctx->Driver.BlendColor)
- (*ctx->Driver.BlendColor)(ctx, ctx->Color.BlendColor);
+ ctx->Driver.BlendColor(ctx, ctx->Color.BlendColor);
}
diff --git a/src/mesa/main/buffers.c b/src/mesa/main/buffers.c
index 93588a2ee18..83e238ae825 100644
--- a/src/mesa/main/buffers.c
+++ b/src/mesa/main/buffers.c
@@ -731,7 +731,7 @@ _mesa_read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
/* Call the device driver function only if fb is the bound read buffer */
if (fb == ctx->ReadBuffer) {
if (ctx->Driver.ReadBuffer)
- (*ctx->Driver.ReadBuffer)(ctx, buffer);
+ ctx->Driver.ReadBuffer(ctx, buffer);
}
}
diff --git a/src/mesa/main/enable.c b/src/mesa/main/enable.c
index 42f67990784..a8a667e3c12 100644
--- a/src/mesa/main/enable.c
+++ b/src/mesa/main/enable.c
@@ -369,10 +369,7 @@ _mesa_set_enable(struct gl_context *ctx, GLenum cap, GLboolean state)
break;
case GL_DEBUG_OUTPUT:
case GL_DEBUG_OUTPUT_SYNCHRONOUS_ARB:
- if (!_mesa_is_desktop_gl(ctx))
- goto invalid_enum_error;
- else
- _mesa_set_debug_state_int(ctx, cap, state);
+ _mesa_set_debug_state_int(ctx, cap, state);
break;
case GL_DITHER:
if (ctx->Color.DitherFlag == state)
@@ -1225,10 +1222,7 @@ _mesa_IsEnabled( GLenum cap )
return ctx->Polygon.CullFlag;
case GL_DEBUG_OUTPUT:
case GL_DEBUG_OUTPUT_SYNCHRONOUS_ARB:
- if (!_mesa_is_desktop_gl(ctx))
- goto invalid_enum_error;
- else
- return (GLboolean) _mesa_get_debug_state_int(ctx, cap);
+ return (GLboolean) _mesa_get_debug_state_int(ctx, cap);
case GL_DEPTH_TEST:
return ctx->Depth.Test;
case GL_DITHER:
diff --git a/src/mesa/main/errors.c b/src/mesa/main/errors.c
index f720de316e4..366b119aba3 100644
--- a/src/mesa/main/errors.c
+++ b/src/mesa/main/errors.c
@@ -978,9 +978,13 @@ _mesa_DebugMessageInsert(GLenum source, GLenum type, GLuint id,
GLenum severity, GLint length,
const GLchar *buf)
{
- const char *callerstr = "glDebugMessageInsert";
-
GET_CURRENT_CONTEXT(ctx);
+ const char *callerstr;
+
+ if (_mesa_is_desktop_gl(ctx))
+ callerstr = "glDebugMessageInsert";
+ else
+ callerstr = "glDebugMessageInsertKHR";
if (!validate_params(ctx, INSERT, callerstr, source, type, severity))
return; /* GL_INVALID_ENUM */
@@ -1004,15 +1008,21 @@ _mesa_GetDebugMessageLog(GLuint count, GLsizei logSize, GLenum *sources,
{
GET_CURRENT_CONTEXT(ctx);
struct gl_debug_state *debug;
+ const char *callerstr;
GLuint ret;
+ if (_mesa_is_desktop_gl(ctx))
+ callerstr = "glGetDebugMessageLog";
+ else
+ callerstr = "glGetDebugMessageLogKHR";
+
if (!messageLog)
logSize = 0;
if (logSize < 0) {
_mesa_error(ctx, GL_INVALID_VALUE,
- "glGetDebugMessageLog(logSize=%d : logSize must not be"
- " negative)", logSize);
+ "%s(logSize=%d : logSize must not be negative)",
+ callerstr, logSize);
return 0;
}
@@ -1066,9 +1076,14 @@ _mesa_DebugMessageControl(GLenum gl_source, GLenum gl_type,
enum mesa_debug_source source = gl_enum_to_debug_source(gl_source);
enum mesa_debug_type type = gl_enum_to_debug_type(gl_type);
enum mesa_debug_severity severity = gl_enum_to_debug_severity(gl_severity);
- const char *callerstr = "glDebugMessageControl";
+ const char *callerstr;
struct gl_debug_state *debug;
+ if (_mesa_is_desktop_gl(ctx))
+ callerstr = "glDebugMessageControl";
+ else
+ callerstr = "glDebugMessageControlKHR";
+
if (count < 0) {
_mesa_error(ctx, GL_INVALID_VALUE,
"%s(count=%d : count must not be negative)", callerstr,
@@ -1124,10 +1139,15 @@ _mesa_PushDebugGroup(GLenum source, GLuint id, GLsizei length,
const GLchar *message)
{
GET_CURRENT_CONTEXT(ctx);
- const char *callerstr = "glPushDebugGroup";
+ const char *callerstr;
struct gl_debug_state *debug;
struct gl_debug_message *emptySlot;
+ if (_mesa_is_desktop_gl(ctx))
+ callerstr = "glPushDebugGroup";
+ else
+ callerstr = "glPushDebugGroupKHR";
+
switch(source) {
case GL_DEBUG_SOURCE_APPLICATION:
case GL_DEBUG_SOURCE_THIRD_PARTY:
@@ -1176,10 +1196,15 @@ void GLAPIENTRY
_mesa_PopDebugGroup(void)
{
GET_CURRENT_CONTEXT(ctx);
- const char *callerstr = "glPopDebugGroup";
+ const char *callerstr;
struct gl_debug_state *debug;
struct gl_debug_message *gdmessage, msg;
+ if (_mesa_is_desktop_gl(ctx))
+ callerstr = "glPopDebugGroup";
+ else
+ callerstr = "glPopDebugGroupKHR";
+
debug = _mesa_lock_debug_state(ctx);
if (!debug)
return;
diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index e94d2b74749..fa50cb68cca 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -40,7 +40,6 @@
struct gl_extensions _mesa_extension_override_enables;
struct gl_extensions _mesa_extension_override_disables;
static char *extra_extensions = NULL;
-static char *cant_disable_extensions = NULL;
/**
@@ -68,29 +67,30 @@ const struct mesa_extension _mesa_extension_table[] = {
#undef EXT
};
+static bool disabled_extensions[ARRAY_SIZE(_mesa_extension_table)];
/**
* Given an extension name, lookup up the corresponding member of struct
- * gl_extensions and return that member's offset (in bytes). If the name is
- * not found in the \c _mesa_extension_table, return 0.
+ * gl_extensions and return that member's index. If the name is
+ * not found in the \c _mesa_extension_table, return -1.
*
* \param name Name of extension.
- * \return Offset of member in struct gl_extensions.
+ * \return Index of member in struct gl_extensions.
*/
-static size_t
-name_to_offset(const char* name)
+static int
+name_to_index(const char* name)
{
unsigned i;
if (name == 0)
- return 0;
+ return -1;
for (i = 0; i < ARRAY_SIZE(_mesa_extension_table); ++i) {
if (strcmp(name, _mesa_extension_table[i].name) == 0)
- return _mesa_extension_table[i].offset;
+ return i;
}
- return 0;
+ return -1;
}
/**
@@ -206,11 +206,11 @@ _mesa_enable_sw_extensions(struct gl_context *ctx)
* \return offset of extensions withint `ext' or 0 if extension is not known
*/
static size_t
-set_extension(struct gl_extensions *ext, const char *name, GLboolean state)
+set_extension(struct gl_extensions *ext, int i, GLboolean state)
{
size_t offset;
- offset = name_to_offset(name);
+ offset = i < 0 ? 0 : _mesa_extension_table[i].offset;
if (offset != 0 && (offset != o(dummy_true) || state != GL_FALSE)) {
((GLboolean *) ext)[offset] = state;
}
@@ -240,12 +240,6 @@ get_extension_override( struct gl_context *ctx )
{
override_extensions_in_context(ctx);
- if (cant_disable_extensions != NULL) {
- _mesa_problem(ctx,
- "Trying to disable permanently enabled extensions: %s",
- cant_disable_extensions);
- }
-
if (extra_extensions == NULL) {
return calloc(1, sizeof(char));
} else {
@@ -257,7 +251,7 @@ get_extension_override( struct gl_context *ctx )
/**
- * \brief Free extra_extensions and cant_disable_extensions strings
+ * \brief Free extra_extensions string
*
* These strings are allocated early during the first context creation by
* _mesa_one_time_init_extension_overrides.
@@ -266,7 +260,6 @@ static void
free_unknown_extensions_strings(void)
{
free(extra_extensions);
- free(cant_disable_extensions);
}
@@ -295,21 +288,20 @@ _mesa_one_time_init_extension_overrides(void)
/* extra_exts: List of unrecognized extensions. */
extra_extensions = calloc(ALIGN(strlen(env_const) + 2, 4), sizeof(char));
- cant_disable_extensions = calloc(ALIGN(strlen(env_const) + 2, 4), sizeof(char));
/* Copy env_const because strtok() is destructive. */
env = strdup(env_const);
- if (env == NULL || extra_extensions == NULL ||
- cant_disable_extensions == NULL) {
- free(env);
- free(extra_extensions);
- free(cant_disable_extensions);
- return;
+ if (env == NULL ||
+ extra_extensions == NULL) {
+ free(env);
+ free(extra_extensions);
+ return;
}
for (ext = strtok(env, " "); ext != NULL; ext = strtok(NULL, " ")) {
int enable;
+ int i;
bool recognized;
switch (ext[0]) {
case '+':
@@ -325,7 +317,8 @@ _mesa_one_time_init_extension_overrides(void)
break;
}
- offset = set_extension(&_mesa_extension_override_enables, ext, enable);
+ i = name_to_index(ext);
+ offset = set_extension(&_mesa_extension_override_enables, i, enable);
if (offset != 0 && (offset != o(dummy_true) || enable != GL_FALSE)) {
((GLboolean *) &_mesa_extension_override_disables)[offset] = !enable;
recognized = true;
@@ -333,14 +326,12 @@ _mesa_one_time_init_extension_overrides(void)
recognized = false;
}
- if (!recognized) {
- if (enable) {
- strcat(extra_extensions, ext);
- strcat(extra_extensions, " ");
- } else if (offset == o(dummy_true)) {
- strcat(cant_disable_extensions, ext);
- strcat(cant_disable_extensions, " ");
- }
+ if (i >= 0)
+ disabled_extensions[i] = !enable;
+
+ if (!recognized && enable) {
+ strcat(extra_extensions, ext);
+ strcat(extra_extensions, " ");
}
}
@@ -354,13 +345,6 @@ _mesa_one_time_init_extension_overrides(void)
} else if (extra_extensions[len - 1] == ' ') {
extra_extensions[len - 1] = '\0';
}
- len = strlen(cant_disable_extensions);
- if (len == 0) {
- free(cant_disable_extensions);
- cant_disable_extensions = NULL;
- } else if (cant_disable_extensions[len - 1] == ' ') {
- cant_disable_extensions[len - 1] = '\0';
- }
}
@@ -401,7 +385,8 @@ _mesa_extension_supported(const struct gl_context *ctx, extension_index i)
const bool *base = (bool *) &ctx->Extensions;
const struct mesa_extension *ext = _mesa_extension_table + i;
- return (ctx->Version >= ext->version[ctx->API]) && base[ext->offset];
+ return !disabled_extensions[i] &&
+ (ctx->Version >= ext->version[ctx->API]) && base[ext->offset];
}
/**
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index d12fd9f1c8d..051d69a3613 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -1,8 +1,31 @@
+/* The extension table is alphabetically sorted by the extension name string column. */
+
#define GLL 0
#define GLC 0
#define ES1 0
#define ES2 0
#define x ~0
+
+EXT(3DFX_texture_compression_FXT1 , TDFX_texture_compression_FXT1 , GLL, GLC, x , x , 1999)
+
+EXT(AMD_conservative_depth , ARB_conservative_depth , GLL, GLC, x , x , 2009)
+EXT(AMD_draw_buffers_blend , ARB_draw_buffers_blend , GLL, GLC, x , x , 2009)
+EXT(AMD_performance_monitor , AMD_performance_monitor , GLL, GLC, x , x , 2007)
+EXT(AMD_pinned_memory , AMD_pinned_memory , GLL, GLC, x , x , 2013)
+EXT(AMD_seamless_cubemap_per_texture , AMD_seamless_cubemap_per_texture , GLL, GLC, x , x , 2009)
+EXT(AMD_shader_stencil_export , ARB_shader_stencil_export , GLL, GLC, x , x , 2009)
+EXT(AMD_shader_trinary_minmax , dummy_true , GLL, GLC, x , x , 2012)
+EXT(AMD_vertex_shader_layer , AMD_vertex_shader_layer , x , GLC, x , x , 2012)
+EXT(AMD_vertex_shader_viewport_index , AMD_vertex_shader_viewport_index , x , GLC, x , x , 2012)
+
+EXT(ANGLE_texture_compression_dxt3 , ANGLE_texture_compression_dxt , GLL, GLC, ES1, ES2, 2011)
+EXT(ANGLE_texture_compression_dxt5 , ANGLE_texture_compression_dxt , GLL, GLC, ES1, ES2, 2011)
+
+EXT(APPLE_object_purgeable , APPLE_object_purgeable , GLL, GLC, x , x , 2006)
+EXT(APPLE_packed_pixels , dummy_true , GLL, x , x , x , 2002)
+EXT(APPLE_texture_max_level , dummy_true , x , x , ES1, ES2, 2009)
+EXT(APPLE_vertex_array_object , dummy_true , GLL, x , x , x , 2002)
+
EXT(ARB_ES2_compatibility , ARB_ES2_compatibility , GLL, GLC, x , x , 2009)
EXT(ARB_ES3_compatibility , ARB_ES3_compatibility , GLL, GLC, x , x , 2012)
EXT(ARB_arrays_of_arrays , ARB_arrays_of_arrays , GLL, GLC, x , x , 2012)
@@ -16,9 +39,9 @@ EXT(ARB_color_buffer_float , ARB_color_buffer_float
EXT(ARB_compressed_texture_pixel_storage , dummy_true , GLL, GLC, x , x , 2011)
EXT(ARB_compute_shader , ARB_compute_shader , GLL, GLC, x , x , 2012)
EXT(ARB_conditional_render_inverted , ARB_conditional_render_inverted , GLL, GLC, x , x , 2014)
+EXT(ARB_conservative_depth , ARB_conservative_depth , GLL, GLC, x , x , 2011)
EXT(ARB_copy_buffer , dummy_true , GLL, GLC, x , x , 2008)
EXT(ARB_copy_image , ARB_copy_image , GLL, GLC, x , x , 2012)
-EXT(ARB_conservative_depth , ARB_conservative_depth , GLL, GLC, x , x , 2011)
EXT(ARB_debug_output , dummy_true , GLL, GLC, x , x , 2009)
EXT(ARB_depth_buffer_float , ARB_depth_buffer_float , GLL, GLC, x , x , 2008)
EXT(ARB_depth_clamp , ARB_depth_clamp , GLL, GLC, x , x , 2003)
@@ -56,8 +79,8 @@ EXT(ARB_multi_bind , dummy_true
EXT(ARB_multi_draw_indirect , ARB_draw_indirect , x , GLC, x , x , 2012)
EXT(ARB_multisample , dummy_true , GLL, x , x , x , 1994)
EXT(ARB_multitexture , dummy_true , GLL, x , x , x , 1998)
-EXT(ARB_occlusion_query2 , ARB_occlusion_query2 , GLL, GLC, x , x , 2003)
EXT(ARB_occlusion_query , ARB_occlusion_query , GLL, x , x , x , 2001)
+EXT(ARB_occlusion_query2 , ARB_occlusion_query2 , GLL, GLC, x , x , 2003)
EXT(ARB_pipeline_statistics_query , ARB_pipeline_statistics_query , GLL, GLC, x , x , 2014)
EXT(ARB_pixel_buffer_object , EXT_pixel_buffer_object , GLL, GLC, x , x , 2004)
EXT(ARB_point_parameters , EXT_point_parameters , GLL, x , x , x , 1997)
@@ -83,13 +106,13 @@ EXT(ARB_shader_subroutine , ARB_shader_subroutine
EXT(ARB_shader_texture_image_samples , ARB_shader_texture_image_samples , GLL, GLC, x , x , 2014)
EXT(ARB_shader_texture_lod , ARB_shader_texture_lod , GLL, GLC, x , x , 2009)
EXT(ARB_shading_language_100 , dummy_true , GLL, x , x , x , 2003)
-EXT(ARB_shading_language_packing , ARB_shading_language_packing , GLL, GLC, x , x , 2011)
EXT(ARB_shading_language_420pack , ARB_shading_language_420pack , GLL, GLC, x , x , 2011)
+EXT(ARB_shading_language_packing , ARB_shading_language_packing , GLL, GLC, x , x , 2011)
EXT(ARB_shadow , ARB_shadow , GLL, x , x , x , 2001)
EXT(ARB_stencil_texturing , ARB_stencil_texturing , GLL, GLC, x , x , 2012)
EXT(ARB_sync , ARB_sync , GLL, GLC, x , x , 2003)
-EXT(ARB_texture_barrier , NV_texture_barrier , GLL, GLC, x , x , 2014)
EXT(ARB_tessellation_shader , ARB_tessellation_shader , x , GLC, x , x , 2009)
+EXT(ARB_texture_barrier , NV_texture_barrier , GLL, GLC, x , x , 2014)
EXT(ARB_texture_border_clamp , ARB_texture_border_clamp , GLL, x , x , x , 2000)
EXT(ARB_texture_buffer_object , ARB_texture_buffer_object , x , GLC, x , x , 2008)
EXT(ARB_texture_buffer_object_rgb32 , ARB_texture_buffer_object_rgb32 , x , GLC, x , x , 2009)
@@ -105,20 +128,20 @@ EXT(ARB_texture_env_crossbar , ARB_texture_env_crossbar
EXT(ARB_texture_env_dot3 , ARB_texture_env_dot3 , GLL, x , x , x , 2001)
EXT(ARB_texture_float , ARB_texture_float , GLL, GLC, x , x , 2004)
EXT(ARB_texture_gather , ARB_texture_gather , GLL, GLC, x , x , 2009)
-EXT(ARB_texture_mirrored_repeat , dummy_true , GLL, x , x , x , 2001)
EXT(ARB_texture_mirror_clamp_to_edge , ARB_texture_mirror_clamp_to_edge , GLL, GLC, x , x , 2013)
+EXT(ARB_texture_mirrored_repeat , dummy_true , GLL, x , x , x , 2001)
EXT(ARB_texture_multisample , ARB_texture_multisample , GLL, GLC, x , x , 2009)
EXT(ARB_texture_non_power_of_two , ARB_texture_non_power_of_two , GLL, GLC, x , x , 2003)
EXT(ARB_texture_query_levels , ARB_texture_query_levels , GLL, GLC, x , x , 2012)
EXT(ARB_texture_query_lod , ARB_texture_query_lod , GLL, GLC, x , x , 2009)
EXT(ARB_texture_rectangle , NV_texture_rectangle , GLL, GLC, x , x , 2004)
-EXT(ARB_texture_rgb10_a2ui , ARB_texture_rgb10_a2ui , GLL, GLC, x , x , 2009)
EXT(ARB_texture_rg , ARB_texture_rg , GLL, GLC, x , x , 2008)
+EXT(ARB_texture_rgb10_a2ui , ARB_texture_rgb10_a2ui , GLL, GLC, x , x , 2009)
EXT(ARB_texture_stencil8 , ARB_texture_stencil8 , GLL, GLC, x , x , 2013)
EXT(ARB_texture_storage , dummy_true , GLL, GLC, x , x , 2011)
EXT(ARB_texture_storage_multisample , ARB_texture_multisample , GLL, GLC, x , x , 2012)
-EXT(ARB_texture_view , ARB_texture_view , GLL, GLC, x , x , 2012)
EXT(ARB_texture_swizzle , EXT_texture_swizzle , GLL, GLC, x , x , 2008)
+EXT(ARB_texture_view , ARB_texture_view , GLL, GLC, x , x , 2012)
EXT(ARB_timer_query , ARB_timer_query , GLL, GLC, x , x , 2010)
EXT(ARB_transform_feedback2 , ARB_transform_feedback2 , GLL, GLC, x , x , 2010)
EXT(ARB_transform_feedback3 , ARB_transform_feedback3 , GLL, GLC, x , x , 2010)
@@ -127,28 +150,39 @@ EXT(ARB_transpose_matrix , dummy_true
EXT(ARB_uniform_buffer_object , ARB_uniform_buffer_object , GLL, GLC, x , x , 2009)
EXT(ARB_vertex_array_bgra , EXT_vertex_array_bgra , GLL, GLC, x , x , 2008)
EXT(ARB_vertex_array_object , dummy_true , GLL, GLC, x , x , 2006)
+EXT(ARB_vertex_attrib_64bit , ARB_vertex_attrib_64bit , x , GLC, x , x , 2010)
EXT(ARB_vertex_attrib_binding , dummy_true , GLL, GLC, x , x , 2012)
EXT(ARB_vertex_buffer_object , dummy_true , GLL, x , x , x , 2003)
EXT(ARB_vertex_program , ARB_vertex_program , GLL, x , x , x , 2002)
EXT(ARB_vertex_shader , ARB_vertex_shader , GLL, GLC, x , x , 2002)
-EXT(ARB_vertex_attrib_64bit , ARB_vertex_attrib_64bit , x , GLC, x , x , 2010)
EXT(ARB_vertex_type_10f_11f_11f_rev , ARB_vertex_type_10f_11f_11f_rev , GLL, GLC, x , x , 2013)
EXT(ARB_vertex_type_2_10_10_10_rev , ARB_vertex_type_2_10_10_10_rev , GLL, GLC, x , x , 2009)
EXT(ARB_viewport_array , ARB_viewport_array , x , GLC, x , x , 2010)
EXT(ARB_window_pos , dummy_true , GLL, x , x , x , 2001)
+EXT(ATI_blend_equation_separate , EXT_blend_equation_separate , GLL, GLC, x , x , 2003)
+EXT(ATI_draw_buffers , dummy_true , GLL, x , x , x , 2002)
+EXT(ATI_fragment_shader , ATI_fragment_shader , GLL, x , x , x , 2001)
+EXT(ATI_separate_stencil , ATI_separate_stencil , GLL, x , x , x , 2006)
+EXT(ATI_texture_compression_3dc , ATI_texture_compression_3dc , GLL, x , x , x , 2004)
+EXT(ATI_texture_env_combine3 , ATI_texture_env_combine3 , GLL, x , x , x , 2002)
+EXT(ATI_texture_float , ARB_texture_float , GLL, GLC, x , x , 2002)
+EXT(ATI_texture_mirror_once , ATI_texture_mirror_once , GLL, GLC, x , x , 2006)
+
EXT(EXT_abgr , dummy_true , GLL, GLC, x , x , 1995)
EXT(EXT_bgra , dummy_true , GLL, x , x , x , 1995)
EXT(EXT_blend_color , EXT_blend_color , GLL, x , x , x , 1995)
EXT(EXT_blend_equation_separate , EXT_blend_equation_separate , GLL, GLC, x , x , 2003)
+EXT(EXT_blend_func_extended , ARB_blend_func_extended , x , x , x , ES2, 2015)
EXT(EXT_blend_func_separate , EXT_blend_func_separate , GLL, x , x , x , 1999)
-EXT(EXT_buffer_storage , ARB_buffer_storage , x , x , x , 31, 2015)
-EXT(EXT_discard_framebuffer , dummy_true , x , x , ES1, ES2, 2009)
EXT(EXT_blend_minmax , EXT_blend_minmax , GLL, x , ES1, ES2, 1995)
EXT(EXT_blend_subtract , dummy_true , GLL, x , x , x , 1995)
+EXT(EXT_buffer_storage , ARB_buffer_storage , x , x , x , 31, 2015)
+EXT(EXT_color_buffer_float , dummy_true , x , x , ES1, 30, 2013)
EXT(EXT_compiled_vertex_array , dummy_true , GLL, x , x , x , 1996)
EXT(EXT_copy_texture , dummy_true , GLL, x , x , x , 1995)
EXT(EXT_depth_bounds_test , EXT_depth_bounds_test , GLL, GLC, x , x , 2002)
+EXT(EXT_discard_framebuffer , dummy_true , x , x , ES1, ES2, 2009)
EXT(EXT_draw_buffers , dummy_true , x , x , x , ES2, 2012)
EXT(EXT_draw_buffers2 , EXT_draw_buffers2 , GLL, GLC, x , x , 2006)
EXT(EXT_draw_elements_base_vertex , ARB_draw_elements_base_vertex , x , x , x , ES2, 2014)
@@ -172,20 +206,21 @@ EXT(EXT_point_parameters , EXT_point_parameters
EXT(EXT_polygon_offset , dummy_true , GLL, x , x , x , 1995)
EXT(EXT_polygon_offset_clamp , EXT_polygon_offset_clamp , GLL, GLC, x , x , 2014)
EXT(EXT_provoking_vertex , EXT_provoking_vertex , GLL, GLC, x , x , 2009)
+EXT(EXT_read_format_bgra , dummy_true , x , x , ES1, ES2, 2009)
EXT(EXT_rescale_normal , dummy_true , GLL, x , x , x , 1997)
EXT(EXT_secondary_color , dummy_true , GLL, x , x , x , 1999)
EXT(EXT_separate_shader_objects , dummy_true , x , x , x , ES2, 2013)
EXT(EXT_separate_specular_color , dummy_true , GLL, x , x , x , 1997)
-EXT(EXT_shader_integer_mix , EXT_shader_integer_mix , GLL, GLC, ES1, 30, 2013)
+EXT(EXT_shader_integer_mix , EXT_shader_integer_mix , GLL, GLC, x , 30, 2013)
+EXT(EXT_shader_samples_identical , EXT_shader_samples_identical , GLL, GLC, x , 31, 2015)
EXT(EXT_shadow_funcs , ARB_shadow , GLL, x , x , x , 2002)
EXT(EXT_stencil_two_side , EXT_stencil_two_side , GLL, x , x , x , 2001)
EXT(EXT_stencil_wrap , dummy_true , GLL, x , x , x , 2002)
EXT(EXT_subtexture , dummy_true , GLL, x , x , x , 1995)
+EXT(EXT_texture , dummy_true , GLL, x , x , x , 1996)
EXT(EXT_texture3D , dummy_true , GLL, x , x , x , 1996)
EXT(EXT_texture_array , EXT_texture_array , GLL, GLC, x , x , 2006)
EXT(EXT_texture_compression_dxt1 , ANGLE_texture_compression_dxt , GLL, GLC, ES1, ES2, 2004)
-EXT(ANGLE_texture_compression_dxt3 , ANGLE_texture_compression_dxt , GLL, GLC, ES1, ES2, 2011)
-EXT(ANGLE_texture_compression_dxt5 , ANGLE_texture_compression_dxt , GLL, GLC, ES1, ES2, 2011)
EXT(EXT_texture_compression_latc , EXT_texture_compression_latc , GLL, x , x , x , 2006)
EXT(EXT_texture_compression_rgtc , ARB_texture_compression_rgtc , GLL, GLC, x , x , 2004)
EXT(EXT_texture_compression_s3tc , EXT_texture_compression_s3tc , GLL, GLC, x , x , 2000)
@@ -196,28 +231,66 @@ EXT(EXT_texture_env_combine , dummy_true
EXT(EXT_texture_env_dot3 , EXT_texture_env_dot3 , GLL, x , x , x , 2000)
EXT(EXT_texture_filter_anisotropic , EXT_texture_filter_anisotropic , GLL, GLC, ES1, ES2, 1999)
EXT(EXT_texture_format_BGRA8888 , dummy_true , x , x , ES1, ES2, 2005)
-EXT(EXT_texture_rg , ARB_texture_rg , x , x , x , ES2, 2011)
-EXT(EXT_read_format_bgra , dummy_true , x , x , ES1, ES2, 2009)
EXT(EXT_texture_integer , EXT_texture_integer , GLL, GLC, x , x , 2006)
EXT(EXT_texture_lod_bias , dummy_true , GLL, x , ES1, x , 1999)
EXT(EXT_texture_mirror_clamp , EXT_texture_mirror_clamp , GLL, GLC, x , x , 2004)
EXT(EXT_texture_object , dummy_true , GLL, x , x , x , 1995)
-EXT(EXT_texture , dummy_true , GLL, x , x , x , 1996)
EXT(EXT_texture_rectangle , NV_texture_rectangle , GLL, x , x , x , 2004)
-EXT(EXT_texture_shared_exponent , EXT_texture_shared_exponent , GLL, GLC, x , x , 2004)
-EXT(EXT_texture_snorm , EXT_texture_snorm , GLL, GLC, x , x , 2009)
+EXT(EXT_texture_rg , ARB_texture_rg , x , x , x , ES2, 2011)
EXT(EXT_texture_sRGB , EXT_texture_sRGB , GLL, GLC, x , x , 2004)
EXT(EXT_texture_sRGB_decode , EXT_texture_sRGB_decode , GLL, GLC, x , x , 2006)
+EXT(EXT_texture_shared_exponent , EXT_texture_shared_exponent , GLL, GLC, x , x , 2004)
+EXT(EXT_texture_snorm , EXT_texture_snorm , GLL, GLC, x , x , 2009)
EXT(EXT_texture_swizzle , EXT_texture_swizzle , GLL, GLC, x , x , 2008)
EXT(EXT_texture_type_2_10_10_10_REV , dummy_true , x , x , x , ES2, 2008)
EXT(EXT_timer_query , EXT_timer_query , GLL, GLC, x , x , 2006)
EXT(EXT_transform_feedback , EXT_transform_feedback , GLL, GLC, x , x , 2011)
EXT(EXT_unpack_subimage , dummy_true , x , x , x , ES2, 2011)
-EXT(EXT_vertex_array_bgra , EXT_vertex_array_bgra , GLL, GLC, x , x , 2008)
EXT(EXT_vertex_array , dummy_true , GLL, x , x , x , 1995)
-EXT(EXT_color_buffer_float , dummy_true , x , x , ES1, 30, 2013)
+EXT(EXT_vertex_array_bgra , EXT_vertex_array_bgra , GLL, GLC, x , x , 2008)
+EXT(IBM_multimode_draw_arrays , dummy_true , GLL, GLC, x , x , 1998)
+EXT(IBM_rasterpos_clip , dummy_true , GLL, x , x , x , 1996)
+EXT(IBM_texture_mirrored_repeat , dummy_true , GLL, x , x , x , 1998)
+EXT(INGR_blend_func_separate , EXT_blend_func_separate , GLL, x , x , x , 1999)
+
+EXT(INTEL_performance_query , INTEL_performance_query , GLL, GLC, x , ES2, 2013)
+
+EXT(KHR_context_flush_control , dummy_true , GLL, GLC, x , ES2, 2014)
+EXT(KHR_debug , dummy_true , GLL, GLC, ES1, ES2, 2012)
+EXT(KHR_texture_compression_astc_hdr , KHR_texture_compression_astc_hdr , GLL, GLC, x , ES2, 2012)
+EXT(KHR_texture_compression_astc_ldr , KHR_texture_compression_astc_ldr , GLL, GLC, x , ES2, 2012)
+
+EXT(MESA_pack_invert , MESA_pack_invert , GLL, GLC, x , x , 2002)
+EXT(MESA_texture_signed_rgba , EXT_texture_snorm , GLL, GLC, x , x , 2009)
+EXT(MESA_window_pos , dummy_true , GLL, x , x , x , 2000)
+EXT(MESA_ycbcr_texture , MESA_ycbcr_texture , GLL, GLC, x , x , 2002)
+
+EXT(NV_blend_square , dummy_true , GLL, x , x , x , 1999)
+EXT(NV_conditional_render , NV_conditional_render , GLL, GLC, x , x , 2008)
+EXT(NV_depth_clamp , ARB_depth_clamp , GLL, GLC, x , x , 2001)
+EXT(NV_draw_buffers , dummy_true , x , x , x , ES2, 2011)
+EXT(NV_fbo_color_attachments , dummy_true , x , x , x , ES2, 2010)
+EXT(NV_fog_distance , NV_fog_distance , GLL, x , x , x , 2001)
+EXT(NV_fragment_program_option , NV_fragment_program_option , GLL, x , x , x , 2005)
+EXT(NV_light_max_exponent , dummy_true , GLL, x , x , x , 1999)
+EXT(NV_packed_depth_stencil , dummy_true , GLL, GLC, x , x , 2000)
+EXT(NV_point_sprite , NV_point_sprite , GLL, GLC, x , x , 2001)
+EXT(NV_primitive_restart , NV_primitive_restart , GLL, x , x , x , 2002)
+EXT(NV_read_buffer , dummy_true , x , x , x , ES2, 2011)
+EXT(NV_read_depth , dummy_true , x , x , x , ES2, 2011)
+EXT(NV_read_depth_stencil , dummy_true , x , x , x , ES2, 2011)
+EXT(NV_read_stencil , dummy_true , x , x , x , ES2, 2011)
+EXT(NV_texgen_reflection , dummy_true , GLL, x , x , x , 1999)
+EXT(NV_texture_barrier , NV_texture_barrier , GLL, GLC, x , x , 2009)
+EXT(NV_texture_env_combine4 , NV_texture_env_combine4 , GLL, x , x , x , 1999)
+EXT(NV_texture_rectangle , NV_texture_rectangle , GLL, x , x , x , 2000)
+EXT(NV_vdpau_interop , NV_vdpau_interop , GLL, GLC, x , x , 2010)
+
+EXT(OES_EGL_image , OES_EGL_image , GLL, GLC, ES1, ES2, 2006) /* FIXME: Mesa expects GL_OES_EGL_image to be available in OpenGL contexts. */
+EXT(OES_EGL_image_external , OES_EGL_image_external , x , x , ES1, ES2, 2010)
+EXT(OES_EGL_sync , dummy_true , x , x , ES1, ES2, 2010)
EXT(OES_blend_equation_separate , EXT_blend_equation_separate , x , x , ES1, x , 2009)
EXT(OES_blend_func_separate , EXT_blend_func_separate , x , x , ES1, x , 2009)
EXT(OES_blend_subtract , dummy_true , x , x , ES1, x , 2009)
@@ -230,9 +303,6 @@ EXT(OES_depth_texture , ARB_depth_texture
EXT(OES_depth_texture_cube_map , OES_depth_texture_cube_map , x , x , x , ES2, 2012)
EXT(OES_draw_elements_base_vertex , ARB_draw_elements_base_vertex , x , x , x , ES2, 2014)
EXT(OES_draw_texture , OES_draw_texture , x , x , ES1, x , 2004)
-EXT(OES_EGL_sync , dummy_true , x , x , ES1, ES2, 2010)
-EXT(OES_EGL_image , OES_EGL_image , GLL, GLC, ES1, ES2, 2006) /* FIXME: Mesa expects GL_OES_EGL_image to be available in OpenGL contexts. */
-EXT(OES_EGL_image_external , OES_EGL_image_external , x , x , ES1, ES2, 2010)
EXT(OES_element_index_uint , dummy_true , x , x , ES1, ES2, 2005)
EXT(OES_fbo_render_mipmap , dummy_true , x , x , ES1, ES2, 2005)
EXT(OES_fixed_point , dummy_true , x , x , ES1, x , 2002)
@@ -260,73 +330,17 @@ EXT(OES_texture_float_linear , OES_texture_float_linear
EXT(OES_texture_half_float , OES_texture_half_float , x , x , x , ES2, 2005)
EXT(OES_texture_half_float_linear , OES_texture_half_float_linear , x , x , x , ES2, 2005)
EXT(OES_texture_mirrored_repeat , dummy_true , x , x , ES1, x , 2005)
-EXT(OES_texture_storage_multisample_2d_array, ARB_texture_multisample , x , x , ES1, 31, 2014)
EXT(OES_texture_npot , ARB_texture_non_power_of_two , x , x , ES1, ES2, 2005)
+EXT(OES_texture_storage_multisample_2d_array, ARB_texture_multisample , x , x , ES1, 31, 2014)
EXT(OES_vertex_array_object , dummy_true , x , x , ES1, ES2, 2010)
-
-EXT(KHR_debug , dummy_true , GLL, GLC, x , x , 2012)
-EXT(KHR_context_flush_control , dummy_true , GLL, GLC, x , ES2, 2014)
-EXT(KHR_texture_compression_astc_hdr , KHR_texture_compression_astc_hdr , GLL, GLC, x , ES2, 2012)
-EXT(KHR_texture_compression_astc_ldr , KHR_texture_compression_astc_ldr , GLL, GLC, x , ES2, 2012)
-
-
-EXT(3DFX_texture_compression_FXT1 , TDFX_texture_compression_FXT1 , GLL, GLC, x , x , 1999)
-EXT(AMD_conservative_depth , ARB_conservative_depth , GLL, GLC, x , x , 2009)
-EXT(AMD_draw_buffers_blend , ARB_draw_buffers_blend , GLL, GLC, x , x , 2009)
-EXT(AMD_performance_monitor , AMD_performance_monitor , GLL, GLC, x , x , 2007)
-EXT(AMD_pinned_memory , AMD_pinned_memory , GLL, GLC, x , x , 2013)
-EXT(AMD_seamless_cubemap_per_texture , AMD_seamless_cubemap_per_texture , GLL, GLC, x , x , 2009)
-EXT(AMD_shader_stencil_export , ARB_shader_stencil_export , GLL, GLC, x , x , 2009)
-EXT(AMD_shader_trinary_minmax , dummy_true , GLL, GLC, x , x , 2012)
-EXT(AMD_vertex_shader_layer , AMD_vertex_shader_layer , x , GLC, x , x , 2012)
-EXT(AMD_vertex_shader_viewport_index , AMD_vertex_shader_viewport_index , x , GLC, x , x , 2012)
-EXT(APPLE_object_purgeable , APPLE_object_purgeable , GLL, GLC, x , x , 2006)
-EXT(APPLE_packed_pixels , dummy_true , GLL, x , x , x , 2002)
-EXT(APPLE_texture_max_level , dummy_true , x , x , ES1, ES2, 2009)
-EXT(APPLE_vertex_array_object , dummy_true , GLL, x , x , x , 2002)
-EXT(ATI_blend_equation_separate , EXT_blend_equation_separate , GLL, GLC, x , x , 2003)
-EXT(ATI_draw_buffers , dummy_true , GLL, x , x , x , 2002)
-EXT(ATI_fragment_shader , ATI_fragment_shader , GLL, x , x , x , 2001)
-EXT(ATI_separate_stencil , ATI_separate_stencil , GLL, x , x , x , 2006)
-EXT(ATI_texture_compression_3dc , ATI_texture_compression_3dc , GLL, x , x , x , 2004)
-EXT(ATI_texture_env_combine3 , ATI_texture_env_combine3 , GLL, x , x , x , 2002)
-EXT(ATI_texture_float , ARB_texture_float , GLL, GLC, x , x , 2002)
-EXT(ATI_texture_mirror_once , ATI_texture_mirror_once , GLL, GLC, x , x , 2006)
-EXT(IBM_multimode_draw_arrays , dummy_true , GLL, GLC, x , x , 1998)
-EXT(IBM_rasterpos_clip , dummy_true , GLL, x , x , x , 1996)
-EXT(IBM_texture_mirrored_repeat , dummy_true , GLL, x , x , x , 1998)
-EXT(INGR_blend_func_separate , EXT_blend_func_separate , GLL, x , x , x , 1999)
-EXT(INTEL_performance_query , INTEL_performance_query , GLL, GLC, x , ES2, 2013)
-EXT(MESA_pack_invert , MESA_pack_invert , GLL, GLC, x , x , 2002)
-EXT(MESA_texture_signed_rgba , EXT_texture_snorm , GLL, GLC, x , x , 2009)
-EXT(MESA_window_pos , dummy_true , GLL, x , x , x , 2000)
-EXT(MESA_ycbcr_texture , MESA_ycbcr_texture , GLL, GLC, x , x , 2002)
-EXT(NV_blend_square , dummy_true , GLL, x , x , x , 1999)
-EXT(NV_conditional_render , NV_conditional_render , GLL, GLC, x , x , 2008)
-EXT(NV_depth_clamp , ARB_depth_clamp , GLL, GLC, x , x , 2001)
-EXT(NV_draw_buffers , dummy_true , x , x , x , ES2, 2011)
-EXT(NV_fbo_color_attachments , dummy_true , x , x , x , ES2, 2010)
-EXT(NV_fog_distance , NV_fog_distance , GLL, x , x , x , 2001)
-EXT(NV_fragment_program_option , NV_fragment_program_option , GLL, x , x , x , 2005)
-EXT(NV_light_max_exponent , dummy_true , GLL, x , x , x , 1999)
-EXT(NV_packed_depth_stencil , dummy_true , GLL, GLC, x , x , 2000)
-EXT(NV_point_sprite , NV_point_sprite , GLL, GLC, x , x , 2001)
-EXT(NV_primitive_restart , NV_primitive_restart , GLL, x , x , x , 2002)
-EXT(NV_read_buffer , dummy_true , x , x , x , ES2, 2011)
-EXT(NV_read_depth , dummy_true , x , x , x , ES2, 2011)
-EXT(NV_read_depth_stencil , dummy_true , x , x , x , ES2, 2011)
-EXT(NV_read_stencil , dummy_true , x , x , x , ES2, 2011)
-EXT(NV_texgen_reflection , dummy_true , GLL, x , x , x , 1999)
-EXT(NV_texture_barrier , NV_texture_barrier , GLL, GLC, x , x , 2009)
-EXT(NV_texture_env_combine4 , NV_texture_env_combine4 , GLL, x , x , x , 1999)
-EXT(NV_texture_rectangle , NV_texture_rectangle , GLL, x , x , x , 2000)
-EXT(NV_vdpau_interop , NV_vdpau_interop , GLL, GLC, x , x , 2010)
EXT(S3_s3tc , ANGLE_texture_compression_dxt , GLL, GLC, x , x , 1999)
+
EXT(SGIS_generate_mipmap , dummy_true , GLL, x , x , x , 1997)
EXT(SGIS_texture_border_clamp , ARB_texture_border_clamp , GLL, x , x , x , 1997)
EXT(SGIS_texture_edge_clamp , dummy_true , GLL, x , x , x , 1997)
EXT(SGIS_texture_lod , dummy_true , GLL, x , x , x , 1997)
+
EXT(SUN_multi_draw_arrays , dummy_true , GLL, x , x , x , 1999)
#undef GLL
#undef GLC
diff --git a/src/mesa/main/fog.c b/src/mesa/main/fog.c
index 45f343d61c8..1ad939cfde6 100644
--- a/src/mesa/main/fog.c
+++ b/src/mesa/main/fog.c
@@ -190,7 +190,7 @@ _mesa_Fogfv( GLenum pname, const GLfloat *params )
}
if (ctx->Driver.Fogfv) {
- (*ctx->Driver.Fogfv)( ctx, pname, params );
+ ctx->Driver.Fogfv( ctx, pname, params );
}
return;
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index fbc7b8f8602..9b22b91ac1b 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -330,6 +330,9 @@ descriptor=[
# GL_KHR_context_flush_control
[ "CONTEXT_RELEASE_BEHAVIOR", "CONTEXT_ENUM(Const.ContextReleaseBehavior), NO_EXTRA" ],
+
+# blend_func_extended
+ [ "MAX_DUAL_SOURCE_DRAW_BUFFERS", "CONTEXT_INT(Const.MaxDualSourceDrawBuffers), extra_ARB_blend_func_extended" ],
]},
# GLES3 is not a typo.
@@ -801,7 +804,6 @@ descriptor=[
# GL_ARB_robustness
[ "RESET_NOTIFICATION_STRATEGY_ARB", "CONTEXT_ENUM(Const.ResetStrategy), NO_EXTRA" ],
- [ "MAX_DUAL_SOURCE_DRAW_BUFFERS", "CONTEXT_INT(Const.MaxDualSourceDrawBuffers), extra_ARB_blend_func_extended" ],
# GL_ARB_uniform_buffer_object
[ "MAX_GEOMETRY_UNIFORM_BLOCKS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxUniformBlocks), extra_ARB_uniform_buffer_object_and_geometry_shader" ],
diff --git a/src/mesa/main/getstring.c b/src/mesa/main/getstring.c
index 9873fdbf1a4..87c5a3a194f 100644
--- a/src/mesa/main/getstring.c
+++ b/src/mesa/main/getstring.c
@@ -121,7 +121,7 @@ _mesa_GetString( GLenum name )
assert(ctx->Driver.GetString);
{
/* Give the driver the chance to handle this query */
- const GLubyte *str = (*ctx->Driver.GetString)(ctx, name);
+ const GLubyte *str = ctx->Driver.GetString(ctx, name);
if (str)
return str;
}
@@ -203,12 +203,18 @@ _mesa_GetPointerv( GLenum pname, GLvoid **params )
{
GET_CURRENT_CONTEXT(ctx);
const GLuint clientUnit = ctx->Array.ActiveTexture;
+ const char *callerstr;
+
+ if (_mesa_is_desktop_gl(ctx))
+ callerstr = "glGetPointerv";
+ else
+ callerstr = "glGetPointervKHR";
if (!params)
return;
if (MESA_VERBOSE & VERBOSE_API)
- _mesa_debug(ctx, "glGetPointerv %s\n", _mesa_enum_to_string(pname));
+ _mesa_debug(ctx, "%s %s\n", callerstr, _mesa_enum_to_string(pname));
switch (pname) {
case GL_VERTEX_ARRAY_POINTER:
@@ -268,10 +274,7 @@ _mesa_GetPointerv( GLenum pname, GLvoid **params )
break;
case GL_DEBUG_CALLBACK_FUNCTION_ARB:
case GL_DEBUG_CALLBACK_USER_PARAM_ARB:
- if (!_mesa_is_desktop_gl(ctx))
- goto invalid_pname;
- else
- *params = _mesa_get_debug_state_ptr(ctx, pname);
+ *params = _mesa_get_debug_state_ptr(ctx, pname);
break;
default:
goto invalid_pname;
@@ -280,7 +283,7 @@ _mesa_GetPointerv( GLenum pname, GLvoid **params )
return;
invalid_pname:
- _mesa_error( ctx, GL_INVALID_ENUM, "glGetPointerv" );
+ _mesa_error( ctx, GL_INVALID_ENUM, "%s", callerstr);
return;
}
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 95cbba4ed57..4a849fb090d 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2193,6 +2193,7 @@ struct gl_ati_fragment_shader_state
struct gl_subroutine_function
{
char *name;
+ int index;
int num_compat_types;
const struct glsl_type **types;
};
@@ -3766,6 +3767,7 @@ struct gl_extensions
GLboolean EXT_polygon_offset_clamp;
GLboolean EXT_provoking_vertex;
GLboolean EXT_shader_integer_mix;
+ GLboolean EXT_shader_samples_identical;
GLboolean EXT_stencil_two_side;
GLboolean EXT_texture_array;
GLboolean EXT_texture_compression_latc;
diff --git a/src/mesa/main/objectlabel.c b/src/mesa/main/objectlabel.c
index 1019f893ba8..41f370ce485 100644
--- a/src/mesa/main/objectlabel.c
+++ b/src/mesa/main/objectlabel.c
@@ -243,13 +243,19 @@ _mesa_ObjectLabel(GLenum identifier, GLuint name, GLsizei length,
const GLchar *label)
{
GET_CURRENT_CONTEXT(ctx);
+ const char *callerstr;
char **labelPtr;
- labelPtr = get_label_pointer(ctx, identifier, name, "glObjectLabel");
+ if (_mesa_is_desktop_gl(ctx))
+ callerstr = "glObjectLabel";
+ else
+ callerstr = "glObjectLabelKHR";
+
+ labelPtr = get_label_pointer(ctx, identifier, name, callerstr);
if (!labelPtr)
return;
- set_label(ctx, labelPtr, label, length, "glObjectLabel");
+ set_label(ctx, labelPtr, label, length, callerstr);
}
void GLAPIENTRY
@@ -257,15 +263,21 @@ _mesa_GetObjectLabel(GLenum identifier, GLuint name, GLsizei bufSize,
GLsizei *length, GLchar *label)
{
GET_CURRENT_CONTEXT(ctx);
+ const char *callerstr;
char **labelPtr;
+ if (_mesa_is_desktop_gl(ctx))
+ callerstr = "glGetObjectLabel";
+ else
+ callerstr = "glGetObjectLabelKHR";
+
if (bufSize < 0) {
- _mesa_error(ctx, GL_INVALID_VALUE, "glGetObjectLabel(bufSize = %d)",
+ _mesa_error(ctx, GL_INVALID_VALUE, "%s(bufSize = %d)", callerstr,
bufSize);
return;
}
- labelPtr = get_label_pointer(ctx, identifier, name, "glGetObjectLabel");
+ labelPtr = get_label_pointer(ctx, identifier, name, callerstr);
if (!labelPtr)
return;
@@ -276,17 +288,24 @@ void GLAPIENTRY
_mesa_ObjectPtrLabel(const void *ptr, GLsizei length, const GLchar *label)
{
GET_CURRENT_CONTEXT(ctx);
- char **labelPtr;
struct gl_sync_object *const syncObj = (struct gl_sync_object *) ptr;
+ const char *callerstr;
+ char **labelPtr;
+
+ if (_mesa_is_desktop_gl(ctx))
+ callerstr = "glObjectPtrLabel";
+ else
+ callerstr = "glObjectPtrLabelKHR";
if (!_mesa_validate_sync(ctx, syncObj)) {
- _mesa_error(ctx, GL_INVALID_VALUE, "glObjectPtrLabel (not a valid sync object)");
+ _mesa_error(ctx, GL_INVALID_VALUE, "%s (not a valid sync object)",
+ callerstr);
return;
}
labelPtr = &syncObj->Label;
- set_label(ctx, labelPtr, label, length, "glObjectPtrLabel");
+ set_label(ctx, labelPtr, label, length, callerstr);
}
void GLAPIENTRY
@@ -294,17 +313,24 @@ _mesa_GetObjectPtrLabel(const void *ptr, GLsizei bufSize, GLsizei *length,
GLchar *label)
{
GET_CURRENT_CONTEXT(ctx);
- char **labelPtr;
struct gl_sync_object *const syncObj = (struct gl_sync_object *) ptr;
+ const char *callerstr;
+ char **labelPtr;
+
+ if (_mesa_is_desktop_gl(ctx))
+ callerstr = "glGetObjectPtrLabel";
+ else
+ callerstr = "glGetObjectPtrLabelKHR";
if (bufSize < 0) {
- _mesa_error(ctx, GL_INVALID_VALUE, "glGetObjectPtrLabel(bufSize = %d)",
+ _mesa_error(ctx, GL_INVALID_VALUE, "%s(bufSize = %d)", callerstr,
bufSize);
return;
}
if (!_mesa_validate_sync(ctx, syncObj)) {
- _mesa_error(ctx, GL_INVALID_VALUE, "glGetObjectPtrLabel (not a valid sync object)");
+ _mesa_error(ctx, GL_INVALID_VALUE, "%s (not a valid sync object)",
+ callerstr);
return;
}
diff --git a/src/mesa/main/points.c b/src/mesa/main/points.c
index 863e3c1af32..c2f2b6399cb 100644
--- a/src/mesa/main/points.c
+++ b/src/mesa/main/points.c
@@ -209,7 +209,7 @@ _mesa_PointParameterfv( GLenum pname, const GLfloat *params)
}
if (ctx->Driver.PointParameterfv)
- (*ctx->Driver.PointParameterfv)(ctx, pname, params);
+ ctx->Driver.PointParameterfv(ctx, pname, params);
}
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index 58ba04153e6..79a91b5b6bd 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -661,6 +661,13 @@ _mesa_program_resource_index(struct gl_shader_program *shProg,
switch (res->Type) {
case GL_ATOMIC_COUNTER_BUFFER:
return RESOURCE_ATC(res) - shProg->AtomicBuffers;
+ case GL_VERTEX_SUBROUTINE:
+ case GL_GEOMETRY_SUBROUTINE:
+ case GL_FRAGMENT_SUBROUTINE:
+ case GL_COMPUTE_SUBROUTINE:
+ case GL_TESS_CONTROL_SUBROUTINE:
+ case GL_TESS_EVALUATION_SUBROUTINE:
+ return RESOURCE_SUB(res)->index;
case GL_UNIFORM_BLOCK:
case GL_SHADER_STORAGE_BLOCK:
case GL_TRANSFORM_FEEDBACK_VARYING:
@@ -1413,9 +1420,19 @@ _mesa_validate_pipeline_io(struct gl_pipeline_object *pipeline)
for (idx = prev + 1; idx < ARRAY_SIZE(pipeline->CurrentProgram); idx++) {
if (shProg[idx]) {
- if (!validate_io(shProg[prev]->_LinkedShaders[prev],
- shProg[idx]->_LinkedShaders[idx]))
- return false;
+ /* Since we now only validate precision, we can skip this step for
+ * desktop GLSL shaders, there precision qualifier is ignored.
+ *
+ * From OpenGL 4.50 Shading Language spec, section 4.7:
+ * "For the purposes of determining if an output from one shader
+ * stage matches an input of the next stage, the precision
+ * qualifier need not match."
+ */
+ if (shProg[prev]->IsES || shProg[idx]->IsES) {
+ if (!validate_io(shProg[prev]->_LinkedShaders[prev],
+ shProg[idx]->_LinkedShaders[idx]))
+ return false;
+ }
prev = idx;
}
}
diff --git a/src/mesa/main/tests/Makefile.am b/src/mesa/main/tests/Makefile.am
index bd7ab7365c0..d6977e20e85 100644
--- a/src/mesa/main/tests/Makefile.am
+++ b/src/mesa/main/tests/Makefile.am
@@ -27,6 +27,7 @@ AM_CPPFLAGS += -DHAVE_SHARED_GLAPI
main_test_SOURCES += \
dispatch_sanity.cpp \
mesa_formats.cpp \
+ mesa_extensions.cpp \
program_state_string.cpp
main_test_LDADD += \
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index abe0f432572..97f81f932f6 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -2421,6 +2421,11 @@ const struct function gles3_functions_possible[] = {
{ "glProgramUniform4uiEXT", 30, -1 },
{ "glProgramUniform4uivEXT", 30, -1 },
+ /* GL_EXT_blend_func_extended */
+ { "glBindFragDataLocationIndexedEXT", 30, -1 },
+ { "glGetFragDataIndexEXT", 30, -1 },
+ { "glBindFragDataLocationEXT", 30, -1 },
+
{ NULL, 0, -1 }
};
@@ -2509,5 +2514,8 @@ const struct function gles31_functions_possible[] = {
/* GL_EXT_buffer_storage */
{ "glBufferStorageEXT", 31, -1 },
+ /* GL_EXT_blend_func_extended */
+ { "glGetProgramResourceLocationIndexEXT", 31, -1 },
+
{ NULL, 0, -1 },
};
diff --git a/src/mesa/main/tests/mesa_extensions.cpp b/src/mesa/main/tests/mesa_extensions.cpp
new file mode 100644
index 00000000000..0c7addd4282
--- /dev/null
+++ b/src/mesa/main/tests/mesa_extensions.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \name mesa_extensions.cpp
+ *
+ * Verify that the extensions table is sorted.
+ */
+
+#include <gtest/gtest.h>
+#include "util/macros.h"
+
+/**
+ * Debug/test: verify the extension table is alphabetically sorted.
+ */
+TEST(MesaExtensionsTest, AlphabeticallySorted)
+{
+ const char *ext_names[] = {
+ #define EXT(name_str, ...) #name_str,
+ #include "main/extensions_table.h"
+ #undef EXT
+ };
+
+ for (unsigned i = 0; i < ARRAY_SIZE(ext_names) - 1; ++i) {
+ const char *current_str = ext_names[i];
+ const char *next_str = ext_names[i+1];
+
+ /* We expect the extension table to be alphabetically sorted */
+ ASSERT_LT(strcmp(current_str, next_str), 0);
+ }
+}
diff --git a/src/mesa/main/texenv.c b/src/mesa/main/texenv.c
index 091922161c5..93c680650bb 100644
--- a/src/mesa/main/texenv.c
+++ b/src/mesa/main/texenv.c
@@ -495,7 +495,7 @@ _mesa_TexEnvfv( GLenum target, GLenum pname, const GLfloat *param )
/* Tell device driver about the new texture environment */
if (ctx->Driver.TexEnv) {
- (*ctx->Driver.TexEnv)( ctx, target, pname, param );
+ ctx->Driver.TexEnv(ctx, target, pname, param);
}
}
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index d9453e3a281..ac7599f9fd4 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -1333,21 +1333,6 @@ _mesa_target_can_be_compressed(const struct gl_context *ctx, GLenum target,
break;
case GL_PROXY_TEXTURE_CUBE_MAP_ARRAY:
case GL_TEXTURE_CUBE_MAP_ARRAY:
- /* From section 3.8.6, page 146 of OpenGL ES 3.0 spec:
- *
- * "The ETC2/EAC texture compression algorithm supports only
- * two-dimensional images. If internalformat is an ETC2/EAC format,
- * glCompressedTexImage3D will generate an INVALID_OPERATION error if
- * target is not TEXTURE_2D_ARRAY."
- *
- * This should also be applicable for glTexStorage3D(). Other available
- * targets for these functions are: TEXTURE_3D and TEXTURE_CUBE_MAP_ARRAY.
- */
- if (layout == MESA_FORMAT_LAYOUT_ETC2 && _mesa_is_gles3(ctx))
- return write_error(error, GL_INVALID_OPERATION);
-
- target_can_be_compresed = ctx->Extensions.ARB_texture_cube_map_array;
-
/* From the KHR_texture_compression_astc_hdr spec:
*
* Add a second new column "3D Tex." which is empty for all non-ASTC
@@ -1368,16 +1353,24 @@ _mesa_target_can_be_compressed(const struct gl_context *ctx, GLenum target,
* 8.19 is *not* checked'
*
* The instances of <internalformat> above should say <target>.
+ *
+ * ETC2/EAC formats are the only alternative in GLES and thus such errors
+ * have already been handled by normal ETC2/EAC behavior.
*/
- /* Throw an INVALID_OPERATION error if the target is
- * TEXTURE_CUBE_MAP_ARRAY and the format is not ASTC.
+ /* From section 3.8.6, page 146 of OpenGL ES 3.0 spec:
+ *
+ * "The ETC2/EAC texture compression algorithm supports only
+ * two-dimensional images. If internalformat is an ETC2/EAC format,
+ * glCompressedTexImage3D will generate an INVALID_OPERATION error if
+ * target is not TEXTURE_2D_ARRAY."
+ *
+ * This should also be applicable for glTexStorage3D(). Other available
+ * targets for these functions are: TEXTURE_3D and TEXTURE_CUBE_MAP_ARRAY.
*/
- if (target_can_be_compresed &&
- ctx->Extensions.KHR_texture_compression_astc_ldr &&
- layout != MESA_FORMAT_LAYOUT_ASTC)
- return write_error(error, GL_INVALID_OPERATION);
-
+ if (layout == MESA_FORMAT_LAYOUT_ETC2 && _mesa_is_gles3(ctx))
+ return write_error(error, GL_INVALID_OPERATION);
+ target_can_be_compresed = ctx->Extensions.ARB_texture_cube_map_array;
break;
case GL_TEXTURE_3D:
switch (layout) {
@@ -1401,12 +1394,6 @@ _mesa_target_can_be_compressed(const struct gl_context *ctx, GLenum target,
return write_error(error, GL_INVALID_OPERATION);
break;
default:
- /* Throw an INVALID_OPERATION error if the target is TEXTURE_3D and
- * the format is not ASTC.
- * See comment in switch case GL_TEXTURE_CUBE_MAP_ARRAY for more info.
- */
- if (ctx->Extensions.KHR_texture_compression_astc_ldr)
- return write_error(error, GL_INVALID_OPERATION);
break;
}
default:
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 1099d79d834..c5d8c483429 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -1920,6 +1920,8 @@ ir_to_mesa_visitor::visit(ir_texture *ir)
case ir_query_levels:
assert(!"Unexpected ir_query_levels opcode");
break;
+ case ir_samples_identical:
+ unreachable("Unexpected ir_samples_identical opcode");
case ir_texture_samples:
unreachable("Unexpected ir_texture_samples opcode");
}
diff --git a/src/mesa/state_tracker/st_cb_perfmon.c b/src/mesa/state_tracker/st_cb_perfmon.c
index 1bb5be397ae..8fdf0e8497f 100644
--- a/src/mesa/state_tracker/st_cb_perfmon.c
+++ b/src/mesa/state_tracker/st_cb_perfmon.c
@@ -36,69 +36,24 @@
#include "pipe/p_screen.h"
#include "util/u_memory.h"
-/**
- * Return a PIPE_QUERY_x type >= PIPE_QUERY_DRIVER_SPECIFIC, or -1 if
- * the driver-specific query doesn't exist.
- */
-static int
-find_query_type(struct pipe_screen *screen, const char *name)
-{
- int num_queries;
- int type = -1;
- int i;
-
- num_queries = screen->get_driver_query_info(screen, 0, NULL);
- if (!num_queries)
- return type;
-
- for (i = 0; i < num_queries; i++) {
- struct pipe_driver_query_info info;
-
- if (!screen->get_driver_query_info(screen, i, &info))
- continue;
-
- if (!strncmp(info.name, name, strlen(name))) {
- type = info.query_type;
- break;
- }
- }
- return type;
-}
-
-/**
- * Return TRUE if the underlying driver expose GPU counters.
- */
-static bool
-has_gpu_counters(struct pipe_screen *screen)
-{
- int num_groups, gid;
-
- num_groups = screen->get_driver_query_group_info(screen, 0, NULL);
- for (gid = 0; gid < num_groups; gid++) {
- struct pipe_driver_query_group_info group_info;
-
- if (!screen->get_driver_query_group_info(screen, gid, &group_info))
- continue;
-
- if (group_info.type == PIPE_DRIVER_QUERY_GROUP_TYPE_GPU)
- return true;
- }
- return false;
-}
-
static bool
init_perf_monitor(struct gl_context *ctx, struct gl_perf_monitor_object *m)
{
+ struct st_context *st = st_context(ctx);
struct st_perf_monitor_object *stm = st_perf_monitor_object(m);
- struct pipe_screen *screen = st_context(ctx)->pipe->screen;
- struct pipe_context *pipe = st_context(ctx)->pipe;
+ struct pipe_context *pipe = st->pipe;
+ unsigned *batch = NULL;
+ unsigned num_active_counters = 0;
+ unsigned max_batch_counters = 0;
+ unsigned num_batch_counters = 0;
int gid, cid;
- st_flush_bitmap_cache(st_context(ctx));
+ st_flush_bitmap_cache(st);
- /* Create a query for each active counter. */
+ /* Determine the number of active counters. */
for (gid = 0; gid < ctx->PerfMonitor.NumGroups; gid++) {
const struct gl_perf_monitor_group *g = &ctx->PerfMonitor.Groups[gid];
+ const struct st_perf_monitor_group *stg = &st->perfmon[gid];
if (m->ActiveGroups[gid] > g->MaxActiveCounters) {
/* Maximum number of counters reached. Cannot start the session. */
@@ -109,53 +64,96 @@ init_perf_monitor(struct gl_context *ctx, struct gl_perf_monitor_object *m)
return false;
}
- for (cid = 0; cid < g->NumCounters; cid++) {
- const struct gl_perf_monitor_counter *c = &g->Counters[cid];
- struct st_perf_counter_object *cntr;
- int query_type;
+ num_active_counters += m->ActiveGroups[gid];
+ if (stg->has_batch)
+ max_batch_counters += m->ActiveGroups[gid];
+ }
- if (!BITSET_TEST(m->ActiveCounters[gid], cid))
- continue;
+ if (!num_active_counters)
+ return true;
+
+ stm->active_counters = CALLOC(num_active_counters,
+ sizeof(*stm->active_counters));
+ if (!stm->active_counters)
+ return false;
- query_type = find_query_type(screen, c->Name);
- assert(query_type != -1);
+ if (max_batch_counters) {
+ batch = CALLOC(max_batch_counters, sizeof(*batch));
+ if (!batch)
+ return false;
+ }
+
+ /* Create a query for each active counter. */
+ for (gid = 0; gid < ctx->PerfMonitor.NumGroups; gid++) {
+ const struct gl_perf_monitor_group *g = &ctx->PerfMonitor.Groups[gid];
+ const struct st_perf_monitor_group *stg = &st->perfmon[gid];
+ BITSET_WORD tmp;
- cntr = CALLOC_STRUCT(st_perf_counter_object);
- if (!cntr)
- return false;
+ BITSET_FOREACH_SET(cid, tmp, m->ActiveCounters[gid], g->NumCounters) {
+ const struct st_perf_monitor_counter *stc = &stg->counters[cid];
+ struct st_perf_counter_object *cntr =
+ &stm->active_counters[stm->num_active_counters];
- cntr->query = pipe->create_query(pipe, query_type, 0);
cntr->id = cid;
cntr->group_id = gid;
-
- list_addtail(&cntr->list, &stm->active_counters);
+ if (stc->flags & PIPE_DRIVER_QUERY_FLAG_BATCH) {
+ cntr->batch_index = num_batch_counters;
+ batch[num_batch_counters++] = stc->query_type;
+ } else {
+ cntr->query = pipe->create_query(pipe, stc->query_type, 0);
+ if (!cntr->query)
+ goto fail;
+ }
+ ++stm->num_active_counters;
}
}
+
+ /* Create the batch query. */
+ if (num_batch_counters) {
+ stm->batch_query = pipe->create_batch_query(pipe, num_batch_counters,
+ batch);
+ stm->batch_result = CALLOC(num_batch_counters, sizeof(stm->batch_result->batch[0]));
+ if (!stm->batch_query || !stm->batch_result)
+ goto fail;
+ }
+
+ FREE(batch);
return true;
+
+fail:
+ FREE(batch);
+ return false;
}
static void
reset_perf_monitor(struct st_perf_monitor_object *stm,
struct pipe_context *pipe)
{
- struct st_perf_counter_object *cntr, *tmp;
+ unsigned i;
- LIST_FOR_EACH_ENTRY_SAFE(cntr, tmp, &stm->active_counters, list) {
- if (cntr->query)
- pipe->destroy_query(pipe, cntr->query);
- list_del(&cntr->list);
- free(cntr);
+ for (i = 0; i < stm->num_active_counters; ++i) {
+ struct pipe_query *query = stm->active_counters[i].query;
+ if (query)
+ pipe->destroy_query(pipe, query);
}
+ FREE(stm->active_counters);
+ stm->active_counters = NULL;
+ stm->num_active_counters = 0;
+
+ if (stm->batch_query) {
+ pipe->destroy_query(pipe, stm->batch_query);
+ stm->batch_query = NULL;
+ }
+ FREE(stm->batch_result);
+ stm->batch_result = NULL;
}
static struct gl_perf_monitor_object *
st_NewPerfMonitor(struct gl_context *ctx)
{
struct st_perf_monitor_object *stq = ST_CALLOC_STRUCT(st_perf_monitor_object);
- if (stq) {
- list_inithead(&stq->active_counters);
+ if (stq)
return &stq->base;
- }
return NULL;
}
@@ -174,9 +172,9 @@ st_BeginPerfMonitor(struct gl_context *ctx, struct gl_perf_monitor_object *m)
{
struct st_perf_monitor_object *stm = st_perf_monitor_object(m);
struct pipe_context *pipe = st_context(ctx)->pipe;
- struct st_perf_counter_object *cntr;
+ unsigned i;
- if (LIST_IS_EMPTY(&stm->active_counters)) {
+ if (!stm->num_active_counters) {
/* Create a query for each active counter before starting
* a new monitoring session. */
if (!init_perf_monitor(ctx, m))
@@ -184,10 +182,15 @@ st_BeginPerfMonitor(struct gl_context *ctx, struct gl_perf_monitor_object *m)
}
/* Start the query for each active counter. */
- LIST_FOR_EACH_ENTRY(cntr, &stm->active_counters, list) {
- if (!pipe->begin_query(pipe, cntr->query))
+ for (i = 0; i < stm->num_active_counters; ++i) {
+ struct pipe_query *query = stm->active_counters[i].query;
+ if (query && !pipe->begin_query(pipe, query))
goto fail;
}
+
+ if (stm->batch_query && !pipe->begin_query(pipe, stm->batch_query))
+ goto fail;
+
return true;
fail:
@@ -201,11 +204,17 @@ st_EndPerfMonitor(struct gl_context *ctx, struct gl_perf_monitor_object *m)
{
struct st_perf_monitor_object *stm = st_perf_monitor_object(m);
struct pipe_context *pipe = st_context(ctx)->pipe;
- struct st_perf_counter_object *cntr;
+ unsigned i;
/* Stop the query for each active counter. */
- LIST_FOR_EACH_ENTRY(cntr, &stm->active_counters, list)
- pipe->end_query(pipe, cntr->query);
+ for (i = 0; i < stm->num_active_counters; ++i) {
+ struct pipe_query *query = stm->active_counters[i].query;
+ if (query)
+ pipe->end_query(pipe, query);
+ }
+
+ if (stm->batch_query)
+ pipe->end_query(pipe, stm->batch_query);
}
static void
@@ -229,20 +238,26 @@ st_IsPerfMonitorResultAvailable(struct gl_context *ctx,
{
struct st_perf_monitor_object *stm = st_perf_monitor_object(m);
struct pipe_context *pipe = st_context(ctx)->pipe;
- struct st_perf_counter_object *cntr;
+ unsigned i;
- if (LIST_IS_EMPTY(&stm->active_counters))
+ if (!stm->num_active_counters)
return false;
/* The result of a monitoring session is only available if the query of
* each active counter is idle. */
- LIST_FOR_EACH_ENTRY(cntr, &stm->active_counters, list) {
+ for (i = 0; i < stm->num_active_counters; ++i) {
+ struct pipe_query *query = stm->active_counters[i].query;
union pipe_query_result result;
- if (!pipe->get_query_result(pipe, cntr->query, FALSE, &result)) {
+ if (query && !pipe->get_query_result(pipe, query, FALSE, &result)) {
/* The query is busy. */
return false;
}
}
+
+ if (stm->batch_query &&
+ !pipe->get_query_result(pipe, stm->batch_query, FALSE, stm->batch_result))
+ return false;
+
return true;
}
@@ -255,7 +270,7 @@ st_GetPerfMonitorResult(struct gl_context *ctx,
{
struct st_perf_monitor_object *stm = st_perf_monitor_object(m);
struct pipe_context *pipe = st_context(ctx)->pipe;
- struct st_perf_counter_object *cntr;
+ unsigned i;
/* Copy data to the supplied array (data).
*
@@ -263,9 +278,15 @@ st_GetPerfMonitorResult(struct gl_context *ctx,
* active counter. The API allows counters to appear in any order.
*/
GLsizei offset = 0;
+ bool have_batch_query = false;
+
+ if (stm->batch_query)
+ have_batch_query = pipe->get_query_result(pipe, stm->batch_query, TRUE,
+ stm->batch_result);
/* Read query results for each active counter. */
- LIST_FOR_EACH_ENTRY(cntr, &stm->active_counters, list) {
+ for (i = 0; i < stm->num_active_counters; ++i) {
+ struct st_perf_counter_object *cntr = &stm->active_counters[i];
union pipe_query_result result = { 0 };
int gid, cid;
GLenum type;
@@ -274,8 +295,14 @@ st_GetPerfMonitorResult(struct gl_context *ctx,
gid = cntr->group_id;
type = ctx->PerfMonitor.Groups[gid].Counters[cid].Type;
- if (!pipe->get_query_result(pipe, cntr->query, TRUE, &result))
- continue;
+ if (cntr->query) {
+ if (!pipe->get_query_result(pipe, cntr->query, TRUE, &result))
+ continue;
+ } else {
+ if (!have_batch_query)
+ continue;
+ result.batch[0] = stm->batch_result->batch[cntr->batch_index];
+ }
data[offset++] = gid;
data[offset++] = cid;
@@ -307,18 +334,13 @@ st_init_perfmon(struct st_context *st)
struct gl_perf_monitor_state *perfmon = &st->ctx->PerfMonitor;
struct pipe_screen *screen = st->pipe->screen;
struct gl_perf_monitor_group *groups = NULL;
+ struct st_perf_monitor_group *stgroups = NULL;
int num_counters, num_groups;
int gid, cid;
if (!screen->get_driver_query_info || !screen->get_driver_query_group_info)
return false;
- if (!has_gpu_counters(screen)) {
- /* According to the spec, GL_AMD_performance_monitor must only
- * expose GPU counters. */
- return false;
- }
-
/* Get the number of available queries. */
num_counters = screen->get_driver_query_info(screen, 0, NULL);
if (!num_counters)
@@ -331,29 +353,37 @@ st_init_perfmon(struct st_context *st)
if (!groups)
return false;
+ stgroups = CALLOC(num_groups, sizeof(*stgroups));
+ if (!stgroups)
+ goto fail_only_groups;
+
for (gid = 0; gid < num_groups; gid++) {
struct gl_perf_monitor_group *g = &groups[perfmon->NumGroups];
+ struct st_perf_monitor_group *stg = &stgroups[perfmon->NumGroups];
struct pipe_driver_query_group_info group_info;
struct gl_perf_monitor_counter *counters = NULL;
+ struct st_perf_monitor_counter *stcounters = NULL;
if (!screen->get_driver_query_group_info(screen, gid, &group_info))
continue;
- if (group_info.type != PIPE_DRIVER_QUERY_GROUP_TYPE_GPU)
- continue;
-
g->Name = group_info.name;
g->MaxActiveCounters = group_info.max_active_queries;
- g->NumCounters = 0;
- g->Counters = NULL;
if (group_info.num_queries)
counters = CALLOC(group_info.num_queries, sizeof(*counters));
if (!counters)
goto fail;
+ g->Counters = counters;
+
+ stcounters = CALLOC(group_info.num_queries, sizeof(*stcounters));
+ if (!stcounters)
+ goto fail;
+ stg->counters = stcounters;
for (cid = 0; cid < num_counters; cid++) {
struct gl_perf_monitor_counter *c = &counters[g->NumCounters];
+ struct st_perf_monitor_counter *stc = &stcounters[g->NumCounters];
struct pipe_driver_query_info info;
if (!screen->get_driver_query_info(screen, cid, &info))
@@ -364,6 +394,9 @@ st_init_perfmon(struct st_context *st)
c->Name = info.name;
switch (info.type) {
case PIPE_DRIVER_QUERY_TYPE_UINT64:
+ case PIPE_DRIVER_QUERY_TYPE_BYTES:
+ case PIPE_DRIVER_QUERY_TYPE_MICROSECONDS:
+ case PIPE_DRIVER_QUERY_TYPE_HZ:
c->Minimum.u64 = 0;
c->Maximum.u64 = info.max_value.u64 ? info.max_value.u64 : -1;
c->Type = GL_UNSIGNED_INT64_AMD;
@@ -386,18 +419,28 @@ st_init_perfmon(struct st_context *st)
default:
unreachable("Invalid driver query type!");
}
+
+ stc->query_type = info.query_type;
+ stc->flags = info.flags;
+ if (stc->flags & PIPE_DRIVER_QUERY_FLAG_BATCH)
+ stg->has_batch = true;
+
g->NumCounters++;
}
- g->Counters = counters;
perfmon->NumGroups++;
}
perfmon->Groups = groups;
+ st->perfmon = stgroups;
return true;
fail:
- for (gid = 0; gid < num_groups; gid++)
+ for (gid = 0; gid < num_groups; gid++) {
+ FREE(stgroups[gid].counters);
FREE((void *)groups[gid].Counters);
+ }
+ FREE(stgroups);
+fail_only_groups:
FREE(groups);
return false;
}
@@ -408,8 +451,11 @@ st_destroy_perfmon(struct st_context *st)
struct gl_perf_monitor_state *perfmon = &st->ctx->PerfMonitor;
int gid;
- for (gid = 0; gid < perfmon->NumGroups; gid++)
+ for (gid = 0; gid < perfmon->NumGroups; gid++) {
+ FREE(st->perfmon[gid].counters);
FREE((void *)perfmon->Groups[gid].Counters);
+ }
+ FREE(st->perfmon);
FREE((void *)perfmon->Groups);
}
diff --git a/src/mesa/state_tracker/st_cb_perfmon.h b/src/mesa/state_tracker/st_cb_perfmon.h
index 0b195de47fe..29732866bf8 100644
--- a/src/mesa/state_tracker/st_cb_perfmon.h
+++ b/src/mesa/state_tracker/st_cb_perfmon.h
@@ -26,21 +26,41 @@
#include "util/list.h"
+struct st_perf_counter_object
+{
+ struct pipe_query *query;
+ int id;
+ int group_id;
+ unsigned batch_index;
+};
+
/**
* Subclass of gl_perf_monitor_object
*/
struct st_perf_monitor_object
{
struct gl_perf_monitor_object base;
- struct list_head active_counters;
+ unsigned num_active_counters;
+ struct st_perf_counter_object *active_counters;
+
+ struct pipe_query *batch_query;
+ union pipe_query_result *batch_result;
};
-struct st_perf_counter_object
+/**
+ * Extra data per counter, supplementing gl_perf_monitor_counter with
+ * driver-specific information.
+ */
+struct st_perf_monitor_counter
{
- struct list_head list;
- struct pipe_query *query;
- int id;
- int group_id;
+ unsigned query_type;
+ unsigned flags;
+};
+
+struct st_perf_monitor_group
+{
+ struct st_perf_monitor_counter *counters;
+ bool has_batch;
};
/**
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index c243f5cd966..60a9a4bb0d5 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -46,6 +46,7 @@ struct draw_stage;
struct gen_mipmap_state;
struct st_context;
struct st_fragment_program;
+struct st_perf_monitor_group;
struct u_upload_mgr;
@@ -217,6 +218,8 @@ struct st_context
int32_t read_stamp;
struct st_config_options options;
+
+ struct st_perf_monitor_group *perfmon;
};
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 99e96e1f3ae..a2418e28a91 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -343,7 +343,7 @@ struct st_extension_cap_mapping {
struct st_extension_format_mapping {
int extension_offset[2];
- enum pipe_format format[8];
+ enum pipe_format format[32];
/* If TRUE, at least one format must be supported for the extensions to be
* advertised. If FALSE, all the formats must be supported. */
@@ -569,6 +569,36 @@ void st_init_extensions(struct pipe_screen *screen,
PIPE_FORMAT_BPTC_RGB_FLOAT,
PIPE_FORMAT_BPTC_RGB_UFLOAT } },
+ { { o(KHR_texture_compression_astc_ldr) },
+ { PIPE_FORMAT_ASTC_4x4,
+ PIPE_FORMAT_ASTC_5x4,
+ PIPE_FORMAT_ASTC_5x5,
+ PIPE_FORMAT_ASTC_6x5,
+ PIPE_FORMAT_ASTC_6x6,
+ PIPE_FORMAT_ASTC_8x5,
+ PIPE_FORMAT_ASTC_8x6,
+ PIPE_FORMAT_ASTC_8x8,
+ PIPE_FORMAT_ASTC_10x5,
+ PIPE_FORMAT_ASTC_10x6,
+ PIPE_FORMAT_ASTC_10x8,
+ PIPE_FORMAT_ASTC_10x10,
+ PIPE_FORMAT_ASTC_12x10,
+ PIPE_FORMAT_ASTC_12x12,
+ PIPE_FORMAT_ASTC_4x4_SRGB,
+ PIPE_FORMAT_ASTC_5x4_SRGB,
+ PIPE_FORMAT_ASTC_5x5_SRGB,
+ PIPE_FORMAT_ASTC_6x5_SRGB,
+ PIPE_FORMAT_ASTC_6x6_SRGB,
+ PIPE_FORMAT_ASTC_8x5_SRGB,
+ PIPE_FORMAT_ASTC_8x6_SRGB,
+ PIPE_FORMAT_ASTC_8x8_SRGB,
+ PIPE_FORMAT_ASTC_10x5_SRGB,
+ PIPE_FORMAT_ASTC_10x6_SRGB,
+ PIPE_FORMAT_ASTC_10x8_SRGB,
+ PIPE_FORMAT_ASTC_10x10_SRGB,
+ PIPE_FORMAT_ASTC_12x10_SRGB,
+ PIPE_FORMAT_ASTC_12x12_SRGB } },
+
{ { o(EXT_texture_shared_exponent) },
{ PIPE_FORMAT_R9G9B9E5_FLOAT } },
diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c
index 144b7d6f659..2b92bade440 100644
--- a/src/mesa/state_tracker/st_format.c
+++ b/src/mesa/state_tracker/st_format.c
@@ -465,6 +465,64 @@ st_mesa_format_to_pipe_format(struct st_context *st, mesa_format mesaFormat)
case MESA_FORMAT_ETC2_SRGB8_PUNCHTHROUGH_ALPHA1:
return st->has_etc2 ? PIPE_FORMAT_ETC2_SRGB8A1 : PIPE_FORMAT_B8G8R8A8_SRGB;
+ case MESA_FORMAT_RGBA_ASTC_4x4:
+ return PIPE_FORMAT_ASTC_4x4;
+ case MESA_FORMAT_RGBA_ASTC_5x4:
+ return PIPE_FORMAT_ASTC_5x4;
+ case MESA_FORMAT_RGBA_ASTC_5x5:
+ return PIPE_FORMAT_ASTC_5x5;
+ case MESA_FORMAT_RGBA_ASTC_6x5:
+ return PIPE_FORMAT_ASTC_6x5;
+ case MESA_FORMAT_RGBA_ASTC_6x6:
+ return PIPE_FORMAT_ASTC_6x6;
+ case MESA_FORMAT_RGBA_ASTC_8x5:
+ return PIPE_FORMAT_ASTC_8x5;
+ case MESA_FORMAT_RGBA_ASTC_8x6:
+ return PIPE_FORMAT_ASTC_8x6;
+ case MESA_FORMAT_RGBA_ASTC_8x8:
+ return PIPE_FORMAT_ASTC_8x8;
+ case MESA_FORMAT_RGBA_ASTC_10x5:
+ return PIPE_FORMAT_ASTC_10x5;
+ case MESA_FORMAT_RGBA_ASTC_10x6:
+ return PIPE_FORMAT_ASTC_10x6;
+ case MESA_FORMAT_RGBA_ASTC_10x8:
+ return PIPE_FORMAT_ASTC_10x8;
+ case MESA_FORMAT_RGBA_ASTC_10x10:
+ return PIPE_FORMAT_ASTC_10x10;
+ case MESA_FORMAT_RGBA_ASTC_12x10:
+ return PIPE_FORMAT_ASTC_12x10;
+ case MESA_FORMAT_RGBA_ASTC_12x12:
+ return PIPE_FORMAT_ASTC_12x12;
+
+ case MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4:
+ return PIPE_FORMAT_ASTC_4x4_SRGB;
+ case MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x4:
+ return PIPE_FORMAT_ASTC_5x4_SRGB;
+ case MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5:
+ return PIPE_FORMAT_ASTC_5x5_SRGB;
+ case MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x5:
+ return PIPE_FORMAT_ASTC_6x5_SRGB;
+ case MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6:
+ return PIPE_FORMAT_ASTC_6x6_SRGB;
+ case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x5:
+ return PIPE_FORMAT_ASTC_8x5_SRGB;
+ case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x6:
+ return PIPE_FORMAT_ASTC_8x6_SRGB;
+ case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x8:
+ return PIPE_FORMAT_ASTC_8x8_SRGB;
+ case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x5:
+ return PIPE_FORMAT_ASTC_10x5_SRGB;
+ case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x6:
+ return PIPE_FORMAT_ASTC_10x6_SRGB;
+ case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x8:
+ return PIPE_FORMAT_ASTC_10x8_SRGB;
+ case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x10:
+ return PIPE_FORMAT_ASTC_10x10_SRGB;
+ case MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x10:
+ return PIPE_FORMAT_ASTC_12x10_SRGB;
+ case MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x12:
+ return PIPE_FORMAT_ASTC_12x12_SRGB;
+
default:
return PIPE_FORMAT_NONE;
}
@@ -883,6 +941,64 @@ st_pipe_format_to_mesa_format(enum pipe_format format)
case PIPE_FORMAT_ETC2_RG11_SNORM:
return MESA_FORMAT_ETC2_SIGNED_RG11_EAC;
+ case PIPE_FORMAT_ASTC_4x4:
+ return MESA_FORMAT_RGBA_ASTC_4x4;
+ case PIPE_FORMAT_ASTC_5x4:
+ return MESA_FORMAT_RGBA_ASTC_5x4;
+ case PIPE_FORMAT_ASTC_5x5:
+ return MESA_FORMAT_RGBA_ASTC_5x5;
+ case PIPE_FORMAT_ASTC_6x5:
+ return MESA_FORMAT_RGBA_ASTC_6x5;
+ case PIPE_FORMAT_ASTC_6x6:
+ return MESA_FORMAT_RGBA_ASTC_6x6;
+ case PIPE_FORMAT_ASTC_8x5:
+ return MESA_FORMAT_RGBA_ASTC_8x5;
+ case PIPE_FORMAT_ASTC_8x6:
+ return MESA_FORMAT_RGBA_ASTC_8x6;
+ case PIPE_FORMAT_ASTC_8x8:
+ return MESA_FORMAT_RGBA_ASTC_8x8;
+ case PIPE_FORMAT_ASTC_10x5:
+ return MESA_FORMAT_RGBA_ASTC_10x5;
+ case PIPE_FORMAT_ASTC_10x6:
+ return MESA_FORMAT_RGBA_ASTC_10x6;
+ case PIPE_FORMAT_ASTC_10x8:
+ return MESA_FORMAT_RGBA_ASTC_10x8;
+ case PIPE_FORMAT_ASTC_10x10:
+ return MESA_FORMAT_RGBA_ASTC_10x10;
+ case PIPE_FORMAT_ASTC_12x10:
+ return MESA_FORMAT_RGBA_ASTC_12x10;
+ case PIPE_FORMAT_ASTC_12x12:
+ return MESA_FORMAT_RGBA_ASTC_12x12;
+
+ case PIPE_FORMAT_ASTC_4x4_SRGB:
+ return MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4;
+ case PIPE_FORMAT_ASTC_5x4_SRGB:
+ return MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x4;
+ case PIPE_FORMAT_ASTC_5x5_SRGB:
+ return MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5;
+ case PIPE_FORMAT_ASTC_6x5_SRGB:
+ return MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x5;
+ case PIPE_FORMAT_ASTC_6x6_SRGB:
+ return MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6;
+ case PIPE_FORMAT_ASTC_8x5_SRGB:
+ return MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x5;
+ case PIPE_FORMAT_ASTC_8x6_SRGB:
+ return MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x6;
+ case PIPE_FORMAT_ASTC_8x8_SRGB:
+ return MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x8;
+ case PIPE_FORMAT_ASTC_10x5_SRGB:
+ return MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x5;
+ case PIPE_FORMAT_ASTC_10x6_SRGB:
+ return MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x6;
+ case PIPE_FORMAT_ASTC_10x8_SRGB:
+ return MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x8;
+ case PIPE_FORMAT_ASTC_10x10_SRGB:
+ return MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x10;
+ case PIPE_FORMAT_ASTC_12x10_SRGB:
+ return MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x10;
+ case PIPE_FORMAT_ASTC_12x12_SRGB:
+ return MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x12;
+
default:
return MESA_FORMAT_NONE;
}
@@ -1386,6 +1502,121 @@ static const struct format_mapping format_map[] = {
{ PIPE_FORMAT_BPTC_RGB_UFLOAT, 0 },
},
+ /* ASTC */
+ {
+ { GL_COMPRESSED_RGBA_ASTC_4x4_KHR, 0 },
+ { PIPE_FORMAT_ASTC_4x4, 0},
+ },
+ {
+ { GL_COMPRESSED_RGBA_ASTC_5x4_KHR, 0 },
+ { PIPE_FORMAT_ASTC_5x4, 0},
+ },
+ {
+ { GL_COMPRESSED_RGBA_ASTC_5x5_KHR, 0 },
+ { PIPE_FORMAT_ASTC_5x5, 0},
+ },
+ {
+ { GL_COMPRESSED_RGBA_ASTC_6x5_KHR, 0 },
+ { PIPE_FORMAT_ASTC_6x5, 0},
+ },
+ {
+ { GL_COMPRESSED_RGBA_ASTC_6x6_KHR, 0 },
+ { PIPE_FORMAT_ASTC_6x6, 0},
+ },
+ {
+ { GL_COMPRESSED_RGBA_ASTC_8x5_KHR, 0 },
+ { PIPE_FORMAT_ASTC_8x5, 0},
+ },
+ {
+ { GL_COMPRESSED_RGBA_ASTC_8x6_KHR, 0 },
+ { PIPE_FORMAT_ASTC_8x6, 0},
+ },
+ {
+ { GL_COMPRESSED_RGBA_ASTC_8x8_KHR, 0 },
+ { PIPE_FORMAT_ASTC_8x8, 0},
+ },
+ {
+ { GL_COMPRESSED_RGBA_ASTC_10x5_KHR, 0 },
+ { PIPE_FORMAT_ASTC_10x5, 0},
+ },
+ {
+ { GL_COMPRESSED_RGBA_ASTC_10x6_KHR, 0 },
+ { PIPE_FORMAT_ASTC_10x6, 0},
+ },
+ {
+ { GL_COMPRESSED_RGBA_ASTC_10x8_KHR, 0 },
+ { PIPE_FORMAT_ASTC_10x8, 0},
+ },
+ {
+ { GL_COMPRESSED_RGBA_ASTC_10x10_KHR, 0 },
+ { PIPE_FORMAT_ASTC_10x10, 0},
+ },
+ {
+ { GL_COMPRESSED_RGBA_ASTC_12x10_KHR, 0 },
+ { PIPE_FORMAT_ASTC_12x10, 0},
+ },
+ {
+ { GL_COMPRESSED_RGBA_ASTC_12x12_KHR, 0 },
+ { PIPE_FORMAT_ASTC_12x12, 0},
+ },
+
+ {
+ { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR, 0 },
+ { PIPE_FORMAT_ASTC_4x4_SRGB, 0},
+ },
+ {
+ { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR, 0 },
+ { PIPE_FORMAT_ASTC_5x4_SRGB, 0},
+ },
+ {
+ { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR, 0 },
+ { PIPE_FORMAT_ASTC_5x5_SRGB, 0},
+ },
+ {
+ { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR, 0 },
+ { PIPE_FORMAT_ASTC_6x5_SRGB, 0},
+ },
+ {
+ { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR, 0 },
+ { PIPE_FORMAT_ASTC_6x6_SRGB, 0},
+ },
+ {
+ { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR, 0 },
+ { PIPE_FORMAT_ASTC_8x5_SRGB, 0},
+ },
+ {
+ { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR, 0 },
+ { PIPE_FORMAT_ASTC_8x6_SRGB, 0},
+ },
+ {
+ { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR, 0 },
+ { PIPE_FORMAT_ASTC_8x8_SRGB, 0},
+ },
+ {
+ { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x5_KHR, 0 },
+ { PIPE_FORMAT_ASTC_10x5_SRGB, 0},
+ },
+ {
+ { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x6_KHR, 0 },
+ { PIPE_FORMAT_ASTC_10x6_SRGB, 0},
+ },
+ {
+ { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR, 0 },
+ { PIPE_FORMAT_ASTC_10x8_SRGB, 0},
+ },
+ {
+ { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR, 0 },
+ { PIPE_FORMAT_ASTC_10x10_SRGB, 0},
+ },
+ {
+ { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, 0 },
+ { PIPE_FORMAT_ASTC_12x10_SRGB, 0},
+ },
+ {
+ { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR, 0 },
+ { PIPE_FORMAT_ASTC_12x12_SRGB, 0},
+ },
+
/* signed/unsigned integer formats.
*/
{
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 3ad1afdecda..40c77258de7 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -3236,6 +3236,8 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
case ir_texture_samples:
opcode = TGSI_OPCODE_TXQS;
break;
+ case ir_samples_identical:
+ unreachable("Unexpected ir_samples_identical opcode");
}
if (ir->projector) {
diff --git a/src/vulkan/anv_pipeline.c b/src/vulkan/anv_pipeline.c
index 1193d1e7a5d..3d9e0705626 100644
--- a/src/vulkan/anv_pipeline.c
+++ b/src/vulkan/anv_pipeline.c
@@ -125,17 +125,7 @@ bool
anv_is_scalar_shader_stage(const struct brw_compiler *compiler,
VkShaderStage stage)
{
- switch (stage) {
- case VK_SHADER_STAGE_VERTEX:
- return compiler->scalar_vs;
- case VK_SHADER_STAGE_GEOMETRY:
- return false;
- case VK_SHADER_STAGE_FRAGMENT:
- case VK_SHADER_STAGE_COMPUTE:
- return true;
- default:
- unreachable("Unsupported shader stage");
- }
+ return compiler->scalar_stage[vk_shader_stage_to_mesa_stage[stage]];
}
/* Eventually, this will become part of anv_CreateShader. Unfortunately,
@@ -187,8 +177,7 @@ anv_shader_compile_to_nir(struct anv_device *device,
}
assert(entrypoint != NULL);
- brw_preprocess_nir(nir, &device->info,
- anv_is_scalar_shader_stage(compiler, vk_stage));
+ nir = brw_preprocess_nir(nir, compiler->scalar_stage[stage]);
nir_shader_gather_info(nir, entrypoint);
@@ -411,7 +400,7 @@ anv_pipeline_compile(struct anv_pipeline *pipeline,
prog_data->binding_table.image_start = bias;
/* Finish the optimization and compilation process */
- brw_postprocess_nir(nir, &pipeline->device->info,
+ nir = brw_lower_nir(nir, &pipeline->device->info, NULL,
anv_is_scalar_shader_stage(compiler, stage));
/* nir_lower_io will only handle the push constants; we need to set this