239 files changed, 8516 insertions, 7976 deletions
diff --git a/src/mesa/Android.gen.mk b/src/mesa/Android.gen.mk
index cc979547e0a..145f2594cda 100644
--- a/src/mesa/Android.gen.mk
+++ b/src/mesa/Android.gen.mk
@@ -115,9 +115,11 @@ $(intermediates)/main/api_exec.c: $(dispatch_deps)
 
 GET_HASH_GEN := $(LOCAL_PATH)/main/get_hash_generator.py
 
+$(intermediates)/main/get_hash.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(GET_HASH_GEN)
+$(intermediates)/main/get_hash.h: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml
 $(intermediates)/main/get_hash.h: $(glapi)/gl_and_es_API.xml \
                $(LOCAL_PATH)/main/get_hash_params.py $(GET_HASH_GEN)
-	@$(MESA_PYTHON2) $(GET_HASH_GEN) -f $< > $@
+	$(call es-gen)
 
 FORMAT_INFO := $(LOCAL_PATH)/main/format_info.py
 format_info_deps := \
@@ -125,8 +127,10 @@ format_info_deps := \
 	$(LOCAL_PATH)/main/format_parser.py \
 	$(FORMAT_INFO)
 
+$(intermediates)/main/format_info.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(FORMAT_INFO)
+$(intermediates)/main/format_info.h: PRIVATE_XML :=
 $(intermediates)/main/format_info.h: $(format_info_deps)
-	@$(MESA_PYTHON2) $(FORMAT_INFO) $< > $@
+	$(call es-gen, $<)
 
 FORMAT_PACK := $(LOCAL_PATH)/main/format_pack.py
 format_pack_deps := \
@@ -134,8 +138,10 @@ format_pack_deps := \
 	$(LOCAL_PATH)/main/format_parser.py \
 	$(FORMAT_PACK)
 
+$(intermediates)/main/format_pack.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(FORMAT_PACK)
+$(intermediates)/main/format_pack.c: PRIVATE_XML :=
 $(intermediates)/main/format_pack.c: $(format_pack_deps)
-	$(hide) $(MESA_PYTHON2) $(FORMAT_PACK) $< > $@
+	$(call es-gen, $<)
 
 FORMAT_UNPACK := $(LOCAL_PATH)/main/format_unpack.py
 format_unpack_deps := \
@@ -143,5 +149,7 @@ format_unpack_deps := \
 	$(LOCAL_PATH)/main/format_parser.py \
 	$(FORMAT_UNPACK)
 
+$(intermediates)/main/format_unpack.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(FORMAT_UNPACK)
+$(intermediates)/main/format_unpack.c: PRIVATE_XML :=
 $(intermediates)/main/format_unpack.c: $(format_unpack_deps)
-	$(hide) $(MESA_PYTHON2) $(FORMAT_UNPACK) $< > $@
+	$(call es-gen, $<)
diff --git a/src/mesa/Android.libmesa_glsl_utils.mk b/src/mesa/Android.libmesa_glsl_utils.mk
index 3497377af8c..ed620ac648c 100644
--- a/src/mesa/Android.libmesa_glsl_utils.mk
+++ b/src/mesa/Android.libmesa_glsl_utils.mk
@@ -44,7 +44,8 @@ LOCAL_C_INCLUDES := \
 LOCAL_SRC_FILES := \
 	main/imports.c \
 	program/prog_hash_table.c \
-	program/symbol_table.c
+	program/symbol_table.c \
+	program/dummy_errors.c
 
 include $(MESA_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
@@ -68,7 +69,8 @@ LOCAL_C_INCLUDES := \
 LOCAL_SRC_FILES := \
 	main/imports.c \
 	program/prog_hash_table.c \
-	program/symbol_table.c
+	program/symbol_table.c \
+	program/dummy_errors.c
 
 include $(MESA_COMMON_MK)
 include $(BUILD_HOST_STATIC_LIBRARY)
diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am
index 60114e4f66a..71794b5dada 100644
--- a/src/mesa/Makefile.am
+++ b/src/mesa/Makefile.am
@@ -60,7 +60,6 @@ main/git_sha1.h: main/git_sha1.h.tmp
 include Makefile.sources
 
 EXTRA_DIST = \
-	drivers/haiku \
 	drivers/SConscript \
 	main/format_info.py \
 	main/format_pack.py \
diff --git a/src/mesa/drivers/SConscript b/src/mesa/drivers/SConscript
index db656780c0b..5d654f538be 100644
--- a/src/mesa/drivers/SConscript
+++ b/src/mesa/drivers/SConscript
@@ -8,6 +8,3 @@ if env['dri']:
         'dri/common/xmlpool/SConscript',
         'dri/common/SConscript',
     ])
-
-if env['platform'] == 'haiku':
-    SConscript('haiku/swrast/SConscript')
diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index 0d094ddf4e6..71c1a763912 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -172,7 +172,7 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->UnmapRenderbuffer = _swrast_unmap_soft_renderbuffer;
    driver->RenderTexture = _swrast_render_texture;
    driver->FinishRenderTexture = _swrast_finish_render_texture;
-   driver->FramebufferRenderbuffer = _mesa_framebuffer_renderbuffer;
+   driver->FramebufferRenderbuffer = _mesa_FramebufferRenderbuffer_sw;
    driver->ValidateFramebuffer = _mesa_validate_framebuffer;
 
    driver->BlitFramebuffer = _swrast_BlitFramebuffer;
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index d2ab7b8ded9..214a68a9129 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -1211,7 +1211,8 @@ _mesa_meta_end(struct gl_context *ctx)
       _mesa_BindRenderbuffer(GL_RENDERBUFFER, save->RenderbufferName);
 
    if (state & MESA_META_DRAW_BUFFERS) {
-      _mesa_drawbuffers(ctx, ctx->Const.MaxDrawBuffers, save->ColorDrawBuffers, NULL);
+      _mesa_drawbuffers(ctx, ctx->DrawBuffer, ctx->Const.MaxDrawBuffers,
+                        save->ColorDrawBuffers, NULL);
    }
 
    ctx->Meta->SaveStackDepth--;
diff --git a/src/mesa/drivers/common/meta_blit.c b/src/mesa/drivers/common/meta_blit.c
index bb2164276b2..9cace2b245a 100644
--- a/src/mesa/drivers/common/meta_blit.c
+++ b/src/mesa/drivers/common/meta_blit.c
@@ -82,7 +82,7 @@ setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx,
    y_scale = samples * 0.5;
 
    /* We expect only power of 2 samples in source multisample buffer. */
-   assert(samples > 0 && (samples & (samples - 1)) == 0);
+   assert(samples > 0 && is_power_of_two(samples));
    while (samples >> (shader_offset + 1)) {
       shader_offset++;
    }
@@ -263,7 +263,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
    }
 
    /* We expect only power of 2 samples in source multisample buffer. */
-   assert(samples > 0 && (samples & (samples - 1)) == 0);
+   assert(samples > 0 && is_power_of_two(samples));
    while (samples >> (shader_offset + 1)) {
       shader_offset++;
    }
@@ -434,7 +434,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
           * (so the floating point exponent just gets increased), rather than
           * doing a naive sum and dividing.
           */
-         assert((samples & (samples - 1)) == 0);
+         assert(is_power_of_two(samples));
          /* Fetch each individual sample. */
          sample_resolve = rzalloc_size(mem_ctx, 1);
          for (i = 0; i < samples; i++) {
diff --git a/src/mesa/drivers/common/meta_tex_subimage.c b/src/mesa/drivers/common/meta_tex_subimage.c
index ad6e7873ecd..d2474f52718 100644
--- a/src/mesa/drivers/common/meta_tex_subimage.c
+++ b/src/mesa/drivers/common/meta_tex_subimage.c
@@ -34,6 +34,7 @@
 #include "macros.h"
 #include "meta.h"
 #include "pbo.h"
+#include "readpix.h"
 #include "shaderapi.h"
 #include "state.h"
 #include "teximage.h"
@@ -150,7 +151,8 @@ _mesa_meta_pbo_TexSubImage(struct gl_context *ctx, GLuint dims,
    bool success = false;
    int z;
 
-   if (!_mesa_is_bufferobj(packing->BufferObj) && !create_pbo)
+   if (!_mesa_is_bufferobj(packing->BufferObj) &&
+       (!create_pbo || pixels == NULL))
       return false;
 
    if (format == GL_DEPTH_COMPONENT ||
@@ -257,6 +259,7 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
    GLuint pbo = 0, pbo_tex = 0, fbos[2] = { 0, 0 };
    int full_height, image_height;
    struct gl_texture_image *pbo_tex_image;
+   struct gl_renderbuffer *rb = NULL;
    GLenum status;
    bool success = false;
    int z;
@@ -273,6 +276,13 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
    if (ctx->_ImageTransferState)
       return false;
 
+
+   if (!tex_image) {
+      rb = ctx->ReadBuffer->_ColorReadBuffer;
+      if (_mesa_need_rgb_to_luminance_conversion(rb->Format, format))
+         return false;
+   }
+
    /* For arrays, use a tall (height * depth) 2D texture but taking into
     * account the inter-image padding specified with the image height packing
     * property.
diff --git a/src/mesa/drivers/dri/Makefile.am b/src/mesa/drivers/dri/Makefile.am
index fa1de103b56..08a8e645521 100644
--- a/src/mesa/drivers/dri/Makefile.am
+++ b/src/mesa/drivers/dri/Makefile.am
@@ -60,6 +60,7 @@ mesa_dri_drivers_la_LIBADD = \
         ../../libmesa.la \
         common/libmegadriver_stub.la \
         common/libdricommon.la \
+        common/libxmlconfig.la \
         $(MEGADRIVERS_DEPS) \
         $(DRI_LIB_DEPS) \
         $()
diff --git a/src/mesa/drivers/dri/common/Android.mk b/src/mesa/drivers/dri/common/Android.mk
index a7fcd6d572a..6986f5e8cb4 100644
--- a/src/mesa/drivers/dri/common/Android.mk
+++ b/src/mesa/drivers/dri/common/Android.mk
@@ -39,7 +39,9 @@ intermediates := $(call local-generated-sources-dir)
 LOCAL_C_INCLUDES := \
     $(MESA_DRI_C_INCLUDES)
 
-LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates)
+LOCAL_EXPORT_C_INCLUDE_DIRS := \
+    $(LOCAL_PATH) \
+    $(intermediates)
 
 # swrast only
 ifeq ($(MESA_GPU_DRIVERS),swrast)
@@ -48,7 +50,9 @@ else
 LOCAL_SHARED_LIBRARIES := libdrm
 endif
 
-LOCAL_SRC_FILES := $(DRI_COMMON_FILES)
+LOCAL_SRC_FILES := \
+	$(DRI_COMMON_FILES) \
+	$(XMLCONFIG_FILES)
 
 MESA_DRI_OPTIONS_H := $(intermediates)/xmlpool/options.h
 LOCAL_GENERATED_SOURCES := $(MESA_DRI_OPTIONS_H)
diff --git a/src/mesa/drivers/dri/common/Makefile.am b/src/mesa/drivers/dri/common/Makefile.am
index da8f97a980e..ae19fcb3565 100644
--- a/src/mesa/drivers/dri/common/Makefile.am
+++ b/src/mesa/drivers/dri/common/Makefile.am
@@ -33,16 +33,20 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary \
 	$(DEFINES) \
-	$(EXPAT_CFLAGS) \
 	$(VISIBILITY_CFLAGS)
 
 noinst_LTLIBRARIES = \
 	libdricommon.la \
+	libxmlconfig.la \
 	libmegadriver_stub.la \
 	libdri_test_stubs.la
 
 libdricommon_la_SOURCES = $(DRI_COMMON_FILES)
 
+libxmlconfig_la_SOURCES = $(XMLCONFIG_FILES)
+libxmlconfig_la_CFLAGS = $(AM_CFLAGS) $(EXPAT_CFLAGS)
+libxmlconfig_la_LIBADD = $(EXPAT_LIBS) -lm
+
 libdri_test_stubs_la_SOURCES = $(test_stubs_FILES)
 libdri_test_stubs_la_CFLAGS = $(AM_CFLAGS) -DNO_MAIN
 
diff --git a/src/mesa/drivers/dri/common/Makefile.sources b/src/mesa/drivers/dri/common/Makefile.sources
index d00ec5f7334..d5d8da8fcee 100644
--- a/src/mesa/drivers/dri/common/Makefile.sources
+++ b/src/mesa/drivers/dri/common/Makefile.sources
@@ -2,7 +2,9 @@ DRI_COMMON_FILES := \
 	utils.c \
 	utils.h \
 	dri_util.c \
-	dri_util.h \
+	dri_util.h
+
+XMLCONFIG_FILES := \
 	xmlconfig.c \
 	xmlconfig.h
 
diff --git a/src/mesa/drivers/dri/common/SConscript b/src/mesa/drivers/dri/common/SConscript
index 0bee1b41fc6..b402736db69 100644
--- a/src/mesa/drivers/dri/common/SConscript
+++ b/src/mesa/drivers/dri/common/SConscript
@@ -37,7 +37,7 @@ drienv.PkgUseModules('DRM')
 # else
 #env.Append(CPPDEFINES = ['__NOT_HAVE_DRM_H'])
 
-sources = drienv.ParseSourceList('Makefile.sources', 'DRI_COMMON_FILES')
+sources = drienv.ParseSourceList('Makefile.sources', ['DRI_COMMON_FILES', 'XMLCONFIG_FILES' ])
 
 dri_common = drienv.ConvenienceLibrary(
 	target = 'dri_common',
diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index d6e875fcfeb..e7ababe0b67 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -162,13 +162,21 @@ driCreateNewScreen2(int scrn, int fd,
 	return NULL;
     }
 
-    int gl_version_override = _mesa_get_gl_version_override();
-    if (gl_version_override >= 31) {
-       psp->max_gl_core_version = MAX2(psp->max_gl_core_version,
-                                       gl_version_override);
-    } else {
-       psp->max_gl_compat_version = MAX2(psp->max_gl_compat_version,
-                                         gl_version_override);
+    struct gl_constants consts = { 0 };
+    gl_api api;
+    unsigned version;
+
+    api = API_OPENGLES2;
+    if (_mesa_override_gl_version_contextless(&consts, &api, &version))
+       psp->max_gl_es2_version = version;
+
+    api = API_OPENGL_COMPAT;
+    if (_mesa_override_gl_version_contextless(&consts, &api, &version)) {
+       if (api == API_OPENGL_CORE) {
+          psp->max_gl_core_version = version;
+       } else {
+          psp->max_gl_compat_version = version;
+       }
     }
 
     psp->api_mask = (1 << __DRI_API_OPENGL);
diff --git a/src/mesa/drivers/dri/i915/i830_vtbl.c b/src/mesa/drivers/dri/i915/i830_vtbl.c
index 91da977acee..8ed8ff555ba 100644
--- a/src/mesa/drivers/dri/i915/i830_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i830_vtbl.c
@@ -730,9 +730,9 @@ i830_update_draw_buffer(struct intel_context *intel)
     */
    if (ctx->NewState & _NEW_BUFFERS) {
       /* this updates the DrawBuffer->_NumColorDrawBuffers fields, etc */
-      _mesa_update_framebuffer(ctx);
+      _mesa_update_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer);
       /* this updates the DrawBuffer's Width/Height if it's a FBO */
-      _mesa_update_draw_buffer_bounds(ctx);
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
    }
 
    if (fb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
diff --git a/src/mesa/drivers/dri/i915/i915_fragprog.c b/src/mesa/drivers/dri/i915/i915_fragprog.c
index 9b002236add..03c32e56d82 100644
--- a/src/mesa/drivers/dri/i915/i915_fragprog.c
+++ b/src/mesa/drivers/dri/i915/i915_fragprog.c
@@ -220,7 +220,7 @@ get_result_flags(const struct prog_instruction *inst)
 {
    GLuint flags = 0;
 
-   if (inst->SaturateMode == SATURATE_ZERO_ONE)
+   if (inst->Saturate)
       flags |= A0_DEST_SATURATE;
    if (inst->DstReg.WriteMask & WRITEMASK_X)
       flags |= A0_DEST_CHANNEL_X;
diff --git a/src/mesa/drivers/dri/i915/i915_vtbl.c b/src/mesa/drivers/dri/i915/i915_vtbl.c
index 97bf81ed759..80bd249fa7b 100644
--- a/src/mesa/drivers/dri/i915/i915_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i915_vtbl.c
@@ -732,9 +732,9 @@ i915_update_draw_buffer(struct intel_context *intel)
     */
    if (ctx->NewState & _NEW_BUFFERS) {
       /* this updates the DrawBuffer->_NumColorDrawBuffers fields, etc */
-      _mesa_update_framebuffer(ctx);
+      _mesa_update_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer);
       /* this updates the DrawBuffer's Width/Height if it's a FBO */
-      _mesa_update_draw_buffer_bounds(ctx);
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
    }
 
    if (fb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
diff --git a/src/mesa/drivers/dri/i915/intel_fbo.c b/src/mesa/drivers/dri/i915/intel_fbo.c
index 24c318049c4..a5d5c5832fb 100644
--- a/src/mesa/drivers/dri/i915/intel_fbo.c
+++ b/src/mesa/drivers/dri/i915/intel_fbo.c
@@ -427,7 +427,7 @@ intel_framebuffer_renderbuffer(struct gl_context * ctx,
 {
    DBG("Intel FramebufferRenderbuffer %u %u\n", fb->Name, rb ? rb->Name : 0);
 
-   _mesa_framebuffer_renderbuffer(ctx, fb, attachment, rb);
+   _mesa_FramebufferRenderbuffer_sw(ctx, fb, attachment, rb);
    intel_draw_buffer(ctx);
 }
 
diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am
index cf2424e34b4..9c947be88a0 100644
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -48,6 +48,7 @@ libi965_dri_la_LIBADD = $(INTEL_LIBS)
 TEST_LIBS = \
 	libi965_dri.la \
 	../common/libdricommon.la \
+	../common/libxmlconfig.la \
 	../common/libmegadriver_stub.la \
         ../../../libmesa.la \
 	$(DRI_LIB_DEPS) \
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 1ae93e1d5f3..981fe79b132 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -18,9 +18,11 @@ i965_FILES = \
 	brw_clip_unfilled.c \
 	brw_clip_util.c \
 	brw_compute.c \
+	brw_conditional_render.c \
 	brw_context.c \
 	brw_context.h \
 	brw_cs.cpp \
+	brw_cs.h \
 	brw_cubemap_normalize.cpp \
 	brw_curbe.c \
 	brw_dead_control_flow.cpp \
@@ -40,6 +42,7 @@ i965_FILES = \
 	brw_ff_gs.c \
 	brw_ff_gs_emit.c \
 	brw_ff_gs.h \
+	brw_fs_builder.h \
 	brw_fs_channel_expressions.cpp \
 	brw_fs_cmod_propagation.cpp \
 	brw_fs_combine_constants.cpp \
@@ -47,7 +50,6 @@ i965_FILES = \
 	brw_fs.cpp \
 	brw_fs_cse.cpp \
 	brw_fs_dead_code_eliminate.cpp \
-	brw_fs_fp.cpp \
 	brw_fs_generator.cpp \
 	brw_fs.h \
 	brw_fs_live_variables.cpp \
@@ -128,6 +130,7 @@ i965_FILES = \
 	brw_vs.h \
 	brw_vs_state.c \
 	brw_vs_surface_state.c \
+	brw_vue_map.c \
 	brw_wm.c \
 	brw_wm.h \
 	brw_wm_iz.cpp \
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
index c1b760920d9..789520c7353 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
@@ -29,7 +29,8 @@
 brw_blorp_eu_emitter::brw_blorp_eu_emitter(struct brw_context *brw,
                                            bool debug_flag)
    : mem_ctx(ralloc_context(NULL)),
-     generator(brw, mem_ctx, (void *) rzalloc(mem_ctx, struct brw_wm_prog_key),
+     generator(brw->intelScreen->compiler, brw,
+               mem_ctx, (void *) rzalloc(mem_ctx, struct brw_wm_prog_key),
                (struct brw_stage_prog_data *) rzalloc(mem_ctx, struct brw_wm_prog_data),
                NULL, 0, false, "BLORP")
 {
diff --git a/src/mesa/drivers/dri/i965/brw_cfg.cpp b/src/mesa/drivers/dri/i965/brw_cfg.cpp
index 7e7770e43cd..f1f230e3751 100644
--- a/src/mesa/drivers/dri/i965/brw_cfg.cpp
+++ b/src/mesa/drivers/dri/i965/brw_cfg.cpp
@@ -141,12 +141,12 @@ bblock_t::combine_with(bblock_t *that)
 }
 
 void
-bblock_t::dump(backend_visitor *v) const
+bblock_t::dump(backend_shader *s) const
 {
    int ip = this->start_ip;
    foreach_inst_in_block(backend_instruction, inst, this) {
       fprintf(stderr, "%5d: ", ip);
-      v->dump_instruction(inst);
+      s->dump_instruction(inst);
       ip++;
    }
 }
@@ -231,6 +231,7 @@ cfg_t::cfg_t(exec_list *instructions)
          if (cur_else) {
             cur_else->add_successor(mem_ctx, cur_endif);
          } else {
+            assert(cur_if != NULL);
             cur_if->add_successor(mem_ctx, cur_endif);
          }
 
@@ -299,6 +300,7 @@ cfg_t::cfg_t(exec_list *instructions)
          inst->exec_node::remove();
          cur->instructions.push_tail(inst);
 
+         assert(cur_do != NULL && cur_while != NULL);
 	 cur->add_successor(mem_ctx, cur_do);
 	 set_next_block(&cur, cur_while, ip);
 
@@ -411,7 +413,7 @@ cfg_t::make_block_array()
 }
 
 void
-cfg_t::dump(backend_visitor *v)
+cfg_t::dump(backend_shader *s)
 {
    if (idom_dirty)
       calculate_idom();
@@ -423,8 +425,8 @@ cfg_t::dump(backend_visitor *v)
                  link->block->num);
       }
       fprintf(stderr, "\n");
-      if (v != NULL)
-         block->dump(v);
+      if (s != NULL)
+         block->dump(s);
       fprintf(stderr, "END B%d", block->num);
       foreach_list_typed(bblock_link, link, link, &block->children) {
          fprintf(stderr, " ->B%d",
diff --git a/src/mesa/drivers/dri/i965/brw_cfg.h b/src/mesa/drivers/dri/i965/brw_cfg.h
index 56d7d07abdf..a09491781e6 100644
--- a/src/mesa/drivers/dri/i965/brw_cfg.h
+++ b/src/mesa/drivers/dri/i965/brw_cfg.h
@@ -60,7 +60,7 @@ struct bblock_t {
    bool is_successor_of(const bblock_t *block) const;
    bool can_combine_with(const bblock_t *that) const;
    void combine_with(bblock_t *that);
-   void dump(backend_visitor *v) const;
+   void dump(backend_shader *s) const;
 
    backend_instruction *start();
    const backend_instruction *start() const;
@@ -273,7 +273,7 @@ struct cfg_t {
    void calculate_idom();
    static bblock_t *intersect(bblock_t *b1, bblock_t *b2);
 
-   void dump(backend_visitor *v);
+   void dump(backend_shader *s);
    void dump_cfg();
    void dump_domtree();
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_clear.c b/src/mesa/drivers/dri/i965/brw_clear.c
index 12314204803..1d4ba3cac7e 100644
--- a/src/mesa/drivers/dri/i965/brw_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_clear.c
@@ -121,8 +121,9 @@ brw_fast_clear_depth(struct gl_context *ctx)
     * first.
     */
    if ((ctx->Scissor.EnableFlags & 1) && !noop_scissor(ctx, fb)) {
-      perf_debug("Failed to fast clear depth due to scissor being enabled.  "
-                 "Possible 5%% performance win if avoided.\n");
+      perf_debug("Failed to fast clear %dx%d depth because of scissors.  "
+                 "Possible 5%% performance win if avoided.\n",
+                 mt->logical_width0, mt->logical_height0);
       return false;
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_clip_state.c b/src/mesa/drivers/dri/i965/brw_clip_state.c
index 32238341aae..dee74dba8af 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_state.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_state.c
@@ -32,6 +32,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "main/framebuffer.h"
 
 static void
 upload_clip_vp(struct brw_context *brw)
@@ -59,7 +60,9 @@ brw_upload_clip_unit(struct brw_context *brw)
    struct brw_clip_unit_state *clip;
 
    /* _NEW_BUFFERS */
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
+   const struct gl_framebuffer *fb = ctx->DrawBuffer;
+   const float fb_width = (float)_mesa_geometric_width(fb);
+   const float fb_height = (float)_mesa_geometric_height(fb);
 
    upload_clip_vp(brw);
 
@@ -127,8 +130,8 @@ brw_upload_clip_unit(struct brw_context *brw)
    /* enable guardband clipping if we can */
    if (ctx->ViewportArray[0].X == 0 &&
        ctx->ViewportArray[0].Y == 0 &&
-       ctx->ViewportArray[0].Width == (float) fb->Width &&
-       ctx->ViewportArray[0].Height == (float) fb->Height)
+       ctx->ViewportArray[0].Width == fb_width &&
+       ctx->ViewportArray[0].Height == fb_height)
    {
       clip->clip5.guard_band_enable = 1;
       clip->clip6.clipper_viewport_state_ptr =
diff --git a/src/mesa/drivers/dri/i965/brw_compute.c b/src/mesa/drivers/dri/i965/brw_compute.c
index b3d6de51adc..5693ab507d4 100644
--- a/src/mesa/drivers/dri/i965/brw_compute.c
+++ b/src/mesa/drivers/dri/i965/brw_compute.c
@@ -45,7 +45,7 @@ brw_emit_gpgpu_walker(struct brw_context *brw, const GLuint *num_groups)
    unsigned thread_width_max =
       (group_size + simd_size - 1) / simd_size;
 
-   uint32_t right_mask = (1u << simd_size) - 1;
+   uint32_t right_mask = 0xffffffffu >> (32 - simd_size);
    const unsigned right_non_aligned = group_size & (simd_size - 1);
    if (right_non_aligned != 0)
       right_mask >>= (simd_size - right_non_aligned);
diff --git a/src/mesa/drivers/dri/i965/brw_conditional_render.c b/src/mesa/drivers/dri/i965/brw_conditional_render.c
new file mode 100644
index 00000000000..6d37c3b6928
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_conditional_render.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Neil Roberts <[email protected]>
+ */
+
+/** @file brw_conditional_render.c
+ *
+ * Support for conditional rendering based on query objects
+ * (GL_NV_conditional_render, GL_ARB_conditional_render_inverted) on Gen7+.
+ */
+
+#include "main/imports.h"
+#include "main/condrender.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "intel_batchbuffer.h"
+
+static void
+set_predicate_enable(struct brw_context *brw,
+                     bool value)
+{
+   if (value)
+      brw->predicate.state = BRW_PREDICATE_STATE_RENDER;
+   else
+      brw->predicate.state = BRW_PREDICATE_STATE_DONT_RENDER;
+}
+
+static void
+set_predicate_for_result(struct brw_context *brw,
+                         struct brw_query_object *query,
+                         bool inverted)
+{
+   int load_op;
+
+   assert(query->bo != NULL);
+
+   brw_load_register_mem64(brw,
+                           MI_PREDICATE_SRC0,
+                           query->bo,
+                           I915_GEM_DOMAIN_INSTRUCTION,
+                           0, /* write domain */
+                           0 /* offset */);
+   brw_load_register_mem64(brw,
+                           MI_PREDICATE_SRC1,
+                           query->bo,
+                           I915_GEM_DOMAIN_INSTRUCTION,
+                           0, /* write domain */
+                           8 /* offset */);
+
+   if (inverted)
+      load_op = MI_PREDICATE_LOADOP_LOAD;
+   else
+      load_op = MI_PREDICATE_LOADOP_LOADINV;
+
+   BEGIN_BATCH(1);
+   OUT_BATCH(GEN7_MI_PREDICATE |
+             load_op |
+             MI_PREDICATE_COMBINEOP_SET |
+             MI_PREDICATE_COMPAREOP_SRCS_EQUAL);
+   ADVANCE_BATCH();
+
+   brw->predicate.state = BRW_PREDICATE_STATE_USE_BIT;
+}
+
+static void
+brw_begin_conditional_render(struct gl_context *ctx,
+                             struct gl_query_object *q,
+                             GLenum mode)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_query_object *query = (struct brw_query_object *) q;
+   bool inverted;
+
+   if (!brw->predicate.supported)
+      return;
+
+   switch (mode) {
+   case GL_QUERY_WAIT:
+   case GL_QUERY_NO_WAIT:
+   case GL_QUERY_BY_REGION_WAIT:
+   case GL_QUERY_BY_REGION_NO_WAIT:
+      inverted = false;
+      break;
+   case GL_QUERY_WAIT_INVERTED:
+   case GL_QUERY_NO_WAIT_INVERTED:
+   case GL_QUERY_BY_REGION_WAIT_INVERTED:
+   case GL_QUERY_BY_REGION_NO_WAIT_INVERTED:
+      inverted = true;
+      break;
+   default:
+      unreachable("Unexpected conditional render mode");
+   }
+
+   /* If there are already samples from a BLT operation or if the query object
+    * is ready then we can avoid looking at the values in the buffer and just
+    * decide whether to draw using the CPU without stalling.
+    */
+   if (query->Base.Result || query->Base.Ready)
+      set_predicate_enable(brw, (query->Base.Result != 0) ^ inverted);
+   else
+      set_predicate_for_result(brw, query, inverted);
+}
+
+static void
+brw_end_conditional_render(struct gl_context *ctx,
+                           struct gl_query_object *q)
+{
+   struct brw_context *brw = brw_context(ctx);
+
+   /* When there is no longer a conditional render in progress it should
+    * always render.
+    */
+   brw->predicate.state = BRW_PREDICATE_STATE_RENDER;
+}
+
+void
+brw_init_conditional_render_functions(struct dd_function_table *functions)
+{
+   functions->BeginConditionalRender = brw_begin_conditional_render;
+   functions->EndConditionalRender = brw_end_conditional_render;
+}
+
+bool
+brw_check_conditional_render(struct brw_context *brw)
+{
+   if (brw->predicate.supported) {
+      /* In some cases it is possible to determine that the primitives should
+       * be skipped without needing the predicate enable bit and still without
+       * stalling.
+       */
+      return brw->predicate.state != BRW_PREDICATE_STATE_DONT_RENDER;
+   } else if (brw->ctx.Query.CondRenderQuery) {
+      perf_debug("Conditional rendering is implemented in software and may "
+                 "stall.\n");
+      return _mesa_check_conditional_render(&brw->ctx);
+   } else {
+      return true;
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 23838056690..ebf12fab69e 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -50,6 +50,7 @@
 
 #include "brw_context.h"
 #include "brw_defines.h"
+#include "brw_shader.h"
 #include "brw_draw.h"
 #include "brw_state.h"
 
@@ -68,8 +69,6 @@
 #include "tnl/t_pipeline.h"
 #include "util/ralloc.h"
 
-#include "glsl/nir/nir.h"
-
 /***************************************
  * Mesa's Driver Functions
  ***************************************/
@@ -289,6 +288,8 @@ brw_init_driver_functions(struct brw_context *brw,
    else
       gen4_init_queryobj_functions(functions);
    brw_init_compute_functions(functions);
+   if (brw->gen >= 7)
+      brw_init_conditional_render_functions(functions);
 
    functions->QuerySamplesForFormat = brw_query_samples_for_format;
 
@@ -427,11 +428,7 @@ brw_initialize_context_constants(struct brw_context *brw)
 
    ctx->Const.MinLineWidth = 1.0;
    ctx->Const.MinLineWidthAA = 1.0;
-   if (brw->gen >= 9 || brw->is_cherryview) {
-      ctx->Const.MaxLineWidth = 40.0;
-      ctx->Const.MaxLineWidthAA = 40.0;
-      ctx->Const.LineWidthGranularity = 0.125;
-   } else if (brw->gen >= 6) {
+   if (brw->gen >= 6) {
       ctx->Const.MaxLineWidth = 7.375;
       ctx->Const.MaxLineWidthAA = 7.375;
       ctx->Const.LineWidthGranularity = 0.125;
@@ -441,6 +438,13 @@ brw_initialize_context_constants(struct brw_context *brw)
       ctx->Const.LineWidthGranularity = 0.5;
    }
 
+   /* For non-antialiased lines, we have to round the line width to the
+    * nearest whole number. Make sure that we don't advertise a line
+    * width that, when rounded, will be beyond the actual hardware
+    * maximum.
+    */
+   assert(roundf(ctx->Const.MaxLineWidth) <= ctx->Const.MaxLineWidth);
+
    ctx->Const.MinPointSize = 1.0;
    ctx->Const.MinPointSizeAA = 1.0;
    ctx->Const.MaxPointSize = 255.0;
@@ -544,6 +548,7 @@ brw_initialize_context_constants(struct brw_context *brw)
     */
    ctx->Const.UniformBufferOffsetAlignment = 16;
    ctx->Const.TextureBufferOffsetAlignment = 16;
+   ctx->Const.MaxTextureBufferSize = 128 * 1024 * 1024;
 
    if (brw->gen >= 6) {
       ctx->Const.MaxVarying = 32;
@@ -553,51 +558,12 @@ brw_initialize_context_constants(struct brw_context *brw)
       ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxInputComponents = 128;
    }
 
-   static const nir_shader_compiler_options nir_options = {
-      .native_integers = true,
-      /* In order to help allow for better CSE at the NIR level we tell NIR
-       * to split all ffma instructions during opt_algebraic and we then
-       * re-combine them as a later step.
-       */
-      .lower_ffma = true,
-      .lower_sub = true,
-   };
-
    /* We want the GLSL compiler to emit code that uses condition codes */
    for (int i = 0; i < MESA_SHADER_STAGES; i++) {
-      ctx->Const.ShaderCompilerOptions[i].MaxIfDepth = brw->gen < 6 ? 16 : UINT_MAX;
-      ctx->Const.ShaderCompilerOptions[i].EmitCondCodes = true;
-      ctx->Const.ShaderCompilerOptions[i].EmitNoNoise = true;
-      ctx->Const.ShaderCompilerOptions[i].EmitNoMainReturn = true;
-      ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectInput = true;
-      ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectOutput =
-	 (i == MESA_SHADER_FRAGMENT);
-      ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectTemp =
-	 (i == MESA_SHADER_FRAGMENT);
-      ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectUniform = false;
-      ctx->Const.ShaderCompilerOptions[i].LowerClipDistance = true;
+      ctx->Const.ShaderCompilerOptions[i] =
+         brw->intelScreen->compiler->glsl_compiler_options[i];
    }
 
-   ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].OptimizeForAOS = true;
-   ctx->Const.ShaderCompilerOptions[MESA_SHADER_GEOMETRY].OptimizeForAOS = true;
-
-   if (brw->scalar_vs) {
-      /* If we're using the scalar backend for vertex shaders, we need to
-       * configure these accordingly.
-       */
-      ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoIndirectOutput = true;
-      ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoIndirectTemp = true;
-      ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].OptimizeForAOS = false;
-
-      if (brw_env_var_as_boolean("INTEL_USE_NIR", true))
-         ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions = &nir_options;
-   }
-
-   if (brw_env_var_as_boolean("INTEL_USE_NIR", true))
-      ctx->Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions = &nir_options;
-
-   ctx->Const.ShaderCompilerOptions[MESA_SHADER_COMPUTE].NirOptions = &nir_options;
-
    /* ARB_viewport_array */
    if (brw->gen >= 6 && ctx->API == API_OPENGL_CORE) {
       ctx->Const.MaxViewports = GEN6_NUM_VIEWPORTS;
@@ -612,6 +578,12 @@ brw_initialize_context_constants(struct brw_context *brw)
    /* ARB_gpu_shader5 */
    if (brw->gen >= 7)
       ctx->Const.MaxVertexStreams = MIN2(4, MAX_VERTEX_STREAMS);
+
+   /* ARB_framebuffer_no_attachments */
+   ctx->Const.MaxFramebufferWidth = ctx->Const.MaxViewportWidth;
+   ctx->Const.MaxFramebufferHeight = ctx->Const.MaxViewportHeight;
+   ctx->Const.MaxFramebufferLayers = ctx->Const.MaxArrayTextureLayers;
+   ctx->Const.MaxFramebufferSamples = max_samples;
 }
 
 static void
@@ -814,10 +786,9 @@ brwCreateContext(gl_api api,
    _mesa_meta_init(ctx);
 
    brw_process_driconf_options(brw);
-   brw_process_intel_debug_variable(brw);
 
-   if (brw->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS))
-      brw->scalar_vs = true;
+   if (INTEL_DEBUG & DEBUG_PERF)
+      brw->perf_debug = true;
 
    brw_initialize_context_constants(brw);
 
@@ -894,6 +865,8 @@ brwCreateContext(gl_api api,
    brw->gs.enabled = false;
    brw->sf.viewport_transform_enable = true;
 
+   brw->predicate.state = BRW_PREDICATE_STATE_RENDER;
+
    ctx->VertexProgram._MaintainTnlProgram = true;
    ctx->FragmentProgram._MaintainTexEnvProgram = true;
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index cb4cc7fb36b..9e1f722df9e 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -611,6 +611,12 @@ struct brw_ff_gs_prog_data {
    unsigned svbi_postincrement_value;
 };
 
+enum shader_dispatch_mode {
+   DISPATCH_MODE_4X1_SINGLE = 0,
+   DISPATCH_MODE_4X2_DUAL_INSTANCE = 1,
+   DISPATCH_MODE_4X2_DUAL_OBJECT = 2,
+   DISPATCH_MODE_SIMD8 = 3,
+};
 
 /* Note: brw_vue_prog_data_compare() must be updated when adding fields to
  * this struct!
@@ -628,7 +634,7 @@ struct brw_vue_prog_data {
     */
    GLuint urb_entry_size;
 
-   bool simd8;
+   enum shader_dispatch_mode dispatch_mode;
 };
 
 
@@ -726,14 +732,6 @@ struct brw_gs_prog_data
    int invocations;
 
    /**
-    * Dispatch mode, can be any of:
-    * GEN7_GS_DISPATCH_MODE_DUAL_OBJECT
-    * GEN7_GS_DISPATCH_MODE_DUAL_INSTANCE
-    * GEN7_GS_DISPATCH_MODE_SINGLE
-    */
-   int dispatch_mode;
-
-   /**
     * Gen6 transform feedback enabled flag.
     */
    bool gen6_xfb_enabled;
@@ -829,20 +827,10 @@ struct brw_tracked_state {
 enum shader_time_shader_type {
    ST_NONE,
    ST_VS,
-   ST_VS_WRITTEN,
-   ST_VS_RESET,
    ST_GS,
-   ST_GS_WRITTEN,
-   ST_GS_RESET,
    ST_FS8,
-   ST_FS8_WRITTEN,
-   ST_FS8_RESET,
    ST_FS16,
-   ST_FS16_WRITTEN,
-   ST_FS16_RESET,
    ST_CS,
-   ST_CS_WRITTEN,
-   ST_CS_RESET,
 };
 
 struct brw_vertex_buffer {
@@ -972,6 +960,22 @@ struct brw_stage_state
    uint32_t sampler_offset;
 };
 
+enum brw_predicate_state {
+   /* The first two states are used if we can determine whether to draw
+    * without having to look at the values in the query object buffer. This
+    * will happen if there is no conditional render in progress, if the query
+    * object is already completed or if something else has already added
+    * samples to the preliminary result such as via a BLT command.
+    */
+   BRW_PREDICATE_STATE_RENDER,
+   BRW_PREDICATE_STATE_DONT_RENDER,
+   /* In this case whether to draw or not depends on the result of an
+    * MI_PREDICATE command so the predicate enable bit needs to be checked.
+    */
+   BRW_PREDICATE_STATE_USE_BIT
+};
+
+struct shader_times;
 
 /**
  * brw_context is derived from gl_context.
@@ -1131,7 +1135,6 @@ struct brw_context
    bool has_pln;
    bool no_simd8;
    bool use_rep_send;
-   bool scalar_vs;
 
    /**
     * Some versions of Gen hardware don't do centroid interpolation correctly
@@ -1408,6 +1411,11 @@ struct brw_context
    } query;
 
    struct {
+      enum brw_predicate_state state;
+      bool supported;
+   } predicate;
+
+   struct {
       /** A map from pipeline statistics counter IDs to MMIO addresses. */
       const int *statistics_registers;
 
@@ -1453,6 +1461,7 @@ struct brw_context
       uint32_t offset;
       uint32_t size;
       enum aub_state_struct_type type;
+      int index;
    } *state_batch_list;
    int state_batch_count;
 
@@ -1492,7 +1501,7 @@ struct brw_context
       const char **names;
       int *ids;
       enum shader_time_shader_type *types;
-      uint64_t *cumulative;
+      struct shader_times *cumulative;
       int num_entries;
       int max_entries;
       double report_time;
@@ -1606,12 +1615,21 @@ void brw_write_depth_count(struct brw_context *brw, drm_intel_bo *bo, int idx);
 void brw_store_register_mem64(struct brw_context *brw,
                               drm_intel_bo *bo, uint32_t reg, int idx);
 
+/** brw_conditional_render.c */
+void brw_init_conditional_render_functions(struct dd_function_table *functions);
+bool brw_check_conditional_render(struct brw_context *brw);
+
 /** intel_batchbuffer.c */
 void brw_load_register_mem(struct brw_context *brw,
                            uint32_t reg,
                            drm_intel_bo *bo,
                            uint32_t read_domains, uint32_t write_domain,
                            uint32_t offset);
+void brw_load_register_mem64(struct brw_context *brw,
+                             uint32_t reg,
+                             drm_intel_bo *bo,
+                             uint32_t read_domains, uint32_t write_domain,
+                             uint32_t offset);
 
 /*======================================================================
  * brw_state_dump.c
@@ -1991,6 +2009,10 @@ void intel_context_destroy(struct brw_context *brw);
 void
 brw_initialize_context_constants(struct brw_context *brw);
 
+bool
+gen9_use_linear_1d_layout(const struct brw_context *brw,
+                          const struct intel_mipmap_tree *mt);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_cs.cpp b/src/mesa/drivers/dri/i965/brw_cs.cpp
index 2432875d0f4..42a082b57b6 100644
--- a/src/mesa/drivers/dri/i965/brw_cs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_cs.cpp
@@ -88,9 +88,15 @@ brw_cs_emit(struct brw_context *brw,
    cfg_t *cfg = NULL;
    const char *fail_msg = NULL;
 
+   int st_index = -1;
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      st_index = brw_get_shader_time_index(brw, prog, &cp->Base, ST_CS);
+
    /* Now the main event: Visit the shader IR and generate our CS IR for it.
     */
-   fs_visitor v8(brw, mem_ctx, key, prog_data, prog, cp, 8);
+   fs_visitor v8(brw->intelScreen->compiler, brw,
+                 mem_ctx, MESA_SHADER_COMPUTE, key, &prog_data->base, prog,
+                 &cp->Base, 8, st_index);
    if (!v8.run_cs()) {
       fail_msg = v8.fail_msg;
    } else if (local_workgroup_size <= 8 * brw->max_cs_threads) {
@@ -98,7 +104,9 @@ brw_cs_emit(struct brw_context *brw,
       prog_data->simd_size = 8;
    }
 
-   fs_visitor v16(brw, mem_ctx, key, prog_data, prog, cp, 16);
+   fs_visitor v16(brw->intelScreen->compiler, brw,
+                  mem_ctx, MESA_SHADER_COMPUTE, key, &prog_data->base, prog,
+                  &cp->Base, 16, st_index);
    if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
        !fail_msg && !v8.simd16_unsupported &&
        local_workgroup_size <= 16 * brw->max_cs_threads) {
@@ -126,7 +134,8 @@ brw_cs_emit(struct brw_context *brw,
       return NULL;
    }
 
-   fs_generator g(brw, mem_ctx, (void*) key, &prog_data->base, &cp->Base,
+   fs_generator g(brw->intelScreen->compiler, brw,
+                  mem_ctx, (void*) key, &prog_data->base, &cp->Base,
                   v8.promoted_constants, v8.runtime_check_aads_emit, "CS");
    if (INTEL_DEBUG & DEBUG_CS) {
       char *name = ralloc_asprintf(mem_ctx, "%s compute shader %d",
@@ -368,9 +377,11 @@ brw_upload_cs_state(struct brw_context *brw)
 
 extern "C"
 const struct brw_tracked_state brw_cs_state = {
-   .dirty = {
-      .mesa  = 0,
-      .brw   = BRW_NEW_CS_PROG_DATA,
+   /* explicit initialisers aren't valid C++, comment
+    * them for documentation purposes */
+   /* .dirty = */{
+      /* .mesa = */ 0,
+      /* .brw = */  BRW_NEW_CS_PROG_DATA,
    },
-   .emit = brw_upload_cs_state
+   /* .emit = */ brw_upload_cs_state
 };
diff --git a/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp b/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
index 03f838dd9ae..61f25811cb2 100644
--- a/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
+++ b/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
@@ -36,11 +36,11 @@
  *   - if/else/endif
  */
 bool
-dead_control_flow_eliminate(backend_visitor *v)
+dead_control_flow_eliminate(backend_shader *s)
 {
    bool progress = false;
 
-   foreach_block_safe (block, v->cfg) {
+   foreach_block_safe (block, s->cfg) {
       bblock_t *if_block = NULL, *else_block = NULL, *endif_block = block;
       bool found = false;
 
@@ -115,7 +115,7 @@ dead_control_flow_eliminate(backend_visitor *v)
    }
 
    if (progress)
-      v->invalidate_live_intervals();
+      s->invalidate_live_intervals();
 
    return progress;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_dead_control_flow.h b/src/mesa/drivers/dri/i965/brw_dead_control_flow.h
index 57a4dabc83c..83fd9b1e79e 100644
--- a/src/mesa/drivers/dri/i965/brw_dead_control_flow.h
+++ b/src/mesa/drivers/dri/i965/brw_dead_control_flow.h
@@ -23,4 +23,4 @@
 
 #include "brw_shader.h"
 
-bool dead_control_flow_eliminate(backend_visitor *v);
+bool dead_control_flow_eliminate(backend_shader *s);
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 3c704ee9d08..c113d52a3d3 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -38,6 +38,7 @@
       fieldval & field ## _MASK;                                        \
    })
 
+#define GET_BITS(data, high, low) ((data & INTEL_MASK((high), (low))) >> (low))
 #define GET_FIELD(word, field) (((word)  & field ## _MASK) >> field ## _SHIFT)
 
 #ifndef BRW_DEFINES_H
@@ -51,6 +52,7 @@
 # define GEN4_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL (0 << 15)
 # define GEN4_3DPRIM_VERTEXBUFFER_ACCESS_RANDOM     (1 << 15)
 # define GEN7_3DPRIM_INDIRECT_PARAMETER_ENABLE      (1 << 10)
+# define GEN7_3DPRIM_PREDICATE_ENABLE               (1 << 8)
 /* DW1 */
 # define GEN7_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL (0 << 8)
 # define GEN7_3DPRIM_VERTEXBUFFER_ACCESS_RANDOM     (1 << 8)
@@ -530,9 +532,11 @@
 #define GEN7_SURFACE_ARYSPC_FULL	(0 << 10)
 #define GEN7_SURFACE_ARYSPC_LOD0	(1 << 10)
 
-/* Surface state DW0 */
+/* Surface state DW1 */
 #define GEN8_SURFACE_MOCS_SHIFT         24
 #define GEN8_SURFACE_MOCS_MASK          INTEL_MASK(30, 24)
+#define GEN8_SURFACE_QPITCH_SHIFT       0
+#define GEN8_SURFACE_QPITCH_MASK        INTEL_MASK(14, 0)
 
 /* Surface state DW2 */
 #define BRW_SURFACE_HEIGHT_SHIFT	19
@@ -590,6 +594,15 @@
 #define GEN7_SURFACE_MOCS_SHIFT                 16
 #define GEN7_SURFACE_MOCS_MASK                  INTEL_MASK(19, 16)
 
+#define GEN9_SURFACE_TRMODE_SHIFT          18
+#define GEN9_SURFACE_TRMODE_MASK           INTEL_MASK(19, 18)
+#define GEN9_SURFACE_TRMODE_NONE           0
+#define GEN9_SURFACE_TRMODE_TILEYF         1
+#define GEN9_SURFACE_TRMODE_TILEYS         2
+
+#define GEN9_SURFACE_MIP_TAIL_START_LOD_SHIFT      8
+#define GEN9_SURFACE_MIP_TAIL_START_LOD_MASK       INTEL_MASK(11, 8)
+
 /* Surface state DW6 */
 #define GEN7_SURFACE_MCS_ENABLE                 (1 << 0)
 #define GEN7_SURFACE_MCS_PITCH_SHIFT            3
@@ -606,6 +619,8 @@
 #define GEN8_SURFACE_AUX_MODE_HIZ               3
 
 /* Surface state DW7 */
+#define GEN9_SURFACE_RT_COMPRESSION_SHIFT       30
+#define GEN9_SURFACE_RT_COMPRESSION_MASK        INTEL_MASK(30, 30)
 #define GEN7_SURFACE_CLEAR_COLOR_SHIFT		28
 #define GEN7_SURFACE_SCS_R_SHIFT                25
 #define GEN7_SURFACE_SCS_R_MASK                 INTEL_MASK(27, 25)
@@ -1131,6 +1146,11 @@ enum opcode {
     * Terminate the compute shader.
     */
    CS_OPCODE_CS_TERMINATE,
+
+   /**
+    * GLSL barrier()
+    */
+   SHADER_OPCODE_BARRIER,
 };
 
 enum brw_urb_write_flags {
@@ -1592,6 +1612,14 @@ enum brw_message_target {
 #define BRW_SCRATCH_SPACE_SIZE_1M     10
 #define BRW_SCRATCH_SPACE_SIZE_2M     11
 
+#define BRW_MESSAGE_GATEWAY_SFID_OPEN_GATEWAY         0
+#define BRW_MESSAGE_GATEWAY_SFID_CLOSE_GATEWAY        1
+#define BRW_MESSAGE_GATEWAY_SFID_FORWARD_MSG          2
+#define BRW_MESSAGE_GATEWAY_SFID_GET_TIMESTAMP        3
+#define BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG          4
+#define BRW_MESSAGE_GATEWAY_SFID_UPDATE_GATEWAY_STATE 5
+#define BRW_MESSAGE_GATEWAY_SFID_MMIO_READ_WRITE      6
+
 
 #define CMD_URB_FENCE                 0x6000
 #define CMD_CS_URB_STATE              0x6001
@@ -1769,9 +1797,8 @@ enum brw_message_target {
 # define GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID		1
 # define GEN7_GS_CONTROL_DATA_HEADER_SIZE_SHIFT		20
 # define GEN7_GS_INSTANCE_CONTROL_SHIFT			15
-# define GEN7_GS_DISPATCH_MODE_SINGLE			(0 << 11)
-# define GEN7_GS_DISPATCH_MODE_DUAL_INSTANCE		(1 << 11)
-# define GEN7_GS_DISPATCH_MODE_DUAL_OBJECT		(2 << 11)
+# define GEN7_GS_DISPATCH_MODE_SHIFT                    11
+# define GEN7_GS_DISPATCH_MODE_MASK                     INTEL_MASK(12, 11)
 # define GEN6_GS_STATISTICS_ENABLE			(1 << 10)
 # define GEN6_GS_SO_STATISTICS_ENABLE			(1 << 9)
 # define GEN6_GS_RENDERING_ENABLE			(1 << 8)
@@ -2470,8 +2497,8 @@ enum brw_wm_barycentric_interp_mode {
  * cache settings.  We still use only either write-back or write-through; and
  * rely on the documented default values.
  */
-#define SKL_MOCS_WB 9
-#define SKL_MOCS_WT 5
+#define SKL_MOCS_WB (0b001001 << 1)
+#define SKL_MOCS_WT (0b000101 << 1)
 
 #define MEDIA_VFE_STATE                         0x7000
 /* GEN7 DW2, GEN8+ DW3 */
diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c
index 95e262a361b..1075c5acba5 100644
--- a/src/mesa/drivers/dri/i965/brw_disasm.c
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -402,6 +402,16 @@ static const char *const gen6_sfid[16] = {
    [HSW_SFID_CRE]                      = "cre",
 };
 
+static const char *const gen7_gateway_subfuncid[8] = {
+   [BRW_MESSAGE_GATEWAY_SFID_OPEN_GATEWAY] = "open",
+   [BRW_MESSAGE_GATEWAY_SFID_CLOSE_GATEWAY] = "close",
+   [BRW_MESSAGE_GATEWAY_SFID_FORWARD_MSG] = "forward msg",
+   [BRW_MESSAGE_GATEWAY_SFID_GET_TIMESTAMP] = "get timestamp",
+   [BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG] = "barrier msg",
+   [BRW_MESSAGE_GATEWAY_SFID_UPDATE_GATEWAY_STATE] = "update state",
+   [BRW_MESSAGE_GATEWAY_SFID_MMIO_READ_WRITE] = "mmio read/write",
+};
+
 static const char *const dp_write_port_msg_type[8] = {
    [0b000] = "OWord block write",
    [0b001] = "OWord dual block write",
@@ -977,13 +987,14 @@ src0_3src(FILE *file, const struct brw_device_info *devinfo, brw_inst *inst)
               brw_inst_3src_src0_reg_nr(devinfo, inst));
    if (err == -1)
       return 0;
-   if (src0_subreg_nr)
+   if (src0_subreg_nr || brw_inst_3src_src0_rep_ctrl(devinfo, inst))
       format(file, ".%d", src0_subreg_nr);
    if (brw_inst_3src_src0_rep_ctrl(devinfo, inst))
       string(file, "<0,1,0>");
-   else
+   else {
       string(file, "<4,4,1>");
-   err |= src_swizzle(file, brw_inst_3src_src0_swizzle(devinfo, inst));
+      err |= src_swizzle(file, brw_inst_3src_src0_swizzle(devinfo, inst));
+   }
    err |= control(file, "src da16 reg type", three_source_reg_encoding,
                   brw_inst_3src_src_type(devinfo, inst), NULL);
    return err;
@@ -1003,13 +1014,14 @@ src1_3src(FILE *file, const struct brw_device_info *devinfo, brw_inst *inst)
               brw_inst_3src_src1_reg_nr(devinfo, inst));
    if (err == -1)
       return 0;
-   if (src1_subreg_nr)
+   if (src1_subreg_nr || brw_inst_3src_src1_rep_ctrl(devinfo, inst))
       format(file, ".%d", src1_subreg_nr);
    if (brw_inst_3src_src1_rep_ctrl(devinfo, inst))
       string(file, "<0,1,0>");
-   else
+   else {
       string(file, "<4,4,1>");
-   err |= src_swizzle(file, brw_inst_3src_src1_swizzle(devinfo, inst));
+      err |= src_swizzle(file, brw_inst_3src_src1_swizzle(devinfo, inst));
+   }
    err |= control(file, "src da16 reg type", three_source_reg_encoding,
                   brw_inst_3src_src_type(devinfo, inst), NULL);
    return err;
@@ -1030,13 +1042,14 @@ src2_3src(FILE *file, const struct brw_device_info *devinfo, brw_inst *inst)
               brw_inst_3src_src2_reg_nr(devinfo, inst));
    if (err == -1)
       return 0;
-   if (src2_subreg_nr)
+   if (src2_subreg_nr || brw_inst_3src_src2_rep_ctrl(devinfo, inst))
       format(file, ".%d", src2_subreg_nr);
    if (brw_inst_3src_src2_rep_ctrl(devinfo, inst))
       string(file, "<0,1,0>");
-   else
+   else {
       string(file, "<4,4,1>");
-   err |= src_swizzle(file, brw_inst_3src_src2_swizzle(devinfo, inst));
+      err |= src_swizzle(file, brw_inst_3src_src2_swizzle(devinfo, inst));
+   }
    err |= control(file, "src da16 reg type", three_source_reg_encoding,
                   brw_inst_3src_src_type(devinfo, inst), NULL);
    return err;
@@ -1495,6 +1508,12 @@ brw_disassemble_inst(FILE *file, const struct brw_device_info *devinfo,
             break;
          case BRW_SFID_THREAD_SPAWNER:
             break;
+
+         case BRW_SFID_MESSAGE_GATEWAY:
+            format(file, " (%s)",
+                   gen7_gateway_subfuncid[brw_inst_gateway_subfuncid(devinfo, inst)]);
+            break;
+
          case GEN7_SFID_DATAPORT_DATA_CACHE:
             if (devinfo->gen >= 7) {
                format(file, " (");
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 96e23697923..b91597a9f5d 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -92,8 +92,10 @@ get_hw_prim_for_gl_prim(int mode)
 {
    if (mode >= BRW_PRIM_OFFSET)
       return mode - BRW_PRIM_OFFSET;
-   else
+   else {
+      assert(mode < ARRAY_SIZE(prim_to_hw_prim));
       return prim_to_hw_prim[mode];
+   }
 }
 
 
@@ -178,6 +180,7 @@ static void brw_emit_prim(struct brw_context *brw,
    int verts_per_instance;
    int vertex_access_type;
    int indirect_flag;
+   int predicate_enable;
 
    DBG("PRIM: %s %d %d\n", _mesa_lookup_enum_by_nr(prim->mode),
        prim->start, prim->count);
@@ -258,10 +261,14 @@ static void brw_emit_prim(struct brw_context *brw,
       indirect_flag = 0;
    }
 
-
    if (brw->gen >= 7) {
+      if (brw->predicate.state == BRW_PREDICATE_STATE_USE_BIT)
+         predicate_enable = GEN7_3DPRIM_PREDICATE_ENABLE;
+      else
+         predicate_enable = 0;
+
       BEGIN_BATCH(7);
-      OUT_BATCH(CMD_3D_PRIM << 16 | (7 - 2) | indirect_flag);
+      OUT_BATCH(CMD_3D_PRIM << 16 | (7 - 2) | indirect_flag | predicate_enable);
       OUT_BATCH(hw_prim | vertex_access_type);
    } else {
       BEGIN_BATCH(6);
@@ -561,12 +568,7 @@ void brw_draw_prims( struct gl_context *ctx,
 
    assert(unused_tfb_object == NULL);
 
-   if (ctx->Query.CondRenderQuery) {
-      perf_debug("Conditional rendering is implemented in software and may "
-                 "stall.  This should be fixed in the driver.\n");
-   }
-
-   if (!_mesa_check_conditional_render(ctx))
+   if (!brw_check_conditional_render(brw))
       return;
 
    /* Handle primitive restart if needed */
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 0e7be1e1ea0..761aa0ec5fa 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -361,6 +361,8 @@ brw_jump_scale(const struct brw_device_info *devinfo)
    return 1;
 }
 
+void brw_barrier(struct brw_codegen *p, struct brw_reg src);
+
 /* If/else/endif.  Works by manipulating the execution flags on each
  * channel.
  */
@@ -390,6 +392,8 @@ brw_inst *brw_JMPI(struct brw_codegen *p, struct brw_reg index,
 
 void brw_NOP(struct brw_codegen *p);
 
+void brw_WAIT(struct brw_codegen *p);
+
 /* Special case: there is never a destination, execution size will be
  * taken from src0:
  */
diff --git a/src/mesa/drivers/dri/i965/brw_eu_compact.c b/src/mesa/drivers/dri/i965/brw_eu_compact.c
index 69cb114b945..67f0b45ac04 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_compact.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_compact.c
@@ -849,6 +849,12 @@ set_3src_source_index(const struct brw_device_info *devinfo,
 static bool
 has_unmapped_bits(const struct brw_device_info *devinfo, brw_inst *src)
 {
+   /* EOT can only be mapped on a send if the src1 is an immediate */
+   if ((brw_inst_opcode(devinfo, src) == BRW_OPCODE_SENDC ||
+        brw_inst_opcode(devinfo, src) == BRW_OPCODE_SEND) &&
+       brw_inst_eot(devinfo, src))
+      return true;
+
    /* Check for instruction bits that don't map to any of the fields of the
     * compacted instruction.  The instruction cannot be compacted if any of
     * them are set.  They overlap with:
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index e78d0bec268..0f536046f6f 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -914,6 +914,8 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
          brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
          brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
          break;
+      default:
+         unreachable("not reached");
       }
    }
 
@@ -3404,3 +3406,54 @@ void brw_shader_time_add(struct brw_codegen *p,
 
    brw_pop_insn_state(p);
 }
+
+
+/**
+ * Emit the SEND message for a barrier
+ */
+void
+brw_barrier(struct brw_codegen *p, struct brw_reg src)
+{
+   const struct brw_device_info *devinfo = p->devinfo;
+   struct brw_inst *inst;
+
+   assert(devinfo->gen >= 7);
+
+   inst = next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, inst, brw_null_reg());
+   brw_set_src0(p, inst, src);
+   brw_set_src1(p, inst, brw_null_reg());
+
+   brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
+                              1 /* msg_length */,
+                              0 /* response_length */,
+                              false /* header_present */,
+                              false /* end_of_thread */);
+
+   brw_inst_set_gateway_notify(devinfo, inst, 1);
+   brw_inst_set_gateway_subfuncid(devinfo, inst,
+                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
+
+   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
+}
+
+
+/**
+ * Emit the wait instruction for a barrier
+ */
+void
+brw_WAIT(struct brw_codegen *p)
+{
+   const struct brw_device_info *devinfo = p->devinfo;
+   struct brw_inst *insn;
+
+   struct brw_reg src = brw_notification_reg();
+
+   insn = next_insn(p, BRW_OPCODE_WAIT);
+   brw_set_dest(p, insn, src);
+   brw_set_src0(p, insn, src);
+   brw_set_src1(p, insn, brw_null_reg());
+
+   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
+   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
+}
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 5ce1dfc6633..2c0ff961182 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -49,6 +49,8 @@
 #include "glsl/glsl_types.h"
 #include "program/sampler.h"
 
+using namespace brw;
+
 void
 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
               const fs_reg *src, unsigned sources)
@@ -212,152 +214,13 @@ fs_inst::resize_sources(uint8_t num_sources)
    }
 }
 
-#define ALU1(op)                                                        \
-   fs_inst *                                                            \
-   fs_visitor::op(const fs_reg &dst, const fs_reg &src0)                \
-   {                                                                    \
-      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
-   }
-
-#define ALU2(op)                                                        \
-   fs_inst *                                                            \
-   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
-                  const fs_reg &src1)                                   \
-   {                                                                    \
-      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
-   }
-
-#define ALU2_ACC(op)                                                    \
-   fs_inst *                                                            \
-   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
-                  const fs_reg &src1)                                   \
-   {                                                                    \
-      fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
-      inst->writes_accumulator = true;                                  \
-      return inst;                                                      \
-   }
-
-#define ALU3(op)                                                        \
-   fs_inst *                                                            \
-   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
-                  const fs_reg &src1, const fs_reg &src2)               \
-   {                                                                    \
-      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
-   }
-
-ALU1(NOT)
-ALU1(MOV)
-ALU1(FRC)
-ALU1(RNDD)
-ALU1(RNDE)
-ALU1(RNDZ)
-ALU2(ADD)
-ALU2(MUL)
-ALU2_ACC(MACH)
-ALU2(AND)
-ALU2(OR)
-ALU2(XOR)
-ALU2(SHL)
-ALU2(SHR)
-ALU2(ASR)
-ALU3(LRP)
-ALU1(BFREV)
-ALU3(BFE)
-ALU2(BFI1)
-ALU3(BFI2)
-ALU1(FBH)
-ALU1(FBL)
-ALU1(CBIT)
-ALU3(MAD)
-ALU2_ACC(ADDC)
-ALU2_ACC(SUBB)
-ALU2(SEL)
-ALU2(MAC)
-
-/** Gen4 predicated IF. */
-fs_inst *
-fs_visitor::IF(enum brw_predicate predicate)
-{
-   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
-   inst->predicate = predicate;
-   return inst;
-}
-
-/** Gen6 IF with embedded comparison. */
-fs_inst *
-fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
-               enum brw_conditional_mod condition)
-{
-   assert(devinfo->gen == 6);
-   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
-                                        reg_null_d, src0, src1);
-   inst->conditional_mod = condition;
-   return inst;
-}
-
-/**
- * CMP: Sets the low bit of the destination channels with the result
- * of the comparison, while the upper bits are undefined, and updates
- * the flag register with the packed 16 bits of the result.
- */
-fs_inst *
-fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
-                enum brw_conditional_mod condition)
-{
-   fs_inst *inst;
-
-   /* Take the instruction:
-    *
-    * CMP null<d> src0<f> src1<f>
-    *
-    * Original gen4 does type conversion to the destination type before
-    * comparison, producing garbage results for floating point comparisons.
-    *
-    * The destination type doesn't matter on newer generations, so we set the
-    * type to match src0 so we can compact the instruction.
-    */
-   dst.type = src0.type;
-   if (dst.file == HW_REG)
-      dst.fixed_hw_reg.type = dst.type;
-
-   resolve_ud_negate(&src0);
-   resolve_ud_negate(&src1);
-
-   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
-   inst->conditional_mod = condition;
-
-   return inst;
-}
-
-fs_inst *
-fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
-                         int header_size)
-{
-   assert(dst.width % 8 == 0);
-   fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
-                                        dst, src, sources);
-   inst->header_size = header_size;
-
-   for (int i = 0; i < header_size; i++)
-      assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
-   inst->regs_written = header_size;
-
-   for (int i = header_size; i < sources; ++i)
-      assert(src[i].file != GRF || src[i].width == dst.width);
-   inst->regs_written += (sources - header_size) * (dst.width / 8);
-
-   return inst;
-}
-
-exec_list
-fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
+void
+fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
+                                       const fs_reg &dst,
                                        const fs_reg &surf_index,
                                        const fs_reg &varying_offset,
                                        uint32_t const_offset)
 {
-   exec_list instructions;
-   fs_inst *inst;
-
    /* We have our constant surface use a pitch of 4 bytes, so our index can
     * be any component of a vector, and then we load 4 contiguous
     * components starting from that.
@@ -370,8 +233,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
     * the redundant ones.
     */
    fs_reg vec4_offset = vgrf(glsl_type::int_type);
-   instructions.push_tail(ADD(vec4_offset,
-                              varying_offset, fs_reg(const_offset & ~3)));
+   bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
 
    int scale = 1;
    if (devinfo->gen == 4 && dst.width == 8) {
@@ -393,9 +255,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
    int regs_written = 4 * (dst.width / 8) * scale;
    fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
                                dst.type, dst.width);
-   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
+   fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
    inst->regs_written = regs_written;
-   instructions.push_tail(inst);
 
    if (devinfo->gen < 7) {
       inst->base_mrf = 13;
@@ -406,30 +267,23 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
          inst->mlen = 1 + dispatch_width / 8;
    }
 
-   fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
-   instructions.push_tail(MOV(dst, result));
-
-   return instructions;
+   bld.MOV(dst, offset(vec4_result, (const_offset & 3) * scale));
 }
 
 /**
  * A helper for MOV generation for fixing up broken hardware SEND dependency
  * handling.
  */
-fs_inst *
-fs_visitor::DEP_RESOLVE_MOV(int grf)
+void
+fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
 {
-   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
-
-   inst->ir = NULL;
-   inst->annotation = "send dependency resolve";
-
    /* The caller always wants uncompressed to emit the minimal extra
     * dependencies, and to avoid having to deal with aligning its regs to 2.
     */
-   inst->exec_size = 8;
+   const fs_builder ubld = bld.annotate("send dependency resolve")
+                              .half(0);
 
-   return inst;
+   ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 }
 
 bool
@@ -685,7 +539,7 @@ fs_visitor::type_size(const struct glsl_type *type)
  * the destination of the MOV, with extra parameters set.
  */
 fs_reg
-fs_visitor::get_timestamp(fs_inst **out_mov)
+fs_visitor::get_timestamp(const fs_builder &bld)
 {
    assert(devinfo->gen >= 7);
 
@@ -696,11 +550,10 @@ fs_visitor::get_timestamp(fs_inst **out_mov)
 
    fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
 
-   fs_inst *mov = MOV(dst, ts);
    /* We want to read the 3 fields we care about even if it's not enabled in
     * the dispatch.
     */
-   mov->force_writemask_all = true;
+   bld.exec_all().MOV(dst, ts);
 
    /* The caller wants the low 32 bits of the timestamp.  Since it's running
     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
@@ -714,105 +567,60 @@ fs_visitor::get_timestamp(fs_inst **out_mov)
     */
    dst.set_smear(0);
 
-   *out_mov = mov;
    return dst;
 }
 
 void
 fs_visitor::emit_shader_time_begin()
 {
-   current_annotation = "shader time start";
-   fs_inst *mov;
-   shader_start_time = get_timestamp(&mov);
-   emit(mov);
+   shader_start_time = get_timestamp(bld.annotate("shader time start"));
 }
 
 void
 fs_visitor::emit_shader_time_end()
 {
-   current_annotation = "shader time end";
-
-   enum shader_time_shader_type type, written_type, reset_type;
-   switch (stage) {
-   case MESA_SHADER_VERTEX:
-      type = ST_VS;
-      written_type = ST_VS_WRITTEN;
-      reset_type = ST_VS_RESET;
-      break;
-   case MESA_SHADER_GEOMETRY:
-      type = ST_GS;
-      written_type = ST_GS_WRITTEN;
-      reset_type = ST_GS_RESET;
-      break;
-   case MESA_SHADER_FRAGMENT:
-      if (dispatch_width == 8) {
-         type = ST_FS8;
-         written_type = ST_FS8_WRITTEN;
-         reset_type = ST_FS8_RESET;
-      } else {
-         assert(dispatch_width == 16);
-         type = ST_FS16;
-         written_type = ST_FS16_WRITTEN;
-         reset_type = ST_FS16_RESET;
-      }
-      break;
-   case MESA_SHADER_COMPUTE:
-      type = ST_CS;
-      written_type = ST_CS_WRITTEN;
-      reset_type = ST_CS_RESET;
-      break;
-   default:
-      unreachable("fs_visitor::emit_shader_time_end missing code");
-   }
-
    /* Insert our code just before the final SEND with EOT. */
    exec_node *end = this->instructions.get_tail();
    assert(end && ((fs_inst *) end)->eot);
+   const fs_builder ibld = bld.annotate("shader time end")
+                              .exec_all().at(NULL, end);
 
-   fs_inst *tm_read;
-   fs_reg shader_end_time = get_timestamp(&tm_read);
-   end->insert_before(tm_read);
+   fs_reg shader_end_time = get_timestamp(ibld);
 
    /* Check that there weren't any timestamp reset events (assuming these
     * were the only two timestamp reads that happened).
     */
    fs_reg reset = shader_end_time;
    reset.set_smear(2);
-   fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
-   test->conditional_mod = BRW_CONDITIONAL_Z;
-   test->force_writemask_all = true;
-   end->insert_before(test);
-   end->insert_before(IF(BRW_PREDICATE_NORMAL));
+   set_condmod(BRW_CONDITIONAL_Z,
+               ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
+   ibld.IF(BRW_PREDICATE_NORMAL);
 
    fs_reg start = shader_start_time;
    start.negate = true;
    fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
    diff.set_smear(0);
-   fs_inst *add = ADD(diff, start, shader_end_time);
-   add->force_writemask_all = true;
-   end->insert_before(add);
+   ibld.ADD(diff, start, shader_end_time);
 
    /* If there were no instructions between the two timestamp gets, the diff
     * is 2 cycles.  Remove that overhead, so I can forget about that when
     * trying to determine the time taken for single instructions.
     */
-   add = ADD(diff, diff, fs_reg(-2u));
-   add->force_writemask_all = true;
-   end->insert_before(add);
-
-   end->insert_before(SHADER_TIME_ADD(type, diff));
-   end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
-   end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
-   end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
-   end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
+   ibld.ADD(diff, diff, fs_reg(-2u));
+   SHADER_TIME_ADD(ibld, 0, diff);
+   SHADER_TIME_ADD(ibld, 1, fs_reg(1u));
+   ibld.emit(BRW_OPCODE_ELSE);
+   SHADER_TIME_ADD(ibld, 2, fs_reg(1u));
+   ibld.emit(BRW_OPCODE_ENDIF);
 }
 
-fs_inst *
-fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
+void
+fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
+                            int shader_time_subindex,
+                            fs_reg value)
 {
-   int shader_time_index =
-      brw_get_shader_time_index(brw, shader_prog, prog, type);
-   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
+   int index = shader_time_index * 3 + shader_time_subindex;
+   fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);
 
    fs_reg payload;
    if (dispatch_width == 8)
@@ -820,8 +628,7 @@ fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
    else
       payload = vgrf(glsl_type::uint_type);
 
-   return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
-                               fs_reg(), payload, offset, value);
+   bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
 }
 
 void
@@ -864,65 +671,16 @@ fs_visitor::fail(const char *format, ...)
  * During a SIMD16 compile (if one happens anyway), this just calls fail().
  */
 void
-fs_visitor::no16(const char *format, ...)
+fs_visitor::no16(const char *msg)
 {
-   va_list va;
-
-   va_start(va, format);
-
    if (dispatch_width == 16) {
-      vfail(format, va);
+      fail("%s", msg);
    } else {
       simd16_unsupported = true;
 
-      if (brw->perf_debug) {
-         if (no16_msg)
-            ralloc_vasprintf_append(&no16_msg, format, va);
-         else
-            no16_msg = ralloc_vasprintf(mem_ctx, format, va);
-      }
+      compiler->shader_perf_log(log_data,
+                                "SIMD16 shader failed to compile: %s", msg);
    }
-
-   va_end(va);
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dst));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
-                 const fs_reg &src1)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
-                 const fs_reg &src1, const fs_reg &src2)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
-                 fs_reg src[], int sources)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
 }
 
 /**
@@ -1051,7 +809,7 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
       return inst->mlen;
    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
-      return 2;
+      return inst->mlen;
    case SHADER_OPCODE_UNTYPED_ATOMIC:
    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
@@ -1077,14 +835,6 @@ fs_visitor::vgrf(const glsl_type *const type)
                  brw_type_for_base_type(type), dispatch_width);
 }
 
-fs_reg
-fs_visitor::vgrf(int num_components)
-{
-   int reg_width = dispatch_width / 8;
-   return fs_reg(GRF, alloc.allocate(num_components * reg_width),
-                 BRW_REGISTER_TYPE_F, dispatch_width);
-}
-
 /** Fixed HW reg constructor. */
 fs_reg::fs_reg(enum register_file file, int reg)
 {
@@ -1130,117 +880,18 @@ fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
    this->width = width;
 }
 
-fs_reg *
-fs_visitor::variable_storage(ir_variable *var)
-{
-   return (fs_reg *)hash_table_find(this->variable_ht, var);
-}
-
-void
-import_uniforms_callback(const void *key,
-			 void *data,
-			 void *closure)
-{
-   struct hash_table *dst_ht = (struct hash_table *)closure;
-   const fs_reg *reg = (const fs_reg *)data;
-
-   if (reg->file != UNIFORM)
-      return;
-
-   hash_table_insert(dst_ht, data, key);
-}
-
 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
  * This brings in those uniform definitions
  */
 void
 fs_visitor::import_uniforms(fs_visitor *v)
 {
-   hash_table_call_foreach(v->variable_ht,
-			   import_uniforms_callback,
-			   variable_ht);
    this->push_constant_loc = v->push_constant_loc;
    this->pull_constant_loc = v->pull_constant_loc;
    this->uniforms = v->uniforms;
    this->param_size = v->param_size;
 }
 
-/* Our support for uniforms is piggy-backed on the struct
- * gl_fragment_program, because that's where the values actually
- * get stored, rather than in some global gl_shader_program uniform
- * store.
- */
-void
-fs_visitor::setup_uniform_values(ir_variable *ir)
-{
-   int namelen = strlen(ir->name);
-
-   /* The data for our (non-builtin) uniforms is stored in a series of
-    * gl_uniform_driver_storage structs for each subcomponent that
-    * glGetUniformLocation() could name.  We know it's been set up in the same
-    * order we'd walk the type, so walk the list of storage and find anything
-    * with our name, or the prefix of a component that starts with our name.
-    */
-   unsigned params_before = uniforms;
-   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
-      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
-
-      if (strncmp(ir->name, storage->name, namelen) != 0 ||
-          (storage->name[namelen] != 0 &&
-           storage->name[namelen] != '.' &&
-           storage->name[namelen] != '[')) {
-         continue;
-      }
-
-      unsigned slots = storage->type->component_slots();
-      if (storage->array_elements)
-         slots *= storage->array_elements;
-
-      for (unsigned i = 0; i < slots; i++) {
-         stage_prog_data->param[uniforms++] = &storage->storage[i];
-      }
-   }
-
-   /* Make sure we actually initialized the right amount of stuff here. */
-   assert(params_before + ir->type->component_slots() == uniforms);
-   (void)params_before;
-}
-
-
-/* Our support for builtin uniforms is even scarier than non-builtin.
- * It sits on top of the PROG_STATE_VAR parameters that are
- * automatically updated from GL context state.
- */
-void
-fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
-{
-   const ir_state_slot *const slots = ir->get_state_slots();
-   assert(slots != NULL);
-
-   for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
-      /* This state reference has already been setup by ir_to_mesa, but we'll
-       * get the same index back here.
-       */
-      int index = _mesa_add_state_reference(this->prog->Parameters,
-					    (gl_state_index *)slots[i].tokens);
-
-      /* Add each of the unique swizzles of the element as a parameter.
-       * This'll end up matching the expected layout of the
-       * array/matrix/structure we're trying to fill in.
-       */
-      int last_swiz = -1;
-      for (unsigned int j = 0; j < 4; j++) {
-	 int swiz = GET_SWZ(slots[i].swizzle, j);
-	 if (swiz == last_swiz)
-	    break;
-	 last_swiz = swiz;
-
-         stage_prog_data->param[uniforms++] =
-            &prog->Parameters->ParameterValues[index][swiz];
-      }
-   }
-}
-
 fs_reg *
 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
                                          bool origin_upper_left)
@@ -1253,15 +904,15 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
 
    /* gl_FragCoord.x */
    if (pixel_center_integer) {
-      emit(MOV(wpos, this->pixel_x));
+      bld.MOV(wpos, this->pixel_x);
    } else {
-      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
+      bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
    }
    wpos = offset(wpos, 1);
 
    /* gl_FragCoord.y */
    if (!flip && pixel_center_integer) {
-      emit(MOV(wpos, this->pixel_y));
+      bld.MOV(wpos, this->pixel_y);
    } else {
       fs_reg pixel_y = this->pixel_y;
       float offset = (pixel_center_integer ? 0.0 : 0.5);
@@ -1271,22 +922,22 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
 	 offset += key->drawable_height - 1.0;
       }
 
-      emit(ADD(wpos, pixel_y, fs_reg(offset)));
+      bld.ADD(wpos, pixel_y, fs_reg(offset));
    }
    wpos = offset(wpos, 1);
 
    /* gl_FragCoord.z */
    if (devinfo->gen >= 6) {
-      emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
+      bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
    } else {
-      emit(FS_OPCODE_LINTERP, wpos,
+      bld.emit(FS_OPCODE_LINTERP, wpos,
            this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
            interp_reg(VARYING_SLOT_POS, 2));
    }
    wpos = offset(wpos, 1);
 
    /* gl_FragCoord.w: Already set up in emit_interpolation */
-   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
+   bld.MOV(wpos, this->wpos_w);
 
    return reg;
 }
@@ -1321,8 +972,8 @@ fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
        */
       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
    }
-   return emit(FS_OPCODE_LINTERP, attr,
-               this->delta_xy[barycoord_mode], interp);
+   return bld.emit(FS_OPCODE_LINTERP, attr,
+                   this->delta_xy[barycoord_mode], interp);
 }
 
 void
@@ -1380,7 +1031,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
 	       struct brw_reg interp = interp_reg(location, k);
 	       interp = suboffset(interp, 3);
                interp.type = attr.type;
-	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
+               bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
 	       attr = offset(attr, 1);
 	    }
 	 } else {
@@ -1393,7 +1044,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
                    * unlit, replace the centroid data with non-centroid
                    * data.
                    */
-                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+                  bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
 
                   fs_inst *inst;
                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
@@ -1417,7 +1068,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
                                mod_sample || key->persample_shading);
                }
                if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
-                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
+                  bld.MUL(attr, attr, this->pixel_w);
                }
 	       attr = offset(attr, 1);
 	    }
@@ -1448,7 +1099,7 @@ fs_visitor::emit_frontfacing_interpolation()
       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
       g0.negate = true;
 
-      emit(ASR(*reg, g0, fs_reg(15)));
+      bld.ASR(*reg, g0, fs_reg(15));
    } else {
       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
        * a boolean result from this (1/true or 0/false).
@@ -1463,7 +1114,7 @@ fs_visitor::emit_frontfacing_interpolation()
       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
       g1_6.negate = true;
 
-      emit(ASR(*reg, g1_6, fs_reg(31)));
+      bld.ASR(*reg, g1_6, fs_reg(31));
    }
 
    return reg;
@@ -1478,9 +1129,9 @@ fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
 
    if (key->compute_pos_offset) {
       /* Convert int_sample_pos to floating point */
-      emit(MOV(dst, int_sample_pos));
+      bld.MOV(dst, int_sample_pos);
       /* Scale to the range [0, 1] */
-      emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
+      bld.MUL(dst, dst, fs_reg(1 / 16.0f));
    }
    else {
       /* From ARB_sample_shading specification:
@@ -1488,7 +1139,7 @@ fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
        *  rasterization is disabled, gl_SamplePosition will always be
        *  (0.5, 0.5).
        */
-      emit(MOV(dst, fs_reg(0.5f)));
+      bld.MOV(dst, fs_reg(0.5f));
    }
 }
 
@@ -1497,7 +1148,7 @@ fs_visitor::emit_samplepos_setup()
 {
    assert(devinfo->gen >= 6);
 
-   this->current_annotation = "compute sample position";
+   const fs_builder abld = bld.annotate("compute sample position");
    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
    fs_reg pos = *reg;
    fs_reg int_sample_x = vgrf(glsl_type::int_type);
@@ -1519,22 +1170,22 @@ fs_visitor::emit_samplepos_setup()
                     BRW_REGISTER_TYPE_B), 16, 8, 2);
 
    if (dispatch_width == 8) {
-      emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
+      abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
    } else {
-      emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
-      emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
-         ->force_sechalf = true;
+      abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
+      abld.half(1).MOV(half(int_sample_x, 1),
+                       fs_reg(suboffset(sample_pos_reg, 16)));
    }
    /* Compute gl_SamplePosition.x */
    compute_sample_position(pos, int_sample_x);
    pos = offset(pos, 1);
    if (dispatch_width == 8) {
-      emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
+      abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
    } else {
-      emit(MOV(half(int_sample_y, 0),
-               fs_reg(suboffset(sample_pos_reg, 1))));
-      emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
-         ->force_sechalf = true;
+      abld.half(0).MOV(half(int_sample_y, 0),
+                       fs_reg(suboffset(sample_pos_reg, 1)));
+      abld.half(1).MOV(half(int_sample_y, 1),
+                       fs_reg(suboffset(sample_pos_reg, 17)));
    }
    /* Compute gl_SamplePosition.y */
    compute_sample_position(pos, int_sample_y);
@@ -1548,7 +1199,7 @@ fs_visitor::emit_sampleid_setup()
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
    assert(devinfo->gen >= 6);
 
-   this->current_annotation = "compute sample id";
+   const fs_builder abld = bld.annotate("compute sample id");
    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
 
    if (key->compute_sample_id) {
@@ -1575,26 +1226,25 @@ fs_visitor::emit_sampleid_setup()
        * are sample 1 of subspan 0; the third group is sample 0 of
        * subspan 1, and finally sample 1 of subspan 1.
        */
-      fs_inst *inst;
-      inst = emit(BRW_OPCODE_AND, t1,
-                  fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
-                  fs_reg(0xc0));
-      inst->force_writemask_all = true;
-      inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
-      inst->force_writemask_all = true;
+      abld.exec_all()
+          .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+               fs_reg(0xc0));
+      abld.exec_all().SHR(t1, t1, fs_reg(5));
+
       /* This works for both SIMD8 and SIMD16 */
-      inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
-      inst->force_writemask_all = true;
+      abld.exec_all()
+          .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
+
       /* This special instruction takes care of setting vstride=1,
        * width=4, hstride=0 of t2 during an ADD instruction.
        */
-      emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
+      abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
    } else {
       /* As per GL_ARB_sample_shading specification:
        * "When rendering to a non-multisample buffer, or if multisample
        *  rasterization is disabled, gl_SampleID will always be zero."
        */
-      emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
+      abld.MOV(*reg, fs_reg(0));
    }
 
    return reg;
@@ -1606,111 +1256,11 @@ fs_visitor::resolve_source_modifiers(fs_reg *src)
    if (!src->abs && !src->negate)
       return;
 
-   fs_reg temp = retype(vgrf(1), src->type);
-   emit(MOV(temp, *src));
+   fs_reg temp = bld.vgrf(src->type);
+   bld.MOV(temp, *src);
    *src = temp;
 }
 
-fs_reg
-fs_visitor::fix_math_operand(fs_reg src)
-{
-   /* Can't do hstride == 0 args on gen6 math, so expand it out. We
-    * might be able to do better by doing execsize = 1 math and then
-    * expanding that result out, but we would need to be careful with
-    * masking.
-    *
-    * The hardware ignores source modifiers (negate and abs) on math
-    * instructions, so we also move to a temp to set those up.
-    */
-   if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
-       !src.abs && !src.negate)
-      return src;
-
-   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
-    * operands to math
-    */
-   if (devinfo->gen >= 7 && src.file != IMM)
-      return src;
-
-   fs_reg expanded = vgrf(glsl_type::float_type);
-   expanded.type = src.type;
-   emit(BRW_OPCODE_MOV, expanded, src);
-   return expanded;
-}
-
-fs_inst *
-fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
-{
-   switch (opcode) {
-   case SHADER_OPCODE_RCP:
-   case SHADER_OPCODE_RSQ:
-   case SHADER_OPCODE_SQRT:
-   case SHADER_OPCODE_EXP2:
-   case SHADER_OPCODE_LOG2:
-   case SHADER_OPCODE_SIN:
-   case SHADER_OPCODE_COS:
-      break;
-   default:
-      unreachable("not reached: bad math opcode");
-   }
-
-   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
-    * might be able to do better by doing execsize = 1 math and then
-    * expanding that result out, but we would need to be careful with
-    * masking.
-    *
-    * Gen 6 hardware ignores source modifiers (negate and abs) on math
-    * instructions, so we also move to a temp to set those up.
-    */
-   if (devinfo->gen == 6 || devinfo->gen == 7)
-      src = fix_math_operand(src);
-
-   fs_inst *inst = emit(opcode, dst, src);
-
-   if (devinfo->gen < 6) {
-      inst->base_mrf = 2;
-      inst->mlen = dispatch_width / 8;
-   }
-
-   return inst;
-}
-
-fs_inst *
-fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
-{
-   int base_mrf = 2;
-   fs_inst *inst;
-
-   if (devinfo->gen >= 8) {
-      inst = emit(opcode, dst, src0, src1);
-   } else if (devinfo->gen >= 6) {
-      src0 = fix_math_operand(src0);
-      src1 = fix_math_operand(src1);
-
-      inst = emit(opcode, dst, src0, src1);
-   } else {
-      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
-       * "Message Payload":
-       *
-       * "Operand0[7].  For the INT DIV functions, this operand is the
-       *  denominator."
-       *  ...
-       * "Operand1[7].  For the INT DIV functions, this operand is the
-       *  numerator."
-       */
-      bool is_int_div = opcode != SHADER_OPCODE_POW;
-      fs_reg &op0 = is_int_div ? src1 : src0;
-      fs_reg &op1 = is_int_div ? src0 : src1;
-
-      emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
-      inst = emit(opcode, dst, op0, reg_null_f);
-
-      inst->base_mrf = base_mrf;
-      inst->mlen = 2 * dispatch_width / 8;
-   }
-   return inst;
-}
-
 void
 fs_visitor::emit_discard_jump()
 {
@@ -1719,7 +1269,7 @@ fs_visitor::emit_discard_jump()
    /* For performance, after a discard, jump to the end of the
     * shader if all relevant channels have been discarded.
     */
-   fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
+   fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
    discard_jump->flag_subreg = 1;
 
    discard_jump->predicate = (dispatch_width == 8)
@@ -2317,26 +1867,22 @@ fs_visitor::demote_pull_constants()
 	    continue;
 
          /* Set up the annotation tracking for new generated instructions. */
-         base_ir = inst->ir;
-         current_annotation = inst->annotation;
-
+         const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
+                                    .at(block, inst);
          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
          fs_reg dst = vgrf(glsl_type::float_type);
 
          /* Generate a pull load into dst. */
          if (inst->src[i].reladdr) {
-            exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
-                                                        surf_index,
-                                                        *inst->src[i].reladdr,
-                                                        pull_index);
-            inst->insert_before(block, &list);
+            VARYING_PULL_CONSTANT_LOAD(ibld, dst,
+                                       surf_index,
+                                       *inst->src[i].reladdr,
+                                       pull_index);
             inst->src[i].reladdr = NULL;
          } else {
             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
-            fs_inst *pull =
-               new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
-                                    dst, surf_index, offset);
-            inst->insert_before(block, pull);
+            ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                      dst, surf_index, offset);
             inst->src[i].set_smear(pull_index & 3);
          }
 
@@ -2663,6 +2209,16 @@ fs_visitor::opt_sampler_eot()
    if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
       return false;
 
+   /* This optimisation doesn't seem to work for textureGather for some
+    * reason. I can't find any documentation or known workarounds to indicate
+    * that this is expected, but considering that it is probably pretty
+    * unlikely that a shader would directly write out the results from
+    * textureGather we might as well just disable it.
+    */
+   if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
+       tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
+      return false;
+
    /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
     * It's very likely to be the previous instruction.
     */
@@ -2676,7 +2232,7 @@ fs_visitor::opt_sampler_eot()
 
    tex_inst->offset |= fb_write->target << 24;
    tex_inst->eot = true;
-   tex_inst->dst = reg_null_ud;
+   tex_inst->dst = bld.null_reg_ud();
    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
 
    /* If a header is present, marking the eot is sufficient. Otherwise, we need
@@ -2688,7 +2244,8 @@ fs_visitor::opt_sampler_eot()
    if (tex_inst->header_size != 0)
       return true;
 
-   fs_reg send_header = vgrf(load_payload->sources + 1);
+   fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
+                                 load_payload->sources + 1);
    fs_reg *new_sources =
       ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
 
@@ -3041,8 +2598,8 @@ fs_visitor::emit_repclear_shader()
    fs_inst *mov;
 
    if (uniforms == 1) {
-      mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
-                     fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
+      mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
+                               fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
    } else {
       struct brw_reg reg =
          brw_reg(BRW_GENERAL_REGISTER_FILE,
@@ -3051,14 +2608,13 @@ fs_visitor::emit_repclear_shader()
                  BRW_WIDTH_2,
                  BRW_HORIZONTAL_STRIDE_4, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 
-      mov = emit(MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg)));
+      mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
+                               fs_reg(reg));
    }
 
-   mov->force_writemask_all = true;
-
    fs_inst *write;
    if (key->nr_color_regions == 1) {
-      write = emit(FS_OPCODE_REP_FB_WRITE);
+      write = bld.emit(FS_OPCODE_REP_FB_WRITE);
       write->saturate = key->clamp_fragment_color;
       write->base_mrf = color_mrf;
       write->target = 0;
@@ -3067,7 +2623,7 @@ fs_visitor::emit_repclear_shader()
    } else {
       assume(key->nr_color_regions > 0);
       for (int i = 0; i < key->nr_color_regions; ++i) {
-         write = emit(FS_OPCODE_REP_FB_WRITE);
+         write = bld.emit(FS_OPCODE_REP_FB_WRITE);
          write->saturate = key->clamp_fragment_color;
          write->base_mrf = base_mrf;
          write->target = i;
@@ -3223,9 +2779,8 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
        */
       if (block->start() == scan_inst) {
          for (int i = 0; i < write_len; i++) {
-            if (needs_dep[i]) {
-               inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
-            }
+            if (needs_dep[i])
+               DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
          }
          return;
       }
@@ -3241,7 +2796,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
             if (reg >= first_write_grf &&
                 reg < first_write_grf + write_len &&
                 needs_dep[reg - first_write_grf]) {
-               inst->insert_before(block, DEP_RESOLVE_MOV(reg));
+               DEP_RESOLVE_MOV(bld.at(block, inst), reg);
                needs_dep[reg - first_write_grf] = false;
                if (scan_inst->exec_size == 16)
                   needs_dep[reg - first_write_grf + 1] = false;
@@ -3288,8 +2843,7 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
       if (block->end() == scan_inst) {
          for (int i = 0; i < write_len; i++) {
             if (needs_dep[i])
-               scan_inst->insert_before(block,
-                                        DEP_RESOLVE_MOV(first_write_grf + i));
+               DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
          }
          return;
       }
@@ -3304,7 +2858,7 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
           scan_inst->dst.reg >= first_write_grf &&
           scan_inst->dst.reg < first_write_grf + write_len &&
           needs_dep[scan_inst->dst.reg - first_write_grf]) {
-         scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
+         DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
       }
 
@@ -3429,6 +2983,9 @@ fs_visitor::lower_load_payload()
       assert(inst->dst.file == MRF || inst->dst.file == GRF);
       assert(inst->saturate == false);
 
+      const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
+                                 .exec_all(inst->force_writemask_all)
+                                 .at(block, inst);
       fs_reg dst = inst->dst;
 
       /* Get rid of COMPR4.  We'll add it back in if we need it */
@@ -3441,9 +2998,7 @@ fs_visitor::lower_load_payload()
             fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
             fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
             mov_src.width = 8;
-            fs_inst *mov = MOV(mov_dst, mov_src);
-            mov->force_writemask_all = true;
-            inst->insert_before(block, mov);
+            ibld.exec_all().MOV(mov_dst, mov_src);
          }
          dst = offset(dst, 1);
       }
@@ -3474,23 +3029,13 @@ fs_visitor::lower_load_payload()
                if (devinfo->has_compr4) {
                   fs_reg compr4_dst = retype(dst, inst->src[i].type);
                   compr4_dst.reg |= BRW_MRF_COMPR4;
-
-                  fs_inst *mov = MOV(compr4_dst, inst->src[i]);
-                  mov->force_writemask_all = inst->force_writemask_all;
-                  inst->insert_before(block, mov);
+                  ibld.MOV(compr4_dst, inst->src[i]);
                } else {
                   /* Platform doesn't have COMPR4.  We have to fake it */
                   fs_reg mov_dst = retype(dst, inst->src[i].type);
                   mov_dst.width = 8;
-
-                  fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
-                  mov->force_writemask_all = inst->force_writemask_all;
-                  inst->insert_before(block, mov);
-
-                  mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
-                  mov->force_writemask_all = inst->force_writemask_all;
-                  mov->force_sechalf = true;
-                  inst->insert_before(block, mov);
+                  ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
+                  ibld.half(1).MOV(offset(mov_dst, 4), half(inst->src[i], 1));
                }
             }
 
@@ -3513,12 +3058,8 @@ fs_visitor::lower_load_payload()
       }
 
       for (uint8_t i = inst->header_size; i < inst->sources; i++) {
-         if (inst->src[i].file != BAD_FILE) {
-            fs_inst *mov = MOV(retype(dst, inst->src[i].type),
-                               inst->src[i]);
-            mov->force_writemask_all = inst->force_writemask_all;
-            inst->insert_before(block, mov);
-         }
+         if (inst->src[i].file != BAD_FILE)
+            ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
          dst = offset(dst, 1);
       }
 
@@ -3532,6 +3073,172 @@ fs_visitor::lower_load_payload()
    return progress;
 }
 
+bool
+fs_visitor::lower_integer_multiplication()
+{
+   bool progress = false;
+
+   /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
+    * directly, but Cherryview cannot.
+    */
+   if (devinfo->gen >= 8 && !devinfo->is_cherryview)
+      return false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      if (inst->opcode != BRW_OPCODE_MUL ||
+          inst->dst.is_accumulator() ||
+          (inst->dst.type != BRW_REGISTER_TYPE_D &&
+           inst->dst.type != BRW_REGISTER_TYPE_UD))
+         continue;
+
+      const fs_builder ibld = bld.at(block, inst);
+
+      /* The MUL instruction isn't commutative. On Gen <= 6, only the low
+       * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
+       * src1 are used.
+       *
+       * If multiplying by an immediate value that fits in 16-bits, do a
+       * single MUL instruction with that value in the proper location.
+       */
+      if (inst->src[1].file == IMM &&
+          inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
+         if (devinfo->gen < 7) {
+            fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
+                       inst->dst.type, dispatch_width);
+            ibld.MOV(imm, inst->src[1]);
+            ibld.MUL(inst->dst, imm, inst->src[0]);
+         } else {
+            ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
+         }
+      } else {
+         /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
+          * do 32-bit integer multiplication in one instruction, but instead
+          * must do a sequence (which actually calculates a 64-bit result):
+          *
+          *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
+          *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
+          *    mov(8)  g2<1>D     acc0<8,8,1>D
+          *
+          * But on Gen > 6, the ability to use second accumulator register
+          * (acc1) for non-float data types was removed, preventing a simple
+          * implementation in SIMD16. A 16-channel result can be calculated by
+          * executing the three instructions twice in SIMD8, once with quarter
+          * control of 1Q for the first eight channels and again with 2Q for
+          * the second eight channels.
+          *
+          * Which accumulator register is implicitly accessed (by AccWrEnable
+          * for instance) is determined by the quarter control. Unfortunately
+          * Ivybridge (and presumably Baytrail) has a hardware bug in which an
+          * implicit accumulator access by an instruction with 2Q will access
+          * acc1 regardless of whether the data type is usable in acc1.
+          *
+          * Specifically, the 2Q mach(8) writes acc1 which does not exist for
+          * integer data types.
+          *
+          * Since we only want the low 32-bits of the result, we can do two
+          * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
+          * adjust the high result and add them (like the mach is doing):
+          *
+          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
+          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
+          *    shl(8)  g9<1>D     g8<8,8,1>D      16D
+          *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
+          *
+          * We avoid the shl instruction by realizing that we only want to add
+          * the low 16-bits of the "high" result to the high 16-bits of the
+          * "low" result and using proper regioning on the add:
+          *
+          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
+          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
+          *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
+          *
+          * Since it does not use the (single) accumulator register, we can
+          * schedule multi-component multiplications much better.
+          */
+
+         if (inst->conditional_mod && inst->dst.is_null()) {
+            inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
+                               inst->dst.type, dispatch_width);
+         }
+         fs_reg low = inst->dst;
+         fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
+                     inst->dst.type, dispatch_width);
+
+         if (devinfo->gen >= 7) {
+            fs_reg src1_0_w = inst->src[1];
+            fs_reg src1_1_w = inst->src[1];
+
+            if (inst->src[1].file == IMM) {
+               src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
+               src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
+            } else {
+               src1_0_w.type = BRW_REGISTER_TYPE_UW;
+               if (src1_0_w.stride != 0) {
+                  assert(src1_0_w.stride == 1);
+                  src1_0_w.stride = 2;
+               }
+
+               src1_1_w.type = BRW_REGISTER_TYPE_UW;
+               if (src1_1_w.stride != 0) {
+                  assert(src1_1_w.stride == 1);
+                  src1_1_w.stride = 2;
+               }
+               src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
+            }
+            ibld.MUL(low, inst->src[0], src1_0_w);
+            ibld.MUL(high, inst->src[0], src1_1_w);
+         } else {
+            fs_reg src0_0_w = inst->src[0];
+            fs_reg src0_1_w = inst->src[0];
+
+            src0_0_w.type = BRW_REGISTER_TYPE_UW;
+            if (src0_0_w.stride != 0) {
+               assert(src0_0_w.stride == 1);
+               src0_0_w.stride = 2;
+            }
+
+            src0_1_w.type = BRW_REGISTER_TYPE_UW;
+            if (src0_1_w.stride != 0) {
+               assert(src0_1_w.stride == 1);
+               src0_1_w.stride = 2;
+            }
+            src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
+
+            ibld.MUL(low, src0_0_w, inst->src[1]);
+            ibld.MUL(high, src0_1_w, inst->src[1]);
+         }
+
+         fs_reg dst = inst->dst;
+         dst.type = BRW_REGISTER_TYPE_UW;
+         dst.subreg_offset = 2;
+         dst.stride = 2;
+
+         high.type = BRW_REGISTER_TYPE_UW;
+         high.stride = 2;
+
+         low.type = BRW_REGISTER_TYPE_UW;
+         low.subreg_offset = 2;
+         low.stride = 2;
+
+         ibld.ADD(dst, low, high);
+
+         if (inst->conditional_mod) {
+            fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
+            set_condmod(inst->conditional_mod,
+                        ibld.MOV(null, inst->dst));
+         }
+      }
+
+      inst->remove(block);
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
 void
 fs_visitor::dump_instructions()
 {
@@ -3602,6 +3309,9 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
    }
    fprintf(file, "(%d) ", inst->exec_size);
 
+   if (inst->mlen) {
+      fprintf(file, "(mlen: %d) ", inst->mlen);
+   }
 
    switch (inst->dst.file) {
    case GRF:
@@ -3895,7 +3605,7 @@ fs_visitor::setup_vs_payload()
 void
 fs_visitor::setup_cs_payload()
 {
-   assert(brw->gen >= 7);
+   assert(devinfo->gen >= 7);
 
    payload.num_regs = 1;
 }
@@ -3938,6 +3648,17 @@ fs_visitor::calculate_register_pressure()
 void
 fs_visitor::optimize()
 {
+   /* bld is the common builder object pointing at the end of the program we
+    * used to translate it into i965 IR.  For the optimization and lowering
+    * passes coming next, any code added after the end of the program without
+    * having explicitly called fs_builder::at() clearly points at a mistake.
+    * Ideally optimization passes wouldn't be part of the visitor so they
+    * wouldn't have access to bld at all, but they do, so just in case some
+    * pass forgets to ask for a location explicitly set it to NULL here to
+    * make it trip.
+    */
+   bld = bld.at(NULL, NULL);
+
    split_virtual_grfs();
 
    move_uniform_array_access_to_pull_constants();
@@ -3953,7 +3674,7 @@ fs_visitor::optimize()
          snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
                   stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
                                                                         \
-         backend_visitor::dump_instructions(filename);                  \
+         backend_shader::dump_instructions(filename);                   \
       }                                                                 \
                                                                         \
       progress = progress || this_progress;                             \
@@ -3966,7 +3687,7 @@ fs_visitor::optimize()
                stage_abbrev, dispatch_width,
                shader_prog ? shader_prog->Name : 0);
 
-      backend_visitor::dump_instructions(filename);
+      backend_shader::dump_instructions(filename);
    }
 
    bool progress;
@@ -4010,6 +3731,7 @@ fs_visitor::optimize()
    }
 
    OPT(opt_combine_constants);
+   OPT(lower_integer_multiplication);
 
    lower_uniform_pull_constant_loads();
 }
@@ -4066,9 +3788,11 @@ fs_visitor::allocate_registers()
          fail("Failure to register allocate.  Reduce number of "
               "live scalar values to avoid this.");
       } else {
-         perf_debug("%s shader triggered register spilling.  "
-                    "Try reducing the number of live scalar values to "
-                    "improve performance.\n", stage_name);
+         compiler->shader_perf_log(log_data,
+                                   "%s shader triggered register spilling.  "
+                                   "Try reducing the number of live scalar "
+                                   "values to improve performance.\n",
+                                   stage_name);
       }
 
       /* Since we're out of heuristics, just go spill registers until we
@@ -4097,7 +3821,7 @@ fs_visitor::allocate_registers()
 }
 
 bool
-fs_visitor::run_vs()
+fs_visitor::run_vs(gl_clip_plane *clip_planes)
 {
    assert(stage == MESA_SHADER_VERTEX);
 
@@ -4105,26 +3829,17 @@ fs_visitor::run_vs()
       assign_common_binding_table_offsets(0);
    setup_vs_payload();
 
-   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+   if (shader_time_index >= 0)
       emit_shader_time_begin();
 
-   if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
-      emit_nir_code();
-   } else {
-      foreach_in_list(ir_instruction, ir, shader->base.ir) {
-         base_ir = ir;
-         this->result = reg_undef;
-         ir->accept(this);
-      }
-      base_ir = NULL;
-   }
+   emit_nir_code();
 
    if (failed)
       return false;
 
-   emit_urb_writes();
+   emit_urb_writes(clip_planes);
 
-   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+   if (shader_time_index >= 0)
       emit_shader_time_end();
 
    calculate_cfg();
@@ -4141,7 +3856,7 @@ fs_visitor::run_vs()
 }
 
 bool
-fs_visitor::run_fs()
+fs_visitor::run_fs(bool do_rep_send)
 {
    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
@@ -4160,10 +3875,11 @@ fs_visitor::run_fs()
 
    if (0) {
       emit_dummy_fs();
-   } else if (brw->use_rep_send && dispatch_width == 16) {
+   } else if (do_rep_send) {
+      assert(dispatch_width == 16);
       emit_repclear_shader();
    } else {
-      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      if (shader_time_index >= 0)
          emit_shader_time_begin();
 
       calculate_urb_setup();
@@ -4178,37 +3894,27 @@ fs_visitor::run_fs()
        * Initialize it with the dispatched pixels.
        */
       if (wm_prog_data->uses_kill) {
-         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+         fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
          discard_init->flag_subreg = 1;
       }
 
       /* Generate FS IR for main().  (the visitor only descends into
        * functions called "main").
        */
-      if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
-         emit_nir_code();
-      } else if (shader) {
-         foreach_in_list(ir_instruction, ir, shader->base.ir) {
-            base_ir = ir;
-            this->result = reg_undef;
-            ir->accept(this);
-         }
-      } else {
-         emit_fragment_program_code();
-      }
-      base_ir = NULL;
+      emit_nir_code();
+
       if (failed)
 	 return false;
 
       if (wm_prog_data->uses_kill)
-         emit(FS_OPCODE_PLACEHOLDER_HALT);
+         bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
 
       if (wm_key->alpha_test_func)
          emit_alpha_test();
 
       emit_fb_writes();
 
-      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      if (shader_time_index >= 0)
          emit_shader_time_end();
 
       calculate_cfg();
@@ -4252,7 +3958,7 @@ fs_visitor::run_cs()
 
    setup_cs_payload();
 
-   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+   if (shader_time_index >= 0)
       emit_shader_time_begin();
 
    emit_nir_code();
@@ -4262,7 +3968,7 @@ fs_visitor::run_cs()
 
    emit_cs_terminate();
 
-   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+   if (shader_time_index >= 0)
       emit_shader_time_end();
 
    calculate_cfg();
@@ -4312,10 +4018,18 @@ brw_wm_fs_emit(struct brw_context *brw,
    if (unlikely(INTEL_DEBUG & DEBUG_WM))
       brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
 
+   int st_index8 = -1, st_index16 = -1;
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
+      st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8);
+      st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16);
+   }
+
    /* Now the main event: Visit the shader IR and generate our FS IR for it.
     */
-   fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
-   if (!v.run_fs()) {
+   fs_visitor v(brw->intelScreen->compiler, brw,
+                mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
+                prog, &fp->Base, 8, st_index8);
+   if (!v.run_fs(false /* do_rep_send */)) {
       if (prog) {
          prog->LinkStatus = false;
          ralloc_strcat(&prog->InfoLog, v.fail_msg);
@@ -4328,20 +4042,18 @@ brw_wm_fs_emit(struct brw_context *brw,
    }
 
    cfg_t *simd16_cfg = NULL;
-   fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
+   fs_visitor v2(brw->intelScreen->compiler, brw,
+                 mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
+                 prog, &fp->Base, 16, st_index16);
    if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
       if (!v.simd16_unsupported) {
          /* Try a SIMD16 compile */
          v2.import_uniforms(&v);
-         if (!v2.run_fs()) {
-            perf_debug("SIMD16 shader failed to compile, falling back to "
-                       "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
+         if (!v2.run_fs(brw->use_rep_send)) {
+            perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg);
          } else {
             simd16_cfg = v2.cfg;
          }
-      } else {
-         perf_debug("SIMD16 shader unsupported, falling back to "
-                    "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
       }
    }
 
@@ -4355,7 +4067,8 @@ brw_wm_fs_emit(struct brw_context *brw,
       prog_data->no_8 = false;
    }
 
-   fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
+   fs_generator g(brw->intelScreen->compiler, brw,
+                  mem_ctx, (void *) key, &prog_data->base,
                   &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
 
    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 1d7de2effbd..243baf688de 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -29,6 +29,7 @@
 
 #include "brw_shader.h"
 #include "brw_ir_fs.h"
+#include "brw_fs_builder.h"
 
 extern "C" {
 
@@ -66,138 +67,44 @@ namespace brw {
  *
  * Translates either GLSL IR or Mesa IR (for ARB_fragment_program) into FS IR.
  */
-class fs_visitor : public backend_visitor
+class fs_visitor : public backend_shader
 {
 public:
-   const fs_reg reg_null_f;
-   const fs_reg reg_null_d;
-   const fs_reg reg_null_ud;
-
-   fs_visitor(struct brw_context *brw,
+   fs_visitor(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
-              const struct brw_wm_prog_key *key,
-              struct brw_wm_prog_data *prog_data,
+              gl_shader_stage stage,
+              const void *key,
+              struct brw_stage_prog_data *prog_data,
               struct gl_shader_program *shader_prog,
-              struct gl_fragment_program *fp,
-              unsigned dispatch_width);
-
-   fs_visitor(struct brw_context *brw,
-              void *mem_ctx,
-              const struct brw_vs_prog_key *key,
-              struct brw_vs_prog_data *prog_data,
-              struct gl_shader_program *shader_prog,
-              struct gl_vertex_program *cp,
-              unsigned dispatch_width);
-
-   fs_visitor(struct brw_context *brw,
-              void *mem_ctx,
-              const struct brw_cs_prog_key *key,
-              struct brw_cs_prog_data *prog_data,
-              struct gl_shader_program *shader_prog,
-              struct gl_compute_program *cp,
-              unsigned dispatch_width);
+              struct gl_program *prog,
+              unsigned dispatch_width,
+              int shader_time_index);
 
    ~fs_visitor();
-   void init();
 
-   fs_reg *variable_storage(ir_variable *var);
    fs_reg vgrf(const glsl_type *const type);
-   fs_reg vgrf(int num_components);
    void import_uniforms(fs_visitor *v);
-   void setup_uniform_clipplane_values();
-   void compute_clip_distance();
-
-   void visit(ir_variable *ir);
-   void visit(ir_assignment *ir);
-   void visit(ir_dereference_variable *ir);
-   void visit(ir_dereference_record *ir);
-   void visit(ir_dereference_array *ir);
-   void visit(ir_expression *ir);
-   void visit(ir_texture *ir);
-   void visit(ir_if *ir);
-   void visit(ir_constant *ir);
-   void visit(ir_swizzle *ir);
-   void visit(ir_return *ir);
-   void visit(ir_loop *ir);
-   void visit(ir_loop_jump *ir);
-   void visit(ir_discard *ir);
-   void visit(ir_call *ir);
-   void visit(ir_function *ir);
-   void visit(ir_function_signature *ir);
-   void visit(ir_emit_vertex *);
-   void visit(ir_end_primitive *);
+   void setup_uniform_clipplane_values(gl_clip_plane *clip_planes);
+   void compute_clip_distance(gl_clip_plane *clip_planes);
 
    uint32_t gather_channel(int orig_chan, uint32_t sampler);
    void swizzle_result(ir_texture_opcode op, int dest_components,
                        fs_reg orig_val, uint32_t sampler);
 
-   fs_inst *emit(fs_inst *inst);
-   void emit(exec_list list);
-
-   fs_inst *emit(enum opcode opcode);
-   fs_inst *emit(enum opcode opcode, const fs_reg &dst);
-   fs_inst *emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0);
-   fs_inst *emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
-                 const fs_reg &src1);
-   fs_inst *emit(enum opcode opcode, const fs_reg &dst,
-                 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2);
-   fs_inst *emit(enum opcode opcode, const fs_reg &dst,
-                 fs_reg src[], int sources);
-
-   fs_inst *MOV(const fs_reg &dst, const fs_reg &src);
-   fs_inst *NOT(const fs_reg &dst, const fs_reg &src);
-   fs_inst *RNDD(const fs_reg &dst, const fs_reg &src);
-   fs_inst *RNDE(const fs_reg &dst, const fs_reg &src);
-   fs_inst *RNDZ(const fs_reg &dst, const fs_reg &src);
-   fs_inst *FRC(const fs_reg &dst, const fs_reg &src);
-   fs_inst *ADD(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *MUL(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *MACH(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *MAC(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *SHL(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *SHR(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *ASR(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *AND(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *OR(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *XOR(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *IF(enum brw_predicate predicate);
-   fs_inst *IF(const fs_reg &src0, const fs_reg &src1,
-               enum brw_conditional_mod condition);
-   fs_inst *CMP(fs_reg dst, fs_reg src0, fs_reg src1,
-                enum brw_conditional_mod condition);
-   fs_inst *LRP(const fs_reg &dst, const fs_reg &a, const fs_reg &y,
-                const fs_reg &x);
-   fs_inst *DEP_RESOLVE_MOV(int grf);
-   fs_inst *BFREV(const fs_reg &dst, const fs_reg &value);
-   fs_inst *BFE(const fs_reg &dst, const fs_reg &bits, const fs_reg &offset,
-                const fs_reg &value);
-   fs_inst *BFI1(const fs_reg &dst, const fs_reg &bits, const fs_reg &offset);
-   fs_inst *BFI2(const fs_reg &dst, const fs_reg &bfi1_dst,
-                 const fs_reg &insert, const fs_reg &base);
-   fs_inst *FBH(const fs_reg &dst, const fs_reg &value);
-   fs_inst *FBL(const fs_reg &dst, const fs_reg &value);
-   fs_inst *CBIT(const fs_reg &dst, const fs_reg &value);
-   fs_inst *MAD(const fs_reg &dst, const fs_reg &c, const fs_reg &b,
-                const fs_reg &a);
-   fs_inst *ADDC(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *SUBB(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *SEL(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-
    int type_size(const struct glsl_type *type);
    fs_inst *get_instruction_generating_reg(fs_inst *start,
 					   fs_inst *end,
 					   const fs_reg &reg);
 
-   fs_inst *LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
-                         int header_size);
-
-   exec_list VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
-                                        const fs_reg &surf_index,
-                                        const fs_reg &varying_offset,
-                                        uint32_t const_offset);
+   void VARYING_PULL_CONSTANT_LOAD(const brw::fs_builder &bld,
+                                   const fs_reg &dst,
+                                   const fs_reg &surf_index,
+                                   const fs_reg &varying_offset,
+                                   uint32_t const_offset);
+   void DEP_RESOLVE_MOV(const brw::fs_builder &bld, int grf);
 
-   bool run_fs();
-   bool run_vs();
+   bool run_fs(bool do_rep_send);
+   bool run_vs(gl_clip_plane *clip_planes);
    bool run_cs();
    void optimize();
    void allocate_registers();
@@ -213,11 +120,8 @@ public:
    void assign_vs_urb_setup();
    bool assign_regs(bool allow_spilling);
    void assign_regs_trivial();
-   void get_used_mrfs(bool *mrf_used);
    void setup_payload_interference(struct ra_graph *g, int payload_reg_count,
                                    int first_payload_node);
-   void setup_mrf_hack_interference(struct ra_graph *g,
-                                    int first_mrf_hack_node);
    int choose_spill_reg(struct ra_graph *g);
    void spill_reg(int spill_reg);
    void split_virtual_grfs();
@@ -254,9 +158,10 @@ public:
                                                      fs_inst *inst);
    void vfail(const char *msg, va_list args);
    void fail(const char *msg, ...);
-   void no16(const char *msg, ...);
+   void no16(const char *msg);
    void lower_uniform_pull_constant_loads();
    bool lower_load_payload();
+   bool lower_integer_multiplication();
    bool opt_combine_constants();
 
    void emit_dummy_fs();
@@ -318,58 +223,18 @@ public:
    fs_reg emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler);
    void emit_gen6_gather_wa(uint8_t wa, fs_reg dst);
    void resolve_source_modifiers(fs_reg *src);
-   fs_reg fix_math_operand(fs_reg src);
-   fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0);
-   fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0, fs_reg src1);
-   fs_inst *emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
-                     const fs_reg &a);
-   void emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
-                    const fs_reg &src0, const fs_reg &src1);
    void emit_discard_jump();
-   /** Copy any live channel from \p src to the first channel of \p dst. */
-   void emit_uniformize(const fs_reg &dst, const fs_reg &src);
-   bool try_emit_b2f_of_comparison(ir_expression *ir);
-   bool try_emit_saturate(ir_expression *ir);
-   bool try_emit_line(ir_expression *ir);
-   bool try_emit_mad(ir_expression *ir);
    bool try_replace_with_sel();
-   bool try_opt_frontfacing_ternary(ir_if *ir);
    bool opt_peephole_sel();
    bool opt_peephole_predicated_break();
    bool opt_saturate_propagation();
    bool opt_cmod_propagation();
    bool opt_zero_samples();
-   void emit_bool_to_cond_code(ir_rvalue *condition);
-   void emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3]);
-   void emit_if_gen6(ir_if *ir);
    void emit_unspill(bblock_t *block, fs_inst *inst, fs_reg reg,
                      uint32_t spill_offset, int count);
    void emit_spill(bblock_t *block, fs_inst *inst, fs_reg reg,
                    uint32_t spill_offset, int count);
 
-   void emit_fragment_program_code();
-   void setup_fp_regs();
-   fs_reg get_fp_src_reg(const prog_src_register *src);
-   fs_reg get_fp_dst_reg(const prog_dst_register *dst);
-   void emit_fp_alu1(enum opcode opcode,
-                     const struct prog_instruction *fpi,
-                     fs_reg dst, fs_reg src);
-   void emit_fp_alu2(enum opcode opcode,
-                     const struct prog_instruction *fpi,
-                     fs_reg dst, fs_reg src0, fs_reg src1);
-   void emit_fp_scalar_write(const struct prog_instruction *fpi,
-                             fs_reg dst, fs_reg src);
-   void emit_fp_scalar_math(enum opcode opcode,
-                            const struct prog_instruction *fpi,
-                            fs_reg dst, fs_reg src);
-
-   void emit_fp_minmax(const struct prog_instruction *fpi,
-                       fs_reg dst, fs_reg src0, fs_reg src1);
-
-   void emit_fp_sop(enum brw_conditional_mod conditional_mod,
-                    const struct prog_instruction *fpi,
-                    fs_reg dst, fs_reg src0, fs_reg src1, fs_reg one);
-
    void emit_nir_code();
    void nir_setup_inputs(nir_shader *shader);
    void nir_setup_outputs(nir_shader *shader);
@@ -383,13 +248,17 @@ public:
    void nir_emit_loop(nir_loop *loop);
    void nir_emit_block(nir_block *block);
    void nir_emit_instr(nir_instr *instr);
-   void nir_emit_alu(nir_alu_instr *instr);
-   void nir_emit_intrinsic(nir_intrinsic_instr *instr);
-   void nir_emit_texture(nir_tex_instr *instr);
-   void nir_emit_jump(nir_jump_instr *instr);
+   void nir_emit_alu(const brw::fs_builder &bld, nir_alu_instr *instr);
+   void nir_emit_intrinsic(const brw::fs_builder &bld,
+                           nir_intrinsic_instr *instr);
+   void nir_emit_texture(const brw::fs_builder &bld,
+                         nir_tex_instr *instr);
+   void nir_emit_jump(const brw::fs_builder &bld,
+                      nir_jump_instr *instr);
    fs_reg get_nir_src(nir_src src);
    fs_reg get_nir_dest(nir_dest dest);
-   void emit_percomp(fs_inst *inst, unsigned wr_mask);
+   void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst,
+                     unsigned wr_mask);
 
    bool optimize_frontfacing_ternary(nir_alu_instr *instr,
                                      const fs_reg &result);
@@ -397,16 +266,21 @@ public:
    void setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
                             unsigned exec_size, bool use_2nd_half);
    void emit_alpha_test();
-   fs_inst *emit_single_fb_write(fs_reg color1, fs_reg color2,
+   fs_inst *emit_single_fb_write(const brw::fs_builder &bld,
+                                 fs_reg color1, fs_reg color2,
                                  fs_reg src0_alpha, unsigned components,
                                  unsigned exec_size, bool use_2nd_half = false);
    void emit_fb_writes();
-   void emit_urb_writes();
+   void emit_urb_writes(gl_clip_plane *clip_planes);
    void emit_cs_terminate();
 
+   void emit_barrier();
+
    void emit_shader_time_begin();
    void emit_shader_time_end();
-   fs_inst *SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value);
+   void SHADER_TIME_ADD(const brw::fs_builder &bld,
+                        int shader_time_subindex,
+                        fs_reg value);
 
    void emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
                             fs_reg dst, fs_reg offset, fs_reg src0,
@@ -415,23 +289,9 @@ public:
    void emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
                                   fs_reg offset);
 
-   void emit_interpolate_expression(ir_expression *ir);
-
-   bool try_rewrite_rhs_to_dst(ir_assignment *ir,
-			       fs_reg dst,
-			       fs_reg src,
-			       fs_inst *pre_rhs_inst,
-			       fs_inst *last_rhs_inst);
-   void emit_assignment_writes(fs_reg &l, fs_reg &r,
-			       const glsl_type *type, bool predicated);
-   void resolve_ud_negate(fs_reg *reg);
-   void resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg);
-
-   fs_reg get_timestamp(fs_inst **out_mov);
+   fs_reg get_timestamp(const brw::fs_builder &bld);
 
    struct brw_reg interp_reg(int location, int channel);
-   void setup_uniform_values(ir_variable *ir);
-   void setup_builtin_uniform_values(ir_variable *ir);
    int implied_mrf_writes(fs_inst *inst);
 
    virtual void dump_instructions();
@@ -439,8 +299,6 @@ public:
    void dump_instruction(backend_instruction *inst);
    void dump_instruction(backend_instruction *inst, FILE *file);
 
-   void visit_atomic_counter_intrinsic(ir_call *ir);
-
    const void *const key;
    const struct brw_sampler_prog_key_data *key_tex;
 
@@ -476,7 +334,6 @@ public:
     */
    int *push_constant_loc;
 
-   struct hash_table *variable_ht;
    fs_reg frag_depth;
    fs_reg sample_mask;
    fs_reg outputs[VARYING_SLOT_MAX];
@@ -487,26 +344,18 @@ public:
    /** Either BRW_MAX_GRF or GEN7_MRF_HACK_START */
    unsigned max_grf;
 
-   fs_reg *fp_temp_regs;
-   fs_reg *fp_input_regs;
-
    fs_reg *nir_locals;
    fs_reg *nir_globals;
    fs_reg nir_inputs;
    fs_reg nir_outputs;
    fs_reg *nir_system_values;
 
-   /** @{ debug annotation info */
-   const char *current_annotation;
-   const void *base_ir;
-   /** @} */
-
    bool failed;
    char *fail_msg;
    bool simd16_unsupported;
    char *no16_msg;
 
-   /* Result of last visit() method. */
+   /* Result of last visit() method. Still used by emit_texture() */
    fs_reg result;
 
    /** Register numbers for thread payload fields. */
@@ -539,7 +388,10 @@ public:
 
    const unsigned dispatch_width; /**< 8 or 16 */
 
+   int shader_time_index;
+
    unsigned promoted_constants;
+   brw::fs_builder bld;
 };
 
 /**
@@ -550,7 +402,7 @@ public:
 class fs_generator
 {
 public:
-   fs_generator(struct brw_context *brw,
+   fs_generator(const struct brw_compiler *compiler, void *log_data,
                 void *mem_ctx,
                 const void *key,
                 struct brw_stage_prog_data *prog_data,
@@ -572,6 +424,7 @@ private:
    void generate_fb_write(fs_inst *inst, struct brw_reg payload);
    void generate_urb_write(fs_inst *inst, struct brw_reg payload);
    void generate_cs_terminate(fs_inst *inst, struct brw_reg payload);
+   void generate_barrier(fs_inst *inst, struct brw_reg src);
    void generate_blorp_fb_write(fs_inst *inst);
    void generate_linterp(fs_inst *inst, struct brw_reg dst,
 			 struct brw_reg *src);
@@ -644,7 +497,9 @@ private:
 
    bool patch_discard_jumps_to_fb_writes();
 
-   struct brw_context *brw;
+   const struct brw_compiler *compiler;
+   void *log_data; /* Passed to compiler->*_log functions */
+
    const struct brw_device_info *devinfo;
 
    struct brw_codegen *p;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
new file mode 100644
index 00000000000..58ac5980da5
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -0,0 +1,652 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2010-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_FS_BUILDER_H
+#define BRW_FS_BUILDER_H
+
+#include "brw_ir_fs.h"
+#include "brw_shader.h"
+#include "brw_context.h"
+
+namespace brw {
+   /**
+    * Toolbox to assemble an FS IR program out of individual instructions.
+    *
+    * This object is meant to have an interface consistent with
+    * brw::vec4_builder.  They cannot be fully interchangeable because
+    * brw::fs_builder generates scalar code while brw::vec4_builder generates
+    * vector code.
+    */
+   class fs_builder {
+   public:
+      /** Type used in this IR to represent a source of an instruction. */
+      typedef fs_reg src_reg;
+
+      /** Type used in this IR to represent the destination of an instruction. */
+      typedef fs_reg dst_reg;
+
+      /** Type used in this IR to represent an instruction. */
+      typedef fs_inst instruction;
+
+      /**
+       * Construct an fs_builder that inserts instructions into \p shader.
+       * \p dispatch_width gives the native execution width of the program.
+       */
+      fs_builder(backend_shader *shader,
+                 unsigned dispatch_width) :
+         shader(shader), block(NULL), cursor(NULL),
+         _dispatch_width(dispatch_width),
+         _group(0),
+         force_writemask_all(false),
+         annotation()
+      {
+      }
+
+      /**
+       * Construct an fs_builder that inserts instructions before \p cursor in
+       * basic block \p block, inheriting other code generation parameters
+       * from this.
+       */
+      fs_builder
+      at(bblock_t *block, exec_node *cursor) const
+      {
+         fs_builder bld = *this;
+         bld.block = block;
+         bld.cursor = cursor;
+         return bld;
+      }
+
+      /**
+       * Construct an fs_builder appending instructions at the end of the
+       * instruction list of the shader, inheriting other code generation
+       * parameters from this.
+       */
+      fs_builder
+      at_end() const
+      {
+         return at(NULL, (exec_node *)&shader->instructions.tail);
+      }
+
+      /**
+       * Construct a builder specifying the default SIMD width and group of
+       * channel enable signals, inheriting other code generation parameters
+       * from this.
+       *
+       * \p n gives the default SIMD width, \p i gives the slot group used for
+       * predication and control flow masking in multiples of \p n channels.
+       */
+      fs_builder
+      group(unsigned n, unsigned i) const
+      {
+         assert(n <= dispatch_width() &&
+                i < dispatch_width() / n);
+         fs_builder bld = *this;
+         bld._dispatch_width = n;
+         bld._group += i * n;
+         return bld;
+      }
+
+      /**
+       * Alias for group() with width equal to eight.
+       */
+      fs_builder
+      half(unsigned i) const
+      {
+         return group(8, i);
+      }
+
+      /**
+       * Construct a builder with per-channel control flow execution masking
+       * disabled if \p b is true.  If control flow execution masking is
+       * already disabled this has no effect.
+       */
+      fs_builder
+      exec_all(bool b = true) const
+      {
+         fs_builder bld = *this;
+         if (b)
+            bld.force_writemask_all = true;
+         return bld;
+      }
+
+      /**
+       * Construct a builder with the given debug annotation info.
+       */
+      fs_builder
+      annotate(const char *str, const void *ir = NULL) const
+      {
+         fs_builder bld = *this;
+         bld.annotation.str = str;
+         bld.annotation.ir = ir;
+         return bld;
+      }
+
+      /**
+       * Get the SIMD width in use.
+       */
+      unsigned
+      dispatch_width() const
+      {
+         return _dispatch_width;
+      }
+
+      /**
+       * Allocate a virtual register of natural vector size (one for this IR)
+       * and SIMD width.  \p n gives the amount of space to allocate in
+       * dispatch_width units (which is just enough space for one logical
+       * component in this IR).
+       */
+      dst_reg
+      vgrf(enum brw_reg_type type, unsigned n = 1) const
+      {
+         return dst_reg(GRF, shader->alloc.allocate(
+                           DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
+                                        REG_SIZE)),
+                        type, dispatch_width());
+      }
+
+      /**
+       * Create a null register of floating type.
+       */
+      dst_reg
+      null_reg_f() const
+      {
+         return dst_reg(retype(brw_null_vec(dispatch_width()),
+                               BRW_REGISTER_TYPE_F));
+      }
+
+      /**
+       * Create a null register of signed integer type.
+       */
+      dst_reg
+      null_reg_d() const
+      {
+         return dst_reg(retype(brw_null_vec(dispatch_width()),
+                               BRW_REGISTER_TYPE_D));
+      }
+
+      /**
+       * Create a null register of unsigned integer type.
+       */
+      dst_reg
+      null_reg_ud() const
+      {
+         return dst_reg(retype(brw_null_vec(dispatch_width()),
+                               BRW_REGISTER_TYPE_UD));
+      }
+
+      /**
+       * Get the mask of SIMD channels enabled by dispatch and not yet
+       * disabled by discard.
+       */
+      src_reg
+      sample_mask_reg() const
+      {
+         const bool uses_kill =
+            (shader->stage == MESA_SHADER_FRAGMENT &&
+             ((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill);
+         return (shader->stage != MESA_SHADER_FRAGMENT ? src_reg(0xffff) :
+                 uses_kill ? brw_flag_reg(0, 1) :
+                 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
+      }
+
+      /**
+       * Insert an instruction into the program.
+       */
+      instruction *
+      emit(const instruction &inst) const
+      {
+         return emit(new(shader->mem_ctx) instruction(inst));
+      }
+
+      /**
+       * Create and insert a nullary control instruction into the program.
+       */
+      instruction *
+      emit(enum opcode opcode) const
+      {
+         return emit(instruction(opcode, dispatch_width()));
+      }
+
+      /**
+       * Create and insert a nullary instruction into the program.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst) const
+      {
+         return emit(instruction(opcode, dst));
+      }
+
+      /**
+       * Create and insert a unary instruction into the program.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
+      {
+         switch (opcode) {
+         case SHADER_OPCODE_RCP:
+         case SHADER_OPCODE_RSQ:
+         case SHADER_OPCODE_SQRT:
+         case SHADER_OPCODE_EXP2:
+         case SHADER_OPCODE_LOG2:
+         case SHADER_OPCODE_SIN:
+         case SHADER_OPCODE_COS:
+            return fix_math_instruction(
+               emit(instruction(opcode, dst.width, dst,
+                                fix_math_operand(src0))));
+
+         default:
+            return emit(instruction(opcode, dst.width, dst, src0));
+         }
+      }
+
+      /**
+       * Create and insert a binary instruction into the program.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+           const src_reg &src1) const
+      {
+         switch (opcode) {
+         case SHADER_OPCODE_POW:
+         case SHADER_OPCODE_INT_QUOTIENT:
+         case SHADER_OPCODE_INT_REMAINDER:
+            return fix_math_instruction(
+               emit(instruction(opcode, dst.width, dst,
+                                fix_math_operand(src0),
+                                fix_math_operand(src1))));
+
+         default:
+            return emit(instruction(opcode, dst.width, dst, src0, src1));
+
+         }
+      }
+
+      /**
+       * Create and insert a ternary instruction into the program.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+           const src_reg &src1, const src_reg &src2) const
+      {
+         switch (opcode) {
+         case BRW_OPCODE_BFE:
+         case BRW_OPCODE_BFI2:
+         case BRW_OPCODE_MAD:
+         case BRW_OPCODE_LRP:
+            return emit(instruction(opcode, dst.width, dst,
+                                    fix_3src_operand(src0),
+                                    fix_3src_operand(src1),
+                                    fix_3src_operand(src2)));
+
+         default:
+            return emit(instruction(opcode, dst.width, dst, src0, src1, src2));
+         }
+      }
+
+      /**
+       * Insert a preallocated instruction into the program.
+       */
+      instruction *
+      emit(instruction *inst) const
+      {
+         assert(inst->exec_size == dispatch_width() ||
+                force_writemask_all);
+         assert(_group == 0 || _group == 8);
+
+         inst->force_sechalf = (_group == 8);
+         inst->force_writemask_all = force_writemask_all;
+         inst->annotation = annotation.str;
+         inst->ir = annotation.ir;
+
+         if (block)
+            static_cast<instruction *>(cursor)->insert_before(block, inst);
+         else
+            cursor->insert_before(inst);
+
+         return inst;
+      }
+
+      /**
+       * Select \p src0 if the comparison of both sources with the given
+       * conditional mod evaluates to true, otherwise select \p src1.
+       *
+       * Generally useful to get the minimum or maximum of two values.
+       */
+      void
+      emit_minmax(const dst_reg &dst, const src_reg &src0,
+                  const src_reg &src1, brw_conditional_mod mod) const
+      {
+         if (shader->devinfo->gen >= 6) {
+            set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
+                                 fix_unsigned_negate(src1)));
+         } else {
+            CMP(null_reg_d(), src0, src1, mod);
+            set_predicate(BRW_PREDICATE_NORMAL,
+                          SEL(dst, src0, src1));
+         }
+      }
+
+      /**
+       * Copy any live channel from \p src to the first channel of \p dst.
+       */
+      void
+      emit_uniformize(const dst_reg &dst, const src_reg &src) const
+      {
+         const fs_builder ubld = exec_all();
+         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
+
+         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, component(chan_index, 0));
+         ubld.emit(SHADER_OPCODE_BROADCAST, component(dst, 0),
+                   src, component(chan_index, 0));
+      }
+
+      /**
+       * Assorted arithmetic ops.
+       * @{
+       */
+#define ALU1(op)                                        \
+      instruction *                                     \
+      op(const dst_reg &dst, const src_reg &src0) const \
+      {                                                 \
+         return emit(BRW_OPCODE_##op, dst, src0);       \
+      }
+
+#define ALU2(op)                                                        \
+      instruction *                                                     \
+      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
+      {                                                                 \
+         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
+      }
+
+#define ALU2_ACC(op)                                                    \
+      instruction *                                                     \
+      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
+      {                                                                 \
+         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
+         inst->writes_accumulator = true;                               \
+         return inst;                                                   \
+      }
+
+#define ALU3(op)                                                        \
+      instruction *                                                     \
+      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
+         const src_reg &src2) const                                     \
+      {                                                                 \
+         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
+      }
+
+      ALU2(ADD)
+      ALU2_ACC(ADDC)
+      ALU2(AND)
+      ALU2(ASR)
+      ALU2(AVG)
+      ALU3(BFE)
+      ALU2(BFI1)
+      ALU3(BFI2)
+      ALU1(BFREV)
+      ALU1(CBIT)
+      ALU2(CMPN)
+      ALU3(CSEL)
+      ALU2(DP2)
+      ALU2(DP3)
+      ALU2(DP4)
+      ALU2(DPH)
+      ALU1(F16TO32)
+      ALU1(F32TO16)
+      ALU1(FBH)
+      ALU1(FBL)
+      ALU1(FRC)
+      ALU2(LINE)
+      ALU1(LZD)
+      ALU2(MAC)
+      ALU2_ACC(MACH)
+      ALU3(MAD)
+      ALU1(MOV)
+      ALU2(MUL)
+      ALU1(NOT)
+      ALU2(OR)
+      ALU2(PLN)
+      ALU1(RNDD)
+      ALU1(RNDE)
+      ALU1(RNDU)
+      ALU1(RNDZ)
+      ALU2(SAD2)
+      ALU2_ACC(SADA2)
+      ALU2(SEL)
+      ALU2(SHL)
+      ALU2(SHR)
+      ALU2_ACC(SUBB)
+      ALU2(XOR)
+
+#undef ALU3
+#undef ALU2_ACC
+#undef ALU2
+#undef ALU1
+      /** @} */
+
+      /**
+       * CMP: Sets the low bit of the destination channels with the result
+       * of the comparison, while the upper bits are undefined, and updates
+       * the flag register with the packed 16 bits of the result.
+       */
+      instruction *
+      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
+          brw_conditional_mod condition) const
+      {
+         /* Take the instruction:
+          *
+          * CMP null<d> src0<f> src1<f>
+          *
+          * Original gen4 does type conversion to the destination type
+          * before comparison, producing garbage results for floating
+          * point comparisons.
+          *
+          * The destination type doesn't matter on newer generations,
+          * so we set the type to match src0 so we can compact the
+          * instruction.
+          */
+         return set_condmod(condition,
+                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
+                                 fix_unsigned_negate(src0),
+                                 fix_unsigned_negate(src1)));
+      }
+
+      /**
+       * Gen4 predicated IF.
+       */
+      instruction *
+      IF(brw_predicate predicate) const
+      {
+         return set_predicate(predicate, emit(BRW_OPCODE_IF));
+      }
+
+      /**
+       * Emit a linear interpolation instruction.
+       */
+      instruction *
+      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
+          const src_reg &a) const
+      {
+         if (shader->devinfo->gen >= 6) {
+            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
+             * we need to reorder the operands.
+             */
+            return emit(BRW_OPCODE_LRP, dst, a, y, x);
+
+         } else {
+            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
+            const dst_reg y_times_a = vgrf(dst.type);
+            const dst_reg one_minus_a = vgrf(dst.type);
+            const dst_reg x_times_one_minus_a = vgrf(dst.type);
+
+            MUL(y_times_a, y, a);
+            ADD(one_minus_a, negate(a), src_reg(1.0f));
+            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
+            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
+         }
+      }
+
+      /**
+       * Collect a number of registers in a contiguous range of registers.
+       */
+      instruction *
+      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
+                   unsigned sources, unsigned header_size) const
+      {
+         assert(dst.width % 8 == 0);
+         instruction *inst = emit(instruction(SHADER_OPCODE_LOAD_PAYLOAD,
+                                              dst.width, dst, src, sources));
+         inst->header_size = header_size;
+
+         for (unsigned i = 0; i < header_size; i++)
+            assert(src[i].file != GRF ||
+                   src[i].width * type_sz(src[i].type) == 32);
+         inst->regs_written = header_size;
+
+         for (unsigned i = header_size; i < sources; ++i)
+            assert(src[i].file != GRF ||
+                   src[i].width == dst.width);
+         inst->regs_written += (sources - header_size) * (dst.width / 8);
+
+         return inst;
+      }
+
+      backend_shader *shader;
+
+   private:
+      /**
+       * Workaround for negation of UD registers.  See comment in
+       * fs_generator::generate_code() for more details.
+       */
+      src_reg
+      fix_unsigned_negate(const src_reg &src) const
+      {
+         if (src.type == BRW_REGISTER_TYPE_UD &&
+             src.negate) {
+            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
+            MOV(temp, src);
+            return src_reg(temp);
+         } else {
+            return src;
+         }
+      }
+
+      /**
+       * Workaround for source register modes not supported by the ternary
+       * instruction encoding.
+       */
+      src_reg
+      fix_3src_operand(const src_reg &src) const
+      {
+         if (src.file == GRF || src.file == UNIFORM || src.stride > 1) {
+            return src;
+         } else {
+            dst_reg expanded = vgrf(src.type);
+            MOV(expanded, src);
+            return expanded;
+         }
+      }
+
+      /**
+       * Workaround for source register modes not supported by the math
+       * instruction.
+       */
+      src_reg
+      fix_math_operand(const src_reg &src) const
+      {
+         /* Can't do hstride == 0 args on gen6 math, so expand it out. We
+          * might be able to do better by doing execsize = 1 math and then
+          * expanding that result out, but we would need to be careful with
+          * masking.
+          *
+          * Gen6 hardware ignores source modifiers (negate and abs) on math
+          * instructions, so we also move to a temp to set those up.
+          *
+          * Gen7 relaxes most of the above restrictions, but still can't use IMM
+          * operands to math
+          */
+         if ((shader->devinfo->gen == 6 &&
+              (src.file == IMM || src.file == UNIFORM ||
+               src.abs || src.negate)) ||
+             (shader->devinfo->gen == 7 && src.file == IMM)) {
+            const dst_reg tmp = vgrf(src.type);
+            MOV(tmp, src);
+            return tmp;
+         } else {
+            return src;
+         }
+      }
+
+      /**
+       * Workaround other weirdness of the math instruction.
+       */
+      instruction *
+      fix_math_instruction(instruction *inst) const
+      {
+         if (shader->devinfo->gen < 6) {
+            inst->base_mrf = 2;
+            inst->mlen = inst->sources * dispatch_width() / 8;
+
+            if (inst->sources > 1) {
+               /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
+                * "Message Payload":
+                *
+                * "Operand0[7].  For the INT DIV functions, this operand is the
+                *  denominator."
+                *  ...
+                * "Operand1[7].  For the INT DIV functions, this operand is the
+                *  numerator."
+                */
+               const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
+               const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
+               const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
+
+               inst->resize_sources(1);
+               inst->src[0] = src0;
+
+               at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type,
+                                          dispatch_width()), src1);
+            }
+         }
+
+         return inst;
+      }
+
+      bblock_t *block;
+      exec_node *cursor;
+
+      unsigned _dispatch_width;
+      unsigned _group;
+      bool force_writemask_all;
+
+      /** Debug annotation info. */
+      struct {
+         const char *str;
+         const void *ir;
+      } annotation;
+   };
+}
+
+#endif
diff --git a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
index aa62031df73..0af5a915c9f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
@@ -38,6 +38,8 @@
 #include "brw_fs_live_variables.h"
 #include "brw_cfg.h"
 
+using namespace brw;
+
 /* Returns whether an instruction could co-issue if its immediate source were
  * replaced with a GRF source.
  */
@@ -270,15 +272,14 @@ fs_visitor::opt_combine_constants()
    reg.stride = 0;
    for (int i = 0; i < table.len; i++) {
       struct imm *imm = &table.imm[i];
-
-      fs_inst *mov = MOV(reg, fs_reg(imm->val));
-      mov->force_writemask_all = true;
-      if (imm->inst) {
-         imm->inst->insert_before(imm->block, mov);
-      } else {
-         backend_instruction *inst = imm->block->last_non_control_flow_inst();
-         inst->insert_after(imm->block, mov);
-      }
+      /* Insert it either before the instruction that generated the immediate
+       * or after the last non-control flow instruction of the common ancestor.
+       */
+      exec_node *n = (imm->inst ? imm->inst :
+                      imm->block->last_non_control_flow_inst()->next);
+      const fs_builder ibld = bld.at(imm->block, n).exec_all();
+
+      ibld.MOV(reg, fs_reg(imm->val));
       imm->reg = reg.reg;
       imm->subreg_offset = reg.subreg_offset;
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index 52bfa921dc3..c92aae4b1d6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -541,8 +541,16 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
             /* Fit this constant in by commuting the operands.
              * Exception: we can't do this for 32-bit integer MUL/MACH
              * because it's asymmetric.
+             *
+             * The BSpec says for Broadwell that
+             *
+             *    "When multiplying DW x DW, the dst cannot be accumulator."
+             *
+             * Integer MUL with a non-accumulator destination will be lowered
+             * by lower_integer_multiplication(), so don't restrict it.
              */
-            if ((inst->opcode == BRW_OPCODE_MUL ||
+            if (((inst->opcode == BRW_OPCODE_MUL &&
+                  inst->dst.is_accumulator()) ||
                  inst->opcode == BRW_OPCODE_MACH) &&
                 (inst->src[1].type == BRW_REGISTER_TYPE_D ||
                  inst->src[1].type == BRW_REGISTER_TYPE_UD))
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index db01f8cf7ab..70f0217b93d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -32,6 +32,8 @@
  * 13.1 (p378).
  */
 
+using namespace brw;
+
 namespace {
 struct aeb_entry : public exec_node {
    /** The instruction that generates the expression value. */
@@ -152,28 +154,34 @@ static bool
 instructions_match(fs_inst *a, fs_inst *b, bool *negate)
 {
    return a->opcode == b->opcode &&
+          a->force_writemask_all == b->force_writemask_all &&
+          a->exec_size == b->exec_size &&
+          a->force_sechalf == b->force_sechalf &&
           a->saturate == b->saturate &&
           a->predicate == b->predicate &&
           a->predicate_inverse == b->predicate_inverse &&
           a->conditional_mod == b->conditional_mod &&
+          a->flag_subreg == b->flag_subreg &&
           a->dst.type == b->dst.type &&
+          a->offset == b->offset &&
+          a->mlen == b->mlen &&
+          a->regs_written == b->regs_written &&
+          a->base_mrf == b->base_mrf &&
+          a->eot == b->eot &&
+          a->header_size == b->header_size &&
+          a->shadow_compare == b->shadow_compare &&
+          a->pi_noperspective == b->pi_noperspective &&
           a->sources == b->sources &&
-          (a->is_tex() ? (a->offset == b->offset &&
-                          a->mlen == b->mlen &&
-                          a->regs_written == b->regs_written &&
-                          a->base_mrf == b->base_mrf &&
-                          a->eot == b->eot &&
-                          a->header_size == b->header_size &&
-                          a->shadow_compare == b->shadow_compare)
-                       : true) &&
           operands_match(a, b, negate);
 }
 
-static fs_inst *
-create_copy_instr(fs_visitor *v, fs_inst *inst, fs_reg src, bool negate)
+static void
+create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
 {
    int written = inst->regs_written;
    int dst_width = inst->dst.width / 8;
+   const fs_builder ubld = bld.group(inst->exec_size, inst->force_sechalf)
+                              .exec_all(inst->force_writemask_all);
    fs_inst *copy;
 
    if (written > dst_width) {
@@ -189,7 +197,7 @@ create_copy_instr(fs_visitor *v, fs_inst *inst, fs_reg src, bool negate)
       }
 
       assert(src.file == GRF);
-      payload = ralloc_array(v->mem_ctx, fs_reg, sources);
+      payload = ralloc_array(bld.shader->mem_ctx, fs_reg, sources);
       for (int i = 0; i < header_size; i++) {
          payload[i] = src;
          payload[i].width = 8;
@@ -199,15 +207,12 @@ create_copy_instr(fs_visitor *v, fs_inst *inst, fs_reg src, bool negate)
          payload[i] = src;
          src = offset(src, 1);
       }
-      copy = v->LOAD_PAYLOAD(inst->dst, payload, sources, header_size);
+      copy = ubld.LOAD_PAYLOAD(inst->dst, payload, sources, header_size);
    } else {
-      copy = v->MOV(inst->dst, src);
-      copy->force_writemask_all = inst->force_writemask_all;
+      copy = ubld.MOV(inst->dst, src);
       copy->src[0].negate = negate;
    }
    assert(copy->regs_written == written);
-
-   return copy;
 }
 
 bool
@@ -261,9 +266,8 @@ fs_visitor::opt_cse_local(bblock_t *block)
                                    entry->generator->dst.type,
                                    entry->generator->dst.width);
 
-               fs_inst *copy = create_copy_instr(this, entry->generator,
-                                                 entry->tmp, false);
-               entry->generator->insert_after(block, copy);
+               create_copy_instr(bld.at(block, entry->generator->next),
+                                 entry->generator, entry->tmp, false);
 
                entry->generator->dst = entry->tmp;
             }
@@ -274,9 +278,7 @@ fs_visitor::opt_cse_local(bblock_t *block)
                assert(inst->dst.width == entry->generator->dst.width);
                assert(inst->dst.type == entry->tmp.type);
 
-               fs_inst *copy = create_copy_instr(this, inst,
-                                                 entry->tmp, negate);
-               inst->insert_before(block, copy);
+               create_copy_instr(bld.at(block, inst), inst, entry->tmp, negate);
             }
 
             /* Set our iterator so that next time through the loop inst->next
diff --git a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
deleted file mode 100644
index 6518ff60c3b..00000000000
--- a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
+++ /dev/null
@@ -1,742 +0,0 @@
-/*
- * Copyright © 2012 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-/** @file brw_fs_fp.cpp
- *
- * Implementation of the compiler for GL_ARB_fragment_program shaders on top
- * of the GLSL compiler backend.
- */
-
-#include "brw_context.h"
-#include "brw_fs.h"
-
-void
-fs_visitor::emit_fp_alu1(enum opcode opcode,
-                         const struct prog_instruction *fpi,
-                         fs_reg dst, fs_reg src)
-{
-   for (int i = 0; i < 4; i++) {
-      if (fpi->DstReg.WriteMask & (1 << i))
-         emit(opcode, offset(dst, i), offset(src, i));
-   }
-}
-
-void
-fs_visitor::emit_fp_alu2(enum opcode opcode,
-                         const struct prog_instruction *fpi,
-                         fs_reg dst, fs_reg src0, fs_reg src1)
-{
-   for (int i = 0; i < 4; i++) {
-      if (fpi->DstReg.WriteMask & (1 << i))
-         emit(opcode, offset(dst, i),
-              offset(src0, i), offset(src1, i));
-   }
-}
-
-void
-fs_visitor::emit_fp_minmax(const prog_instruction *fpi,
-                           fs_reg dst, fs_reg src0, fs_reg src1)
-{
-   enum brw_conditional_mod conditionalmod;
-   if (fpi->Opcode == OPCODE_MIN)
-      conditionalmod = BRW_CONDITIONAL_L;
-   else
-      conditionalmod = BRW_CONDITIONAL_GE;
-
-   for (int i = 0; i < 4; i++) {
-      if (fpi->DstReg.WriteMask & (1 << i)) {
-         emit_minmax(conditionalmod, offset(dst, i),
-                     offset(src0, i), offset(src1, i));
-      }
-   }
-}
-
-void
-fs_visitor::emit_fp_sop(enum brw_conditional_mod conditional_mod,
-                        const struct prog_instruction *fpi,
-                        fs_reg dst, fs_reg src0, fs_reg src1,
-                        fs_reg one)
-{
-   for (int i = 0; i < 4; i++) {
-      if (fpi->DstReg.WriteMask & (1 << i)) {
-         fs_inst *inst;
-
-         emit(CMP(reg_null_d, offset(src0, i), offset(src1, i),
-                  conditional_mod));
-
-         inst = emit(BRW_OPCODE_SEL, offset(dst, i), one, fs_reg(0.0f));
-         inst->predicate = BRW_PREDICATE_NORMAL;
-      }
-   }
-}
-
-void
-fs_visitor::emit_fp_scalar_write(const struct prog_instruction *fpi,
-                                 fs_reg dst, fs_reg src)
-{
-   for (int i = 0; i < 4; i++) {
-      if (fpi->DstReg.WriteMask & (1 << i))
-         emit(MOV(offset(dst, i), src));
-   }
-}
-
-void
-fs_visitor::emit_fp_scalar_math(enum opcode opcode,
-                                const struct prog_instruction *fpi,
-                                fs_reg dst, fs_reg src)
-{
-   fs_reg temp = vgrf(glsl_type::float_type);
-   emit_math(opcode, temp, src);
-   emit_fp_scalar_write(fpi, dst, temp);
-}
-
-void
-fs_visitor::emit_fragment_program_code()
-{
-   setup_fp_regs();
-
-   /* Keep a reg with 1.0 around, for reuse by emit_fp_sop so that it can just
-    * be:
-    *
-    * sel.f0 dst 1.0 0.0
-    *
-    * instead of
-    *
-    * mov    dst 0.0
-    * mov.f0 dst 1.0
-    */
-   fs_reg one = vgrf(glsl_type::float_type);
-   emit(MOV(one, fs_reg(1.0f)));
-
-   for (unsigned int insn = 0; insn < prog->NumInstructions; insn++) {
-      const struct prog_instruction *fpi = &prog->Instructions[insn];
-      base_ir = fpi;
-
-      fs_reg dst;
-      fs_reg src[3];
-
-      /* We always emit into a temporary destination register to avoid
-       * aliasing issues.
-       */
-      dst = vgrf(glsl_type::vec4_type);
-
-      for (int i = 0; i < 3; i++)
-         src[i] = get_fp_src_reg(&fpi->SrcReg[i]);
-
-      switch (fpi->Opcode) {
-      case OPCODE_ABS:
-         src[0].abs = true;
-         src[0].negate = false;
-         emit_fp_alu1(BRW_OPCODE_MOV, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_ADD:
-         emit_fp_alu2(BRW_OPCODE_ADD, fpi, dst, src[0], src[1]);
-         break;
-
-      case OPCODE_CMP:
-         for (int i = 0; i < 4; i++) {
-            if (fpi->DstReg.WriteMask & (1 << i)) {
-               fs_inst *inst;
-
-               emit(CMP(reg_null_f, offset(src[0], i), fs_reg(0.0f),
-                        BRW_CONDITIONAL_L));
-
-               inst = emit(BRW_OPCODE_SEL, offset(dst, i),
-                           offset(src[1], i), offset(src[2], i));
-               inst->predicate = BRW_PREDICATE_NORMAL;
-            }
-         }
-         break;
-
-      case OPCODE_COS:
-         emit_fp_scalar_math(SHADER_OPCODE_COS, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_DP2:
-      case OPCODE_DP3:
-      case OPCODE_DP4:
-      case OPCODE_DPH: {
-         fs_reg mul = vgrf(glsl_type::float_type);
-         fs_reg acc = vgrf(glsl_type::float_type);
-         int count;
-
-         switch (fpi->Opcode) {
-         case OPCODE_DP2: count = 2; break;
-         case OPCODE_DP3: count = 3; break;
-         case OPCODE_DP4: count = 4; break;
-         case OPCODE_DPH: count = 3; break;
-         default: unreachable("not reached");
-         }
-
-         emit(MUL(acc, offset(src[0], 0), offset(src[1], 0)));
-         for (int i = 1; i < count; i++) {
-            emit(MUL(mul, offset(src[0], i), offset(src[1], i)));
-            emit(ADD(acc, acc, mul));
-         }
-
-         if (fpi->Opcode == OPCODE_DPH)
-            emit(ADD(acc, acc, offset(src[1], 3)));
-
-         emit_fp_scalar_write(fpi, dst, acc);
-         break;
-      }
-
-      case OPCODE_DST:
-         if (fpi->DstReg.WriteMask & WRITEMASK_X)
-            emit(MOV(dst, fs_reg(1.0f)));
-         if (fpi->DstReg.WriteMask & WRITEMASK_Y) {
-            emit(MUL(offset(dst, 1),
-                     offset(src[0], 1), offset(src[1], 1)));
-         }
-         if (fpi->DstReg.WriteMask & WRITEMASK_Z)
-            emit(MOV(offset(dst, 2), offset(src[0], 2)));
-         if (fpi->DstReg.WriteMask & WRITEMASK_W)
-            emit(MOV(offset(dst, 3), offset(src[1], 3)));
-         break;
-
-      case OPCODE_EX2:
-         emit_fp_scalar_math(SHADER_OPCODE_EXP2, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_FLR:
-         emit_fp_alu1(BRW_OPCODE_RNDD, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_FRC:
-         emit_fp_alu1(BRW_OPCODE_FRC, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_KIL: {
-         for (int i = 0; i < 4; i++) {
-            /* In most cases the argument to a KIL will be something like
-             * TEMP[0].wwww, so there's no point in checking whether .w is < 0
-             * 4 times in a row.
-             */
-            if (i > 0 &&
-                GET_SWZ(fpi->SrcReg[0].Swizzle, i) ==
-                GET_SWZ(fpi->SrcReg[0].Swizzle, i - 1) &&
-                ((fpi->SrcReg[0].Negate >> i) & 1) ==
-                ((fpi->SrcReg[0].Negate >> (i - 1)) & 1)) {
-               continue;
-            }
-
-
-            /* Emit an instruction that's predicated on the current
-             * undiscarded pixels, and updates just those pixels to be
-             * turned off.
-             */
-            fs_inst *cmp = emit(CMP(reg_null_f, offset(src[0], i),
-                                    fs_reg(0.0f), BRW_CONDITIONAL_GE));
-            cmp->predicate = BRW_PREDICATE_NORMAL;
-            cmp->flag_subreg = 1;
-
-            if (devinfo->gen >= 6)
-               emit_discard_jump();
-         }
-         break;
-      }
-
-      case OPCODE_LG2:
-         emit_fp_scalar_math(SHADER_OPCODE_LOG2, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_LIT:
-         /* From the ARB_fragment_program spec:
-          *
-          *      tmp = VectorLoad(op0);
-          *      if (tmp.x < 0) tmp.x = 0;
-          *      if (tmp.y < 0) tmp.y = 0;
-          *      if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
-          *      else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
-          *      result.x = 1.0;
-          *      result.y = tmp.x;
-          *      result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
-          *      result.w = 1.0;
-          *
-          * Note that we don't do the clamping to +/- 128.  We didn't in
-          * brw_wm_emit.c either.
-          */
-         if (fpi->DstReg.WriteMask & WRITEMASK_X)
-            emit(MOV(offset(dst, 0), fs_reg(1.0f)));
-
-         if (fpi->DstReg.WriteMask & WRITEMASK_YZ) {
-            fs_inst *inst;
-            emit(CMP(reg_null_f, offset(src[0], 0), fs_reg(0.0f),
-                     BRW_CONDITIONAL_LE));
-
-            if (fpi->DstReg.WriteMask & WRITEMASK_Y) {
-               emit(MOV(offset(dst, 1), offset(src[0], 0)));
-               inst = emit(MOV(offset(dst, 1), fs_reg(0.0f)));
-               inst->predicate = BRW_PREDICATE_NORMAL;
-            }
-
-            if (fpi->DstReg.WriteMask & WRITEMASK_Z) {
-               emit_math(SHADER_OPCODE_POW, offset(dst, 2),
-                         offset(src[0], 1), offset(src[0], 3));
-
-               inst = emit(MOV(offset(dst, 2), fs_reg(0.0f)));
-               inst->predicate = BRW_PREDICATE_NORMAL;
-            }
-         }
-
-         if (fpi->DstReg.WriteMask & WRITEMASK_W)
-            emit(MOV(offset(dst, 3), fs_reg(1.0f)));
-
-         break;
-
-      case OPCODE_LRP:
-         for (int i = 0; i < 4; i++) {
-            if (fpi->DstReg.WriteMask & (1 << i)) {
-               fs_reg a = offset(src[0], i);
-               fs_reg y = offset(src[1], i);
-               fs_reg x = offset(src[2], i);
-               emit_lrp(offset(dst, i), x, y, a);
-            }
-         }
-         break;
-
-      case OPCODE_MAD:
-         for (int i = 0; i < 4; i++) {
-            if (fpi->DstReg.WriteMask & (1 << i)) {
-               if (devinfo->gen >= 6) {
-                  emit(MAD(offset(dst, i), offset(src[2], i),
-                           offset(src[1], i), offset(src[0], i)));
-               } else {
-                  fs_reg temp = vgrf(glsl_type::float_type);
-                  emit(MUL(temp, offset(src[0], i), offset(src[1], i)));
-                  emit(ADD(offset(dst, i), temp, offset(src[2], i)));
-               }
-            }
-         }
-         break;
-
-      case OPCODE_MAX:
-         emit_fp_minmax(fpi, dst, src[0], src[1]);
-         break;
-
-      case OPCODE_MOV:
-         emit_fp_alu1(BRW_OPCODE_MOV, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_MIN:
-         emit_fp_minmax(fpi, dst, src[0], src[1]);
-         break;
-
-      case OPCODE_MUL:
-         emit_fp_alu2(BRW_OPCODE_MUL, fpi, dst, src[0], src[1]);
-         break;
-
-      case OPCODE_POW: {
-         fs_reg temp = vgrf(glsl_type::float_type);
-         emit_math(SHADER_OPCODE_POW, temp, src[0], src[1]);
-         emit_fp_scalar_write(fpi, dst, temp);
-         break;
-      }
-
-      case OPCODE_RCP:
-         emit_fp_scalar_math(SHADER_OPCODE_RCP, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_RSQ:
-         emit_fp_scalar_math(SHADER_OPCODE_RSQ, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_SCS:
-         if (fpi->DstReg.WriteMask & WRITEMASK_X) {
-            emit_math(SHADER_OPCODE_COS, offset(dst, 0),
-                      offset(src[0], 0));
-         }
-
-         if (fpi->DstReg.WriteMask & WRITEMASK_Y) {
-            emit_math(SHADER_OPCODE_SIN, offset(dst, 1),
-                      offset(src[0], 1));
-         }
-         break;
-
-      case OPCODE_SGE:
-         emit_fp_sop(BRW_CONDITIONAL_GE, fpi, dst, src[0], src[1], one);
-         break;
-
-      case OPCODE_SIN:
-         emit_fp_scalar_math(SHADER_OPCODE_SIN, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_SLT:
-         emit_fp_sop(BRW_CONDITIONAL_L, fpi, dst, src[0], src[1], one);
-         break;
-
-      case OPCODE_SUB: {
-         fs_reg neg_src1 = src[1];
-         neg_src1.negate = !src[1].negate;
-
-         emit_fp_alu2(BRW_OPCODE_ADD, fpi, dst, src[0], neg_src1);
-         break;
-      }
-
-      case OPCODE_TEX:
-      case OPCODE_TXB:
-      case OPCODE_TXP: {
-         ir_texture_opcode op;
-         fs_reg lod;
-         fs_reg dpdy;
-         fs_reg coordinate = src[0];
-         fs_reg shadow_c;
-         fs_reg sample_index;
-         fs_reg texel_offset; /* No offsets; leave as BAD_FILE. */
-
-         switch (fpi->Opcode) {
-         case OPCODE_TEX:
-            op = ir_tex;
-            break;
-         case OPCODE_TXP: {
-            op = ir_tex;
-
-            coordinate = vgrf(glsl_type::vec3_type);
-            fs_reg invproj = vgrf(glsl_type::float_type);
-            emit_math(SHADER_OPCODE_RCP, invproj, offset(src[0], 3));
-            for (int i = 0; i < 3; i++) {
-               emit(MUL(offset(coordinate, i),
-                        offset(src[0], i), invproj));
-            }
-            break;
-         }
-         case OPCODE_TXB:
-            op = ir_txb;
-            lod = offset(src[0], 3);
-            break;
-         default:
-            unreachable("not reached");
-         }
-
-         int coord_components;
-         switch (fpi->TexSrcTarget) {
-         case TEXTURE_1D_INDEX:
-            coord_components = 1;
-            break;
-
-         case TEXTURE_2D_INDEX:
-         case TEXTURE_1D_ARRAY_INDEX:
-         case TEXTURE_RECT_INDEX:
-         case TEXTURE_EXTERNAL_INDEX:
-            coord_components = 2;
-            break;
-
-         case TEXTURE_3D_INDEX:
-         case TEXTURE_2D_ARRAY_INDEX:
-            coord_components = 3;
-            break;
-
-         case TEXTURE_CUBE_INDEX: {
-            coord_components = 3;
-
-            fs_reg temp = vgrf(glsl_type::float_type);
-            fs_reg cubecoord = vgrf(glsl_type::vec3_type);
-            fs_reg abscoord = coordinate;
-            abscoord.negate = false;
-            abscoord.abs = true;
-            emit_minmax(BRW_CONDITIONAL_GE, temp,
-                        offset(abscoord, 0), offset(abscoord, 1));
-            emit_minmax(BRW_CONDITIONAL_GE, temp,
-                        temp, offset(abscoord, 2));
-            emit_math(SHADER_OPCODE_RCP, temp, temp);
-            for (int i = 0; i < 3; i++) {
-               emit(MUL(offset(cubecoord, i),
-                        offset(coordinate, i), temp));
-            }
-
-            coordinate = cubecoord;
-            break;
-         }
-
-         default:
-            unreachable("not reached");
-         }
-
-         if (fpi->TexShadow)
-            shadow_c = offset(coordinate, 2);
-
-         emit_texture(op, glsl_type::vec4_type, coordinate, coord_components,
-                      shadow_c, lod, dpdy, 0, sample_index,
-                      reg_undef, /* offset */
-                      reg_undef, /* mcs */
-                      0, /* gather component */
-                      false, /* is cube array */
-                      fpi->TexSrcTarget == TEXTURE_RECT_INDEX,
-                      fpi->TexSrcUnit, fs_reg(fpi->TexSrcUnit),
-                      fpi->TexSrcUnit);
-         dst = this->result;
-
-         break;
-      }
-
-      case OPCODE_SWZ:
-         /* Note that SWZ's extended swizzles are handled in the general
-          * get_src_reg() code.
-          */
-         emit_fp_alu1(BRW_OPCODE_MOV, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_XPD:
-         for (int i = 0; i < 3; i++) {
-            if (fpi->DstReg.WriteMask & (1 << i)) {
-               int i1 = (i + 1) % 3;
-               int i2 = (i + 2) % 3;
-
-               fs_reg temp = vgrf(glsl_type::float_type);
-               fs_reg neg_src1_1 = offset(src[1], i1);
-               neg_src1_1.negate = !neg_src1_1.negate;
-               emit(MUL(temp, offset(src[0], i2), neg_src1_1));
-               emit(MUL(offset(dst, i),
-                        offset(src[0], i1), offset(src[1], i2)));
-               emit(ADD(offset(dst, i), offset(dst, i), temp));
-            }
-         }
-         break;
-
-      case OPCODE_END:
-         break;
-
-      default:
-         _mesa_problem(ctx, "Unsupported opcode %s in fragment program\n",
-                       _mesa_opcode_string(fpi->Opcode));
-      }
-
-      /* To handle saturates, we emit a MOV with a saturate bit, which
-       * optimization should fold into the preceding instructions when safe.
-       */
-      if (_mesa_num_inst_dst_regs(fpi->Opcode) != 0) {
-         fs_reg real_dst = get_fp_dst_reg(&fpi->DstReg);
-
-         for (int i = 0; i < 4; i++) {
-            if (fpi->DstReg.WriteMask & (1 << i)) {
-               fs_inst *inst = emit(MOV(offset(real_dst, i),
-                                        offset(dst, i)));
-               inst->saturate = fpi->SaturateMode;
-            }
-         }
-      }
-   }
-
-   /* Epilogue:
-    *
-    * Fragment depth has this strange convention of being the .z component of
-    * a vec4.  emit_fb_write() wants to see a float value, instead.
-    */
-   this->current_annotation = "result.depth write";
-   if (frag_depth.file != BAD_FILE) {
-      fs_reg temp = vgrf(glsl_type::float_type);
-      emit(MOV(temp, offset(frag_depth, 2)));
-      frag_depth = temp;
-   }
-}
-
-void
-fs_visitor::setup_fp_regs()
-{
-   /* PROGRAM_TEMPORARY */
-   int num_temp = prog->NumTemporaries;
-   fp_temp_regs = rzalloc_array(mem_ctx, fs_reg, num_temp);
-   for (int i = 0; i < num_temp; i++)
-      fp_temp_regs[i] = vgrf(glsl_type::vec4_type);
-
-   /* PROGRAM_STATE_VAR etc. */
-   if (dispatch_width == 8) {
-      for (unsigned p = 0;
-           p < prog->Parameters->NumParameters; p++) {
-         for (unsigned int i = 0; i < 4; i++) {
-            stage_prog_data->param[uniforms++] =
-               &prog->Parameters->ParameterValues[p][i];
-         }
-      }
-   }
-
-   fp_input_regs = rzalloc_array(mem_ctx, fs_reg, VARYING_SLOT_MAX);
-   for (int i = 0; i < VARYING_SLOT_MAX; i++) {
-      if (prog->InputsRead & BITFIELD64_BIT(i)) {
-         this->current_annotation = ralloc_asprintf(ctx, "interpolate input %d",
-                                                    i);
-
-         switch (i) {
-         case VARYING_SLOT_POS:
-            {
-               assert(stage == MESA_SHADER_FRAGMENT);
-               gl_fragment_program *fp = (gl_fragment_program*) prog;
-               fp_input_regs[i] =
-                  *emit_fragcoord_interpolation(fp->PixelCenterInteger,
-                                                fp->OriginUpperLeft);
-            }
-            break;
-         case VARYING_SLOT_FACE:
-            fp_input_regs[i] = *emit_frontfacing_interpolation();
-            break;
-         default:
-            fp_input_regs[i] = vgrf(glsl_type::vec4_type);
-            emit_general_interpolation(fp_input_regs[i], "fp_input",
-                                       glsl_type::vec4_type,
-                                       INTERP_QUALIFIER_NONE,
-                                       i, false, false);
-
-            if (i == VARYING_SLOT_FOGC) {
-               emit(MOV(offset(fp_input_regs[i], 1), fs_reg(0.0f)));
-               emit(MOV(offset(fp_input_regs[i], 2), fs_reg(0.0f)));
-               emit(MOV(offset(fp_input_regs[i], 3), fs_reg(1.0f)));
-            }
-
-            break;
-         }
-
-         this->current_annotation = NULL;
-      }
-   }
-}
-
-fs_reg
-fs_visitor::get_fp_dst_reg(const prog_dst_register *dst)
-{
-   assert(stage == MESA_SHADER_FRAGMENT);
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-
-   switch (dst->File) {
-   case PROGRAM_TEMPORARY:
-      return fp_temp_regs[dst->Index];
-
-   case PROGRAM_OUTPUT:
-      if (dst->Index == FRAG_RESULT_DEPTH) {
-         if (frag_depth.file == BAD_FILE)
-            frag_depth = vgrf(glsl_type::vec4_type);
-         return frag_depth;
-      } else if (dst->Index == FRAG_RESULT_COLOR) {
-         if (outputs[0].file == BAD_FILE) {
-            outputs[0] = vgrf(glsl_type::vec4_type);
-            output_components[0] = 4;
-
-            /* Tell emit_fb_writes() to smear fragment.color across all the
-             * color attachments.
-             */
-            for (int i = 1; i < key->nr_color_regions; i++) {
-               outputs[i] = outputs[0];
-               output_components[i] = output_components[0];
-            }
-         }
-         return outputs[0];
-      } else {
-         int output_index = dst->Index - FRAG_RESULT_DATA0;
-         if (outputs[output_index].file == BAD_FILE) {
-            outputs[output_index] = vgrf(glsl_type::vec4_type);
-         }
-         output_components[output_index] = 4;
-         return outputs[output_index];
-      }
-
-   case PROGRAM_UNDEFINED:
-      return fs_reg();
-
-   default:
-      _mesa_problem(ctx, "bad dst register file: %s\n",
-                    _mesa_register_file_name((gl_register_file)dst->File));
-      return vgrf(glsl_type::vec4_type);
-   }
-}
-
-fs_reg
-fs_visitor::get_fp_src_reg(const prog_src_register *src)
-{
-   struct gl_program_parameter_list *plist = prog->Parameters;
-
-   fs_reg result;
-
-   assert(!src->Abs);
-
-   switch (src->File) {
-   case PROGRAM_UNDEFINED:
-      return fs_reg();
-   case PROGRAM_TEMPORARY:
-      result = fp_temp_regs[src->Index];
-      break;
-
-   case PROGRAM_INPUT:
-      result = fp_input_regs[src->Index];
-      break;
-
-   case PROGRAM_STATE_VAR:
-   case PROGRAM_UNIFORM:
-   case PROGRAM_CONSTANT:
-      /* We actually want to look at the type in the Parameters list for this,
-       * because this lets us upload constant builtin uniforms, as actual
-       * constants.
-       */
-      switch (plist->Parameters[src->Index].Type) {
-      case PROGRAM_CONSTANT: {
-         result = vgrf(glsl_type::vec4_type);
-
-         for (int i = 0; i < 4; i++) {
-            emit(MOV(offset(result, i),
-                     fs_reg(plist->ParameterValues[src->Index][i].f)));
-         }
-         break;
-      }
-
-      case PROGRAM_STATE_VAR:
-      case PROGRAM_UNIFORM:
-         result = fs_reg(UNIFORM, src->Index * 4);
-         break;
-
-      default:
-         _mesa_problem(ctx, "bad uniform src register file: %s\n",
-                       _mesa_register_file_name((gl_register_file)src->File));
-         return vgrf(glsl_type::vec4_type);
-      }
-      break;
-
-   default:
-      _mesa_problem(ctx, "bad src register file: %s\n",
-                    _mesa_register_file_name((gl_register_file)src->File));
-      return vgrf(glsl_type::vec4_type);
-   }
-
-   if (src->Swizzle != SWIZZLE_NOOP || src->Negate) {
-      fs_reg unswizzled = result;
-      result = vgrf(glsl_type::vec4_type);
-      for (int i = 0; i < 4; i++) {
-         bool negate = src->Negate & (1 << i);
-         /* The ZERO, ONE, and Negate options are only used for OPCODE_SWZ,
-          * but it costs us nothing to support it.
-          */
-         int src_swiz = GET_SWZ(src->Swizzle, i);
-         if (src_swiz == SWIZZLE_ZERO) {
-            emit(MOV(offset(result, i), fs_reg(0.0f)));
-         } else if (src_swiz == SWIZZLE_ONE) {
-            emit(MOV(offset(result, i),
-                     negate ? fs_reg(-1.0f) : fs_reg(1.0f)));
-         } else {
-            fs_reg src = offset(unswizzled, src_swiz);
-            if (negate)
-               src.negate = !src.negate;
-            emit(MOV(offset(result, i), src));
-         }
-      }
-   }
-
-   return result;
-}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index a99b7f75b26..2ed0bac6fd9 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -121,7 +121,7 @@ brw_reg_from_fs_reg(fs_reg *reg)
    return brw_reg;
 }
 
-fs_generator::fs_generator(struct brw_context *brw,
+fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
                            void *mem_ctx,
                            const void *key,
                            struct brw_stage_prog_data *prog_data,
@@ -130,7 +130,8 @@ fs_generator::fs_generator(struct brw_context *brw,
                            bool runtime_check_aads_emit,
                            const char *stage_abbrev)
 
-   : brw(brw), devinfo(brw->intelScreen->devinfo), key(key),
+   : compiler(compiler), log_data(log_data),
+     devinfo(compiler->devinfo), key(key),
      prog_data(prog_data),
      prog(prog), promoted_constants(promoted_constants),
      runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
@@ -401,6 +402,13 @@ fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
 }
 
 void
+fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
+{
+   brw_barrier(p, src);
+   brw_WAIT(p);
+}
+
+void
 fs_generator::generate_blorp_fb_write(fs_inst *inst)
 {
    brw_fb_WRITE(p,
@@ -779,27 +787,19 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
       brw_mark_surface_used(prog_data, sampler + base_binding_table_index);
    } else {
       /* Non-const sampler index */
-      /* Note: this clobbers `dst` as a temporary before emitting the send */
 
       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
-      struct brw_reg temp = vec1(retype(dst, BRW_REGISTER_TYPE_UD));
-
       struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
 
       brw_push_insn_state(p);
       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
       brw_set_default_access_mode(p, BRW_ALIGN_1);
 
-      /* Some care required: `sampler` and `temp` may alias:
-       *    addr = sampler & 0xff
-       *    temp = (sampler << 8) & 0xf00
-       *    addr = addr | temp
-       */
-      brw_ADD(p, addr, sampler_reg, brw_imm_ud(base_binding_table_index));
-      brw_SHL(p, temp, sampler_reg, brw_imm_ud(8u));
-      brw_AND(p, temp, temp, brw_imm_ud(0x0f00));
-      brw_AND(p, addr, addr, brw_imm_ud(0x0ff));
-      brw_OR(p, addr, addr, temp);
+      /* addr = ((sampler * 0x101) + base_binding_table_index) & 0xfff */
+      brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
+      if (base_binding_table_index)
+         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
+      brw_AND(p, addr, addr, brw_imm_ud(0xfff));
 
       brw_pop_insn_state(p);
 
@@ -941,6 +941,7 @@ fs_generator::generate_ddy(enum opcode opcode,
       brw_push_insn_state(p);
       brw_set_default_access_mode(p, BRW_ALIGN_16);
       if (unroll_to_simd8) {
+         brw_set_default_exec_size(p, BRW_EXECUTE_8);
          brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
          if (negate_value) {
             brw_ADD(p, firsthalf(dst), firsthalf(src1), negate(firsthalf(src0)));
@@ -1600,10 +1601,13 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          break;
       case 16:
       case 32:
-         if (type_sz(inst->dst.type) < sizeof(float))
-            brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
-         else
+         /* If the instruction writes to more than one register, it needs to
+          * be a "compressed" instruction on Gen <= 5.
+          */
+         if (inst->exec_size * inst->dst.stride * type_sz(inst->dst.type) > 32)
             brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+         else
+            brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
          break;
       default:
          unreachable("Invalid instruction width");
@@ -2121,6 +2125,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          generate_cs_terminate(inst, src[0]);
          break;
 
+      case SHADER_OPCODE_BARRIER:
+	 generate_barrier(inst, src[0]);
+	 break;
+
       default:
          unreachable("Unsupported opcode");
 
@@ -2166,15 +2174,13 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
       ralloc_free(annotation.ann);
    }
 
-   static GLuint msg_id = 0;
-   _mesa_gl_debug(&brw->ctx, &msg_id,
-                  MESA_DEBUG_SOURCE_SHADER_COMPILER,
-                  MESA_DEBUG_TYPE_OTHER,
-                  MESA_DEBUG_SEVERITY_NOTIFICATION,
-                  "%s SIMD%d shader: %d inst, %d loops, %d:%d spills:fills, "
-                  "Promoted %u constants, compacted %d to %d bytes.\n",
-                  stage_abbrev, dispatch_width, before_size / 16, loop_count,
-                  spill_count, fill_count, promoted_constants, before_size, after_size);
+   compiler->shader_debug_log(log_data,
+                              "%s SIMD%d shader: %d inst, %d loops, "
+                              "%d:%d spills:fills, Promoted %u constants, "
+                              "compacted %d to %d bytes.\n",
+                              stage_abbrev, dispatch_width, before_size / 16,
+                              loop_count, spill_count, fill_count,
+                              promoted_constants, before_size, after_size);
 
    return start_offset;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 270131a73d1..a378019af5b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -28,6 +28,8 @@
 #include "brw_fs.h"
 #include "brw_nir.h"
 
+using namespace brw;
+
 void
 fs_visitor::emit_nir_code()
 {
@@ -38,12 +40,12 @@ fs_visitor::emit_nir_code()
     */
 
    if (nir->num_inputs > 0) {
-      nir_inputs = vgrf(nir->num_inputs);
+      nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs);
       nir_setup_inputs(nir);
    }
 
    if (nir->num_outputs > 0) {
-      nir_outputs = vgrf(nir->num_outputs);
+      nir_outputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_outputs);
       nir_setup_outputs(nir);
    }
 
@@ -58,7 +60,7 @@ fs_visitor::emit_nir_code()
       unsigned array_elems =
          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
       unsigned size = array_elems * reg->num_components;
-      nir_globals[reg->index] = vgrf(size);
+      nir_globals[reg->index] = bld.vgrf(BRW_REGISTER_TYPE_F, size);
    }
 
    /* get the main function and emit it */
@@ -93,8 +95,8 @@ fs_visitor::nir_setup_inputs(nir_shader *shader)
          unsigned array_length = var->type->is_array() ? var->type->length : 1;
          for (unsigned i = 0; i < array_length; i++) {
             for (unsigned j = 0; j < components; j++) {
-               emit(MOV(retype(offset(input, components * i + j), type),
-                        offset(fs_reg(ATTR, var->data.location + i, type), j)));
+               bld.MOV(retype(offset(input, components * i + j), type),
+                       offset(fs_reg(ATTR, var->data.location + i, type), j));
             }
          }
          break;
@@ -107,7 +109,7 @@ fs_visitor::nir_setup_inputs(nir_shader *shader)
          if (var->data.location == VARYING_SLOT_POS) {
             reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer,
                                                 var->data.origin_upper_left);
-            emit_percomp(MOV(input, reg), 0xF);
+            emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, input, reg), 0xF);
          } else {
             emit_general_interpolation(input, var->name, var->type,
                                        (glsl_interp_qualifier) var->data.interpolation,
@@ -218,9 +220,12 @@ fs_visitor::nir_setup_uniform(nir_variable *var)
       * our name.
       */
    unsigned index = var->data.driver_location;
-   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
+   for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 
+      if (storage->builtin)
+              continue;
+
       if (strncmp(var->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
          storage->name[namelen] != '.' &&
@@ -358,7 +363,7 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl)
       unsigned array_elems =
          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
       unsigned size = array_elems * reg->num_components;
-      nir_locals[reg->index] = vgrf(size);
+      nir_locals[reg->index] = bld.vgrf(BRW_REGISTER_TYPE_F, size);
    }
 
    nir_emit_cf_list(&impl->body);
@@ -392,21 +397,21 @@ void
 fs_visitor::nir_emit_if(nir_if *if_stmt)
 {
    /* first, put the condition into f0 */
-   fs_inst *inst = emit(MOV(reg_null_d,
+   fs_inst *inst = bld.MOV(bld.null_reg_d(),
                             retype(get_nir_src(if_stmt->condition),
-                                   BRW_REGISTER_TYPE_D)));
+                                   BRW_REGISTER_TYPE_D));
    inst->conditional_mod = BRW_CONDITIONAL_NZ;
 
-   emit(IF(BRW_PREDICATE_NORMAL));
+   bld.IF(BRW_PREDICATE_NORMAL);
 
    nir_emit_cf_list(&if_stmt->then_list);
 
    /* note: if the else is empty, dead CF elimination will remove it */
-   emit(BRW_OPCODE_ELSE);
+   bld.emit(BRW_OPCODE_ELSE);
 
    nir_emit_cf_list(&if_stmt->else_list);
 
-   emit(BRW_OPCODE_ENDIF);
+   bld.emit(BRW_OPCODE_ENDIF);
 
    if (!try_replace_with_sel() && devinfo->gen < 6) {
       no16("Can't support (non-uniform) control flow on SIMD16\n");
@@ -420,11 +425,11 @@ fs_visitor::nir_emit_loop(nir_loop *loop)
       no16("Can't support (non-uniform) control flow on SIMD16\n");
    }
 
-   emit(BRW_OPCODE_DO);
+   bld.emit(BRW_OPCODE_DO);
 
    nir_emit_cf_list(&loop->body);
 
-   emit(BRW_OPCODE_WHILE);
+   bld.emit(BRW_OPCODE_WHILE);
 }
 
 void
@@ -438,19 +443,19 @@ fs_visitor::nir_emit_block(nir_block *block)
 void
 fs_visitor::nir_emit_instr(nir_instr *instr)
 {
-   this->base_ir = instr;
+   const fs_builder abld = bld.annotate(NULL, instr);
 
    switch (instr->type) {
    case nir_instr_type_alu:
-      nir_emit_alu(nir_instr_as_alu(instr));
+      nir_emit_alu(abld, nir_instr_as_alu(instr));
       break;
 
    case nir_instr_type_intrinsic:
-      nir_emit_intrinsic(nir_instr_as_intrinsic(instr));
+      nir_emit_intrinsic(abld, nir_instr_as_intrinsic(instr));
       break;
 
    case nir_instr_type_tex:
-      nir_emit_texture(nir_instr_as_tex(instr));
+      nir_emit_texture(abld, nir_instr_as_tex(instr));
       break;
 
    case nir_instr_type_load_const:
@@ -460,14 +465,12 @@ fs_visitor::nir_emit_instr(nir_instr *instr)
       break;
 
    case nir_instr_type_jump:
-      nir_emit_jump(nir_instr_as_jump(instr));
+      nir_emit_jump(abld, nir_instr_as_jump(instr));
       break;
 
    default:
       unreachable("unknown instruction type");
    }
-
-   this->base_ir = NULL;
 }
 
 static brw_reg_type
@@ -540,7 +543,7 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
       tmp.subreg_offset = 2;
       tmp.stride = 2;
 
-      fs_inst *or_inst = emit(OR(tmp, g0, fs_reg(0x3f80)));
+      fs_inst *or_inst = bld.OR(tmp, g0, fs_reg(0x3f80));
       or_inst->src[1].type = BRW_REGISTER_TYPE_UW;
 
       tmp.type = BRW_REGISTER_TYPE_D;
@@ -565,15 +568,15 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
          g1_6.negate = true;
       }
 
-      emit(OR(tmp, g1_6, fs_reg(0x3f800000)));
+      bld.OR(tmp, g1_6, fs_reg(0x3f800000));
    }
-   emit(AND(retype(result, BRW_REGISTER_TYPE_D), tmp, fs_reg(0xbf800000)));
+   bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, fs_reg(0xbf800000));
 
    return true;
 }
 
 void
-fs_visitor::nir_emit_alu(nir_alu_instr *instr)
+fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
 {
    struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
    fs_inst *inst;
@@ -605,7 +608,7 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
          if (!instr->src[i].src.is_ssa &&
              instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
             need_extra_copy = true;
-            temp = retype(vgrf(4), result.type);
+            temp = bld.vgrf(result.type, 4);
             break;
          }
       }
@@ -615,11 +618,11 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
             continue;
 
          if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
-            inst = emit(MOV(offset(temp, i),
-                        offset(op[0], instr->src[0].swizzle[i])));
+            inst = bld.MOV(offset(temp, i),
+                           offset(op[0], instr->src[0].swizzle[i]));
          } else {
-            inst = emit(MOV(offset(temp, i),
-                        offset(op[i], instr->src[i].swizzle[0])));
+            inst = bld.MOV(offset(temp, i),
+                           offset(op[i], instr->src[i].swizzle[0]));
          }
          inst->saturate = instr->dest.saturate;
       }
@@ -633,7 +636,7 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
             if (!(instr->dest.write_mask & (1 << i)))
                continue;
 
-            emit(MOV(offset(result, i), offset(temp, i)));
+            bld.MOV(offset(result, i), offset(temp, i));
          }
       }
       return;
@@ -665,13 +668,13 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
    switch (instr->op) {
    case nir_op_i2f:
    case nir_op_u2f:
-      inst = emit(MOV(result, op[0]));
+      inst = bld.MOV(result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_f2i:
    case nir_op_f2u:
-      emit(MOV(result, op[0]));
+      bld.MOV(result, op[0]);
       break;
 
    case nir_op_fsign: {
@@ -680,17 +683,17 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
          * zero.
          */
-      emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
+      bld.CMP(bld.null_reg_f(), op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ);
 
       fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
       op[0].type = BRW_REGISTER_TYPE_UD;
       result.type = BRW_REGISTER_TYPE_UD;
-      emit(AND(result_int, op[0], fs_reg(0x80000000u)));
+      bld.AND(result_int, op[0], fs_reg(0x80000000u));
 
-      inst = emit(OR(result_int, result_int, fs_reg(0x3f800000u)));
+      inst = bld.OR(result_int, result_int, fs_reg(0x3f800000u));
       inst->predicate = BRW_PREDICATE_NORMAL;
       if (instr->dest.saturate) {
-         inst = emit(MOV(result, result));
+         inst = bld.MOV(result, result);
          inst->saturate = true;
       }
       break;
@@ -701,120 +704,88 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
        *               -> non-negative val generates 0x00000000.
        *  Predicated OR sets 1 if val is positive.
        */
-      emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));
-      emit(ASR(result, op[0], fs_reg(31)));
-      inst = emit(OR(result, result, fs_reg(1)));
+      bld.CMP(bld.null_reg_d(), op[0], fs_reg(0), BRW_CONDITIONAL_G);
+      bld.ASR(result, op[0], fs_reg(31));
+      inst = bld.OR(result, result, fs_reg(1));
       inst->predicate = BRW_PREDICATE_NORMAL;
       break;
 
    case nir_op_frcp:
-      inst = emit_math(SHADER_OPCODE_RCP, result, op[0]);
+      inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fexp2:
-      inst = emit_math(SHADER_OPCODE_EXP2, result, op[0]);
+      inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_flog2:
-      inst = emit_math(SHADER_OPCODE_LOG2, result, op[0]);
+      inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fsin:
-      inst = emit_math(SHADER_OPCODE_SIN, result, op[0]);
+      inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fcos:
-      inst = emit_math(SHADER_OPCODE_COS, result, op[0]);
+      inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fddx:
       if (fs_key->high_quality_derivatives) {
-         inst = emit(FS_OPCODE_DDX_FINE, result, op[0]);
+         inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
       } else {
-         inst = emit(FS_OPCODE_DDX_COARSE, result, op[0]);
+         inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
       }
       inst->saturate = instr->dest.saturate;
       break;
    case nir_op_fddx_fine:
-      inst = emit(FS_OPCODE_DDX_FINE, result, op[0]);
+      inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
    case nir_op_fddx_coarse:
-      inst = emit(FS_OPCODE_DDX_COARSE, result, op[0]);
+      inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
    case nir_op_fddy:
       if (fs_key->high_quality_derivatives) {
-         inst = emit(FS_OPCODE_DDY_FINE, result, op[0],
-                     fs_reg(fs_key->render_to_fbo));
+         inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0],
+                         fs_reg(fs_key->render_to_fbo));
       } else {
-         inst = emit(FS_OPCODE_DDY_COARSE, result, op[0],
-                     fs_reg(fs_key->render_to_fbo));
+         inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0],
+                         fs_reg(fs_key->render_to_fbo));
       }
       inst->saturate = instr->dest.saturate;
       break;
    case nir_op_fddy_fine:
-      inst = emit(FS_OPCODE_DDY_FINE, result, op[0],
-                  fs_reg(fs_key->render_to_fbo));
+      inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0],
+                      fs_reg(fs_key->render_to_fbo));
       inst->saturate = instr->dest.saturate;
       break;
    case nir_op_fddy_coarse:
-      inst = emit(FS_OPCODE_DDY_COARSE, result, op[0],
-                  fs_reg(fs_key->render_to_fbo));
+      inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0],
+                      fs_reg(fs_key->render_to_fbo));
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fadd:
    case nir_op_iadd:
-      inst = emit(ADD(result, op[0], op[1]));
+      inst = bld.ADD(result, op[0], op[1]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fmul:
-      inst = emit(MUL(result, op[0], op[1]));
+      inst = bld.MUL(result, op[0], op[1]);
       inst->saturate = instr->dest.saturate;
       break;
 
-   case nir_op_imul: {
-      if (devinfo->gen >= 8) {
-         emit(MUL(result, op[0], op[1]));
-         break;
-      } else {
-         nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
-         nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
-
-         if (value0 && value0->u[0] < (1 << 16)) {
-            if (devinfo->gen < 7) {
-               emit(MUL(result, op[0], op[1]));
-            } else {
-               emit(MUL(result, op[1], op[0]));
-            }
-            break;
-         } else if (value1 && value1->u[0] < (1 << 16)) {
-            if (devinfo->gen < 7) {
-               emit(MUL(result, op[1], op[0]));
-            } else {
-               emit(MUL(result, op[0], op[1]));
-            }
-            break;
-         }
-      }
-
-      if (devinfo->gen >= 7)
-         no16("SIMD16 explicit accumulator operands unsupported\n");
-
-      struct brw_reg acc = retype(brw_acc_reg(dispatch_width), result.type);
-
-      emit(MUL(acc, op[0], op[1]));
-      emit(MACH(reg_null_d, op[0], op[1]));
-      emit(MOV(result, fs_reg(acc)));
+   case nir_op_imul:
+      bld.MUL(result, op[0], op[1]);
       break;
-   }
 
    case nir_op_imul_high:
    case nir_op_umul_high: {
@@ -823,8 +794,8 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
 
       struct brw_reg acc = retype(brw_acc_reg(dispatch_width), result.type);
 
-      fs_inst *mul = emit(MUL(acc, op[0], op[1]));
-      emit(MACH(result, op[0], op[1]));
+      fs_inst *mul = bld.MUL(acc, op[0], op[1]);
+      bld.MACH(result, op[0], op[1]);
 
       /* Until Gen8, integer multiplies read 32-bits from one source, and
        * 16-bits from the other, and relying on the MACH instruction to
@@ -852,7 +823,7 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
 
    case nir_op_idiv:
    case nir_op_udiv:
-      emit_math(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
+      bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
       break;
 
    case nir_op_uadd_carry: {
@@ -862,8 +833,8 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
       struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
                                   BRW_REGISTER_TYPE_UD);
 
-      emit(ADDC(reg_null_ud, op[0], op[1]));
-      emit(MOV(result, fs_reg(acc)));
+      bld.ADDC(bld.null_reg_ud(), op[0], op[1]);
+      bld.MOV(result, fs_reg(acc));
       break;
    }
 
@@ -874,63 +845,63 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
       struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
                                   BRW_REGISTER_TYPE_UD);
 
-      emit(SUBB(reg_null_ud, op[0], op[1]));
-      emit(MOV(result, fs_reg(acc)));
+      bld.SUBB(bld.null_reg_ud(), op[0], op[1]);
+      bld.MOV(result, fs_reg(acc));
       break;
    }
 
    case nir_op_umod:
-      emit_math(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
+      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
       break;
 
    case nir_op_flt:
    case nir_op_ilt:
    case nir_op_ult:
-      emit(CMP(result, op[0], op[1], BRW_CONDITIONAL_L));
+      bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_L);
       break;
 
    case nir_op_fge:
    case nir_op_ige:
    case nir_op_uge:
-      emit(CMP(result, op[0], op[1], BRW_CONDITIONAL_GE));
+      bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_GE);
       break;
 
    case nir_op_feq:
    case nir_op_ieq:
-      emit(CMP(result, op[0], op[1], BRW_CONDITIONAL_Z));
+      bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_Z);
       break;
 
    case nir_op_fne:
    case nir_op_ine:
-      emit(CMP(result, op[0], op[1], BRW_CONDITIONAL_NZ));
+      bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_NZ);
       break;
 
    case nir_op_inot:
       if (devinfo->gen >= 8) {
          resolve_source_modifiers(&op[0]);
       }
-      emit(NOT(result, op[0]));
+      bld.NOT(result, op[0]);
       break;
    case nir_op_ixor:
       if (devinfo->gen >= 8) {
          resolve_source_modifiers(&op[0]);
          resolve_source_modifiers(&op[1]);
       }
-      emit(XOR(result, op[0], op[1]));
+      bld.XOR(result, op[0], op[1]);
       break;
    case nir_op_ior:
       if (devinfo->gen >= 8) {
          resolve_source_modifiers(&op[0]);
          resolve_source_modifiers(&op[1]);
       }
-      emit(OR(result, op[0], op[1]));
+      bld.OR(result, op[0], op[1]);
       break;
    case nir_op_iand:
       if (devinfo->gen >= 8) {
          resolve_source_modifiers(&op[0]);
          resolve_source_modifiers(&op[1]);
       }
-      emit(AND(result, op[0], op[1]));
+      bld.AND(result, op[0], op[1]);
       break;
 
    case nir_op_fdot2:
@@ -978,53 +949,53 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
       unreachable("not reached: should be handled by ldexp_to_arith()");
 
    case nir_op_fsqrt:
-      inst = emit_math(SHADER_OPCODE_SQRT, result, op[0]);
+      inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_frsq:
-      inst = emit_math(SHADER_OPCODE_RSQ, result, op[0]);
+      inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_b2i:
-      emit(AND(result, op[0], fs_reg(1)));
+      bld.AND(result, op[0], fs_reg(1));
       break;
    case nir_op_b2f:
-      emit(AND(retype(result, BRW_REGISTER_TYPE_UD), op[0], fs_reg(0x3f800000u)));
+      bld.AND(retype(result, BRW_REGISTER_TYPE_UD), op[0], fs_reg(0x3f800000u));
       break;
 
    case nir_op_f2b:
-      emit(CMP(result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
+      bld.CMP(result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ);
       break;
    case nir_op_i2b:
-      emit(CMP(result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
+      bld.CMP(result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ);
       break;
 
    case nir_op_ftrunc:
-      inst = emit(RNDZ(result, op[0]));
+      inst = bld.RNDZ(result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fceil: {
       op[0].negate = !op[0].negate;
       fs_reg temp = vgrf(glsl_type::float_type);
-      emit(RNDD(temp, op[0]));
+      bld.RNDD(temp, op[0]);
       temp.negate = true;
-      inst = emit(MOV(result, temp));
+      inst = bld.MOV(result, temp);
       inst->saturate = instr->dest.saturate;
       break;
    }
    case nir_op_ffloor:
-      inst = emit(RNDD(result, op[0]));
+      inst = bld.RNDD(result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
    case nir_op_ffract:
-      inst = emit(FRC(result, op[0]));
+      inst = bld.FRC(result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
    case nir_op_fround_even:
-      inst = emit(RNDE(result, op[0]));
+      inst = bld.RNDE(result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
@@ -1032,11 +1003,11 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_imin:
    case nir_op_umin:
       if (devinfo->gen >= 6) {
-         inst = emit(BRW_OPCODE_SEL, result, op[0], op[1]);
+         inst = bld.emit(BRW_OPCODE_SEL, result, op[0], op[1]);
          inst->conditional_mod = BRW_CONDITIONAL_L;
       } else {
-         emit(CMP(reg_null_d, op[0], op[1], BRW_CONDITIONAL_L));
-         inst = emit(SEL(result, op[0], op[1]));
+         bld.CMP(bld.null_reg_d(), op[0], op[1], BRW_CONDITIONAL_L);
+         inst = bld.SEL(result, op[0], op[1]);
          inst->predicate = BRW_PREDICATE_NORMAL;
       }
       inst->saturate = instr->dest.saturate;
@@ -1046,11 +1017,11 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_imax:
    case nir_op_umax:
       if (devinfo->gen >= 6) {
-         inst = emit(BRW_OPCODE_SEL, result, op[0], op[1]);
+         inst = bld.emit(BRW_OPCODE_SEL, result, op[0], op[1]);
          inst->conditional_mod = BRW_CONDITIONAL_GE;
       } else {
-         emit(CMP(reg_null_d, op[0], op[1], BRW_CONDITIONAL_GE));
-         inst = emit(SEL(result, op[0], op[1]));
+         bld.CMP(bld.null_reg_d(), op[0], op[1], BRW_CONDITIONAL_GE);
+         inst = bld.SEL(result, op[0], op[1]);
          inst->predicate = BRW_PREDICATE_NORMAL;
       }
       inst->saturate = instr->dest.saturate;
@@ -1069,57 +1040,57 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
       unreachable("not reached: should be handled by lower_packing_builtins");
 
    case nir_op_unpack_half_2x16_split_x:
-      inst = emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]);
+      inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
    case nir_op_unpack_half_2x16_split_y:
-      inst = emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]);
+      inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fpow:
-      inst = emit_math(SHADER_OPCODE_POW, result, op[0], op[1]);
+      inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_bitfield_reverse:
-      emit(BFREV(result, op[0]));
+      bld.BFREV(result, op[0]);
       break;
 
    case nir_op_bit_count:
-      emit(CBIT(result, op[0]));
+      bld.CBIT(result, op[0]);
       break;
 
    case nir_op_ufind_msb:
    case nir_op_ifind_msb: {
-      emit(FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]));
+      bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
 
       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
        * subtract the result from 31 to convert the MSB count into an LSB count.
        */
 
-      emit(CMP(reg_null_d, result, fs_reg(-1), BRW_CONDITIONAL_NZ));
+      bld.CMP(bld.null_reg_d(), result, fs_reg(-1), BRW_CONDITIONAL_NZ);
       fs_reg neg_result(result);
       neg_result.negate = true;
-      inst = emit(ADD(result, neg_result, fs_reg(31)));
+      inst = bld.ADD(result, neg_result, fs_reg(31));
       inst->predicate = BRW_PREDICATE_NORMAL;
       break;
    }
 
    case nir_op_find_lsb:
-      emit(FBL(result, op[0]));
+      bld.FBL(result, op[0]);
       break;
 
    case nir_op_ubitfield_extract:
    case nir_op_ibitfield_extract:
-      emit(BFE(result, op[2], op[1], op[0]));
+      bld.BFE(result, op[2], op[1], op[0]);
       break;
    case nir_op_bfm:
-      emit(BFI1(result, op[0], op[1]));
+      bld.BFI1(result, op[0], op[1]);
       break;
    case nir_op_bfi:
-      emit(BFI2(result, op[0], op[1], op[2]));
+      bld.BFI2(result, op[0], op[1], op[2]);
       break;
 
    case nir_op_bitfield_insert:
@@ -1127,26 +1098,26 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
                   "lower_instructions::bitfield_insert_to_bfm_bfi");
 
    case nir_op_ishl:
-      emit(SHL(result, op[0], op[1]));
+      bld.SHL(result, op[0], op[1]);
       break;
    case nir_op_ishr:
-      emit(ASR(result, op[0], op[1]));
+      bld.ASR(result, op[0], op[1]);
       break;
    case nir_op_ushr:
-      emit(SHR(result, op[0], op[1]));
+      bld.SHR(result, op[0], op[1]);
       break;
 
    case nir_op_pack_half_2x16_split:
-      emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
+      bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
       break;
 
    case nir_op_ffma:
-      inst = emit(MAD(result, op[2], op[1], op[0]));
+      inst = bld.MAD(result, op[2], op[1], op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_flrp:
-      inst = emit_lrp(result, op[0], op[1], op[2]);
+      inst = bld.LRP(result, op[0], op[1], op[2]);
       inst->saturate = instr->dest.saturate;
       break;
 
@@ -1154,8 +1125,8 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
       if (optimize_frontfacing_ternary(instr, result))
          return;
 
-      emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
-      inst = emit(SEL(result, op[1], op[2]));
+      bld.CMP(bld.null_reg_d(), op[0], fs_reg(0), BRW_CONDITIONAL_NZ);
+      inst = bld.SEL(result, op[1], op[2]);
       inst->predicate = BRW_PREDICATE_NORMAL;
       break;
 
@@ -1169,9 +1140,9 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
    if (devinfo->gen <= 5 &&
        (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
       fs_reg masked = vgrf(glsl_type::int_type);
-      emit(AND(masked, result, fs_reg(1)));
+      bld.AND(masked, result, fs_reg(1));
       masked.negate = true;
-      emit(MOV(retype(result, BRW_REGISTER_TYPE_D), masked));
+      bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
    }
 }
 
@@ -1190,8 +1161,8 @@ fs_reg_for_nir_reg(fs_visitor *v, nir_register *nir_reg,
       int multiplier = nir_reg->num_components * (v->dispatch_width / 8);
 
       reg.reladdr = new(v->mem_ctx) fs_reg(v->vgrf(glsl_type::int_type));
-      v->emit(v->MUL(*reg.reladdr, v->get_nir_src(*indirect),
-                     fs_reg(multiplier)));
+      v->bld.MUL(*reg.reladdr, v->get_nir_src(*indirect),
+                 fs_reg(multiplier));
    }
 
    return reg;
@@ -1203,11 +1174,10 @@ fs_visitor::get_nir_src(nir_src src)
    if (src.is_ssa) {
       assert(src.ssa->parent_instr->type == nir_instr_type_load_const);
       nir_load_const_instr *load = nir_instr_as_load_const(src.ssa->parent_instr);
-      fs_reg reg = vgrf(src.ssa->num_components);
-      reg.type = BRW_REGISTER_TYPE_D;
+      fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_D, src.ssa->num_components);
 
       for (unsigned i = 0; i < src.ssa->num_components; ++i)
-         emit(MOV(offset(reg, i), fs_reg(load->value.i[i])));
+         bld.MOV(offset(reg, i), fs_reg(load->value.i[i]));
 
       return reg;
    } else {
@@ -1230,24 +1200,25 @@ fs_visitor::get_nir_dest(nir_dest dest)
 }
 
 void
-fs_visitor::emit_percomp(fs_inst *inst, unsigned wr_mask)
+fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
+                         unsigned wr_mask)
 {
    for (unsigned i = 0; i < 4; i++) {
       if (!((wr_mask >> i) & 1))
          continue;
 
-      fs_inst *new_inst = new(mem_ctx) fs_inst(*inst);
+      fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
       new_inst->dst = offset(new_inst->dst, i);
       for (unsigned j = 0; j < new_inst->sources; j++)
-         if (inst->src[j].file == GRF)
+         if (new_inst->src[j].file == GRF)
             new_inst->src[j] = offset(new_inst->src[j], i);
 
-      emit(new_inst);
+      bld.emit(new_inst);
    }
 }
 
 void
-fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
 {
    fs_reg dest;
    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
@@ -1265,12 +1236,12 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
        */
       fs_inst *cmp;
       if (instr->intrinsic == nir_intrinsic_discard_if) {
-         cmp = emit(CMP(reg_null_f, get_nir_src(instr->src[0]),
-                        fs_reg(0), BRW_CONDITIONAL_Z));
+         cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
+                       fs_reg(0), BRW_CONDITIONAL_Z);
       } else {
          fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
                                        BRW_REGISTER_TYPE_UW));
-         cmp = emit(CMP(reg_null_f, some_reg, some_reg, BRW_CONDITIONAL_NZ));
+         cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
       }
       cmp->predicate = BRW_PREDICATE_NORMAL;
       cmp->flag_subreg = 1;
@@ -1307,8 +1278,8 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    }
 
    case nir_intrinsic_load_front_face:
-      emit(MOV(retype(dest, BRW_REGISTER_TYPE_D),
-               *emit_frontfacing_interpolation()));
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
+              *emit_frontfacing_interpolation());
       break;
 
    case nir_intrinsic_load_vertex_id:
@@ -1318,7 +1289,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       fs_reg vertex_id = nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
       assert(vertex_id.file != BAD_FILE);
       dest.type = vertex_id.type;
-      emit(MOV(dest, vertex_id));
+      bld.MOV(dest, vertex_id);
       break;
    }
 
@@ -1326,7 +1297,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       fs_reg base_vertex = nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
       assert(base_vertex.file != BAD_FILE);
       dest.type = base_vertex.type;
-      emit(MOV(dest, base_vertex));
+      bld.MOV(dest, base_vertex);
       break;
    }
 
@@ -1334,7 +1305,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       fs_reg instance_id = nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
       assert(instance_id.file != BAD_FILE);
       dest.type = instance_id.type;
-      emit(MOV(dest, instance_id));
+      bld.MOV(dest, instance_id);
       break;
    }
 
@@ -1342,7 +1313,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       fs_reg sample_mask_in = nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
       assert(sample_mask_in.file != BAD_FILE);
       dest.type = sample_mask_in.type;
-      emit(MOV(dest, sample_mask_in));
+      bld.MOV(dest, sample_mask_in);
       break;
    }
 
@@ -1350,8 +1321,8 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
       assert(sample_pos.file != BAD_FILE);
       dest.type = sample_pos.type;
-      emit(MOV(dest, sample_pos));
-      emit(MOV(offset(dest, 1), offset(sample_pos, 1)));
+      bld.MOV(dest, sample_pos);
+      bld.MOV(offset(dest, 1), offset(sample_pos, 1));
       break;
    }
 
@@ -1359,7 +1330,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       fs_reg sample_id = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
       assert(sample_id.file != BAD_FILE);
       dest.type = sample_id.type;
-      emit(MOV(dest, sample_id));
+      bld.MOV(dest, sample_id);
       break;
    }
 
@@ -1377,16 +1348,14 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          index -= num_direct_uniforms;
       }
 
-      for (int i = 0; i < instr->const_index[1]; i++) {
-         for (unsigned j = 0; j < instr->num_components; j++) {
-            fs_reg src = offset(retype(uniform_reg, dest.type), index);
-            if (has_indirect)
-               src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
-            index++;
+      for (unsigned j = 0; j < instr->num_components; j++) {
+         fs_reg src = offset(retype(uniform_reg, dest.type), index);
+         if (has_indirect)
+            src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
+         index++;
 
-            emit(MOV(dest, src));
-            dest = offset(dest, 1);
-         }
+         bld.MOV(dest, src);
+         dest = offset(dest, 1);
       }
       break;
    }
@@ -1417,9 +1386,9 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
           * from any live channel.
           */
          surf_index = vgrf(glsl_type::uint_type);
-         emit(ADD(surf_index, get_nir_src(instr->src[0]),
-                  fs_reg(stage_prog_data->binding_table.ubo_start)));
-         emit_uniformize(surf_index, surf_index);
+         bld.ADD(surf_index, get_nir_src(instr->src[0]),
+                 fs_reg(stage_prog_data->binding_table.ubo_start));
+         bld.emit_uniformize(surf_index, surf_index);
 
          /* Assume this may touch any UBO. It would be nice to provide
           * a tighter bound, but the array information is already lowered away.
@@ -1432,21 +1401,21 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       if (has_indirect) {
          /* Turn the byte offset into a dword offset. */
          fs_reg base_offset = vgrf(glsl_type::int_type);
-         emit(SHR(base_offset, retype(get_nir_src(instr->src[1]),
-                                 BRW_REGISTER_TYPE_D),
-                  fs_reg(2)));
+         bld.SHR(base_offset, retype(get_nir_src(instr->src[1]),
+                                     BRW_REGISTER_TYPE_D),
+                 fs_reg(2));
 
          unsigned vec4_offset = instr->const_index[0] / 4;
          for (int i = 0; i < instr->num_components; i++)
-            emit(VARYING_PULL_CONSTANT_LOAD(offset(dest, i), surf_index,
-                                            base_offset, vec4_offset + i));
+            VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, i), surf_index,
+                                       base_offset, vec4_offset + i);
       } else {
          fs_reg packed_consts = vgrf(glsl_type::float_type);
          packed_consts.type = dest.type;
 
          fs_reg const_offset_reg((unsigned) instr->const_index[0] & ~15);
-         emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
-              surf_index, const_offset_reg);
+         bld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
+                  surf_index, const_offset_reg);
 
          for (unsigned i = 0; i < instr->num_components; i++) {
             packed_consts.set_smear(instr->const_index[0] % 16 / 4 + i);
@@ -1456,7 +1425,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
              */
             assert(packed_consts.subreg_offset < 32);
 
-            emit(MOV(dest, packed_consts));
+            bld.MOV(dest, packed_consts);
             dest = offset(dest, 1);
          }
       }
@@ -1468,17 +1437,15 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       /* fallthrough */
    case nir_intrinsic_load_input: {
       unsigned index = 0;
-      for (int i = 0; i < instr->const_index[1]; i++) {
-         for (unsigned j = 0; j < instr->num_components; j++) {
-            fs_reg src = offset(retype(nir_inputs, dest.type),
-                                instr->const_index[0] + index);
-            if (has_indirect)
-               src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
-            index++;
-
-            emit(MOV(dest, src));
-            dest = offset(dest, 1);
-         }
+      for (unsigned j = 0; j < instr->num_components; j++) {
+         fs_reg src = offset(retype(nir_inputs, dest.type),
+                             instr->const_index[0] + index);
+         if (has_indirect)
+            src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
+         index++;
+
+         bld.MOV(dest, src);
+         dest = offset(dest, 1);
       }
       break;
    }
@@ -1510,7 +1477,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
        */
       no16("interpolate_at_* not yet supported in SIMD16 mode.");
 
-      fs_reg dst_xy = vgrf(2);
+      fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
 
       /* For most messages, we need one reg of ignored data; the hardware
        * requires mlen==1 even when there is no payload. in the per-slot
@@ -1522,7 +1489,8 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
       switch (instr->intrinsic) {
       case nir_intrinsic_interp_var_at_centroid:
-         inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_xy, src, fs_reg(0u));
+         inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_CENTROID,
+                         dst_xy, src, fs_reg(0u));
          break;
 
       case nir_intrinsic_interp_var_at_sample: {
@@ -1530,8 +1498,8 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
          assert(const_sample);
          unsigned msg_data = const_sample ? const_sample->i[0] << 4 : 0;
-         inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src,
-                     fs_reg(msg_data));
+         inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src,
+                         fs_reg(msg_data));
          break;
       }
 
@@ -1542,17 +1510,17 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
             unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf;
             unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf;
 
-            inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
-                        fs_reg(off_x | (off_y << 4)));
+            inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
+                            fs_reg(off_x | (off_y << 4)));
          } else {
             src = vgrf(glsl_type::ivec2_type);
             fs_reg offset_src = retype(get_nir_src(instr->src[0]),
                                        BRW_REGISTER_TYPE_F);
             for (int i = 0; i < 2; i++) {
                fs_reg temp = vgrf(glsl_type::float_type);
-               emit(MUL(temp, offset(offset_src, i), fs_reg(16.0f)));
+               bld.MUL(temp, offset(offset_src, i), fs_reg(16.0f));
                fs_reg itemp = vgrf(glsl_type::int_type);
-               emit(MOV(itemp, temp));  /* float to int */
+               bld.MOV(itemp, temp);  /* float to int */
 
                /* Clamp the upper end of the range to +7/16.
                 * ARB_gpu_shader5 requires that we support a maximum offset
@@ -1569,14 +1537,13 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
                 * implementation-dependent constant
                 * FRAGMENT_INTERPOLATION_OFFSET_BITS"
                 */
-
-               emit(BRW_OPCODE_SEL, offset(src, i), itemp, fs_reg(7))
-                   ->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */
+               set_condmod(BRW_CONDITIONAL_L,
+                           bld.SEL(offset(src, i), itemp, fs_reg(7)));
             }
 
             mlen = 2;
-            inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
-                        fs_reg(0u));
+            inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
+                            fs_reg(0u));
          }
          break;
       }
@@ -1594,7 +1561,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
          src.type = dest.type;
 
-         emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
+         bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
          dest = offset(dest, 1);
       }
       break;
@@ -1606,27 +1573,29 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    case nir_intrinsic_store_output: {
       fs_reg src = get_nir_src(instr->src[0]);
       unsigned index = 0;
-      for (int i = 0; i < instr->const_index[1]; i++) {
-         for (unsigned j = 0; j < instr->num_components; j++) {
-            fs_reg new_dest = offset(retype(nir_outputs, src.type),
-                                     instr->const_index[0] + index);
-            if (has_indirect)
-               src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[1]));
-            index++;
-            emit(MOV(new_dest, src));
-            src = offset(src, 1);
-         }
+      for (unsigned j = 0; j < instr->num_components; j++) {
+         fs_reg new_dest = offset(retype(nir_outputs, src.type),
+                                  instr->const_index[0] + index);
+         if (has_indirect)
+            src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[1]));
+         index++;
+         bld.MOV(new_dest, src);
+         src = offset(src, 1);
       }
       break;
    }
 
+   case nir_intrinsic_barrier:
+      emit_barrier();
+      break;
+
    default:
       unreachable("unknown intrinsic");
    }
 }
 
 void
-fs_visitor::nir_emit_texture(nir_tex_instr *instr)
+fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
 {
    uint32_t set = instr->sampler_set;
    uint32_t binding = instr->sampler_index;
@@ -1650,7 +1619,8 @@ fs_visitor::nir_emit_texture(nir_tex_instr *instr)
    bool is_cube_array = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
                         instr->is_array;
 
-   int lod_components = 0, offset_components = 0;
+   int lod_components = 0;
+   int UNUSED offset_components = 0;
 
    fs_reg coordinate, shadow_comparitor, lod, lod2, sample_index, mcs, tex_offset;
 
@@ -1719,8 +1689,8 @@ fs_visitor::nir_emit_texture(nir_tex_instr *instr)
 
          /* Emit code to evaluate the actual indexing expression */
          sampler_reg = vgrf(glsl_type::uint_type);
-         emit(ADD(sampler_reg, src, fs_reg(sampler)));
-         emit_uniformize(sampler_reg, sampler_reg);
+         bld.ADD(sampler_reg, src, fs_reg(sampler));
+         bld.emit_uniformize(sampler_reg, sampler_reg);
          break;
       }
 
@@ -1789,18 +1759,19 @@ fs_visitor::nir_emit_texture(nir_tex_instr *instr)
    fs_reg dest = get_nir_dest(instr->dest);
    dest.type = this->result.type;
    unsigned num_components = nir_tex_instr_dest_size(instr);
-   emit_percomp(MOV(dest, this->result), (1 << num_components) - 1);
+   emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, dest, this->result),
+                (1 << num_components) - 1);
 }
 
 void
-fs_visitor::nir_emit_jump(nir_jump_instr *instr)
+fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
 {
    switch (instr->type) {
    case nir_jump_break:
-      emit(BRW_OPCODE_BREAK);
+      bld.emit(BRW_OPCODE_BREAK);
       break;
    case nir_jump_continue:
-      emit(BRW_OPCODE_CONTINUE);
+      bld.emit(BRW_OPCODE_CONTINUE);
       break;
    case nir_jump_return:
    default:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp b/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
index cf3da7b1882..d92d4bbd81d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
@@ -85,9 +85,9 @@ fs_visitor::opt_peephole_predicated_break()
        * instruction to set the flag register.
        */
       if (devinfo->gen == 6 && if_inst->conditional_mod) {
-         fs_inst *cmp_inst = CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
-                                 if_inst->conditional_mod);
-         if_inst->insert_before(if_block, cmp_inst);
+         bld.at(if_block, if_inst)
+            .CMP(bld.null_reg_d(), if_inst->src[0], if_inst->src[1],
+                 if_inst->conditional_mod);
          jump_inst->predicate = BRW_PREDICATE_NORMAL;
       } else {
          jump_inst->predicate = if_inst->predicate;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 582d0993f1c..364fc4a5ad2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -30,6 +30,8 @@
 #include "glsl/glsl_types.h"
 #include "glsl/ir_optimization.h"
 
+using namespace brw;
+
 static void
 assign_reg(unsigned *reg_hw_locations, fs_reg *reg)
 {
@@ -468,14 +470,14 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
  * see if we can actually use MRFs to do spills without overwriting normal MRF
  * contents.
  */
-void
-fs_visitor::get_used_mrfs(bool *mrf_used)
+static void
+get_used_mrfs(fs_visitor *v, bool *mrf_used)
 {
-   int reg_width = dispatch_width / 8;
+   int reg_width = v->dispatch_width / 8;
 
    memset(mrf_used, 0, BRW_MAX_MRF * sizeof(bool));
 
-   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+   foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
       if (inst->dst.file == MRF) {
          int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
          mrf_used[reg] = true;
@@ -489,7 +491,7 @@ fs_visitor::get_used_mrfs(bool *mrf_used)
       }
 
       if (inst->mlen > 0) {
-	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
+	 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
             mrf_used[inst->base_mrf + i] = true;
          }
       }
@@ -500,12 +502,14 @@ fs_visitor::get_used_mrfs(bool *mrf_used)
  * Sets interference between virtual GRFs and usage of the high GRFs for SEND
  * messages (treated as MRFs in code generation).
  */
-void
-fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
+static void
+setup_mrf_hack_interference(fs_visitor *v, struct ra_graph *g,
+                            int first_mrf_node, int *first_used_mrf)
 {
    bool mrf_used[BRW_MAX_MRF];
-   get_used_mrfs(mrf_used);
+   get_used_mrfs(v, mrf_used);
 
+   *first_used_mrf = BRW_MAX_MRF;
    for (int i = 0; i < BRW_MAX_MRF; i++) {
       /* Mark each MRF reg node as being allocated to its physical register.
        *
@@ -518,7 +522,10 @@ fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
        * that are used as conflicting with all virtual GRFs.
        */
       if (mrf_used[i]) {
-         for (unsigned j = 0; j < this->alloc.count; j++) {
+         if (i < *first_used_mrf)
+            *first_used_mrf = i;
+
+         for (unsigned j = 0; j < v->alloc.count; j++) {
             ra_add_node_interference(g, first_mrf_node + i, j);
          }
       }
@@ -528,7 +535,6 @@ fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
 bool
 fs_visitor::assign_regs(bool allow_spilling)
 {
-   struct brw_compiler *compiler = brw->intelScreen->compiler;
    /* Most of this allocation was written for a reg_width of 1
     * (dispatch_width == 8).  In extending to SIMD16, the code was
     * left in place and it was converted to have the hardware
@@ -584,7 +590,9 @@ fs_visitor::assign_regs(bool allow_spilling)
 
    setup_payload_interference(g, payload_node_count, first_payload_node);
    if (devinfo->gen >= 7) {
-      setup_mrf_hack_interference(g, first_mrf_hack_node);
+      int first_used_mrf = BRW_MAX_MRF;
+      setup_mrf_hack_interference(this, g, first_mrf_hack_node,
+                                  &first_used_mrf);
 
       foreach_block_and_inst(block, fs_inst, inst, cfg) {
          /* When we do send-from-GRF for FB writes, we need to ensure that
@@ -600,6 +608,13 @@ fs_visitor::assign_regs(bool allow_spilling)
          if (inst->eot) {
             int size = alloc.sizes[inst->src[0].reg];
             int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1;
+
+            /* If something happened to spill, we want to push the EOT send
+             * register early enough in the register file that we don't
+             * conflict with any used MRF hack registers.
+             */
+            reg -= BRW_MAX_MRF - first_used_mrf;
+
             ra_set_node_reg(g, inst->src[0].reg, reg);
             break;
          }
@@ -696,25 +711,24 @@ fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
       dst.width = 16;
    }
 
+   const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
+                              .group(reg_size * 8, 0)
+                              .at(block, inst);
+
    for (int i = 0; i < count / reg_size; i++) {
       /* The gen7 descriptor-based offset is 12 bits of HWORD units. */
       bool gen7_read = devinfo->gen >= 7 && spill_offset < (1 << 12) * REG_SIZE;
-
-      fs_inst *unspill_inst =
-         new(mem_ctx) fs_inst(gen7_read ?
-                              SHADER_OPCODE_GEN7_SCRATCH_READ :
-                              SHADER_OPCODE_GEN4_SCRATCH_READ,
-                              dst);
+      fs_inst *unspill_inst = ibld.emit(gen7_read ?
+                                        SHADER_OPCODE_GEN7_SCRATCH_READ :
+                                        SHADER_OPCODE_GEN4_SCRATCH_READ,
+                                        dst);
       unspill_inst->offset = spill_offset;
-      unspill_inst->ir = inst->ir;
-      unspill_inst->annotation = inst->annotation;
       unspill_inst->regs_written = reg_size;
 
       if (!gen7_read) {
          unspill_inst->base_mrf = 14;
          unspill_inst->mlen = 1; /* header contains offset */
       }
-      inst->insert_before(block, unspill_inst);
 
       dst.reg_offset += reg_size;
       spill_offset += reg_size * REG_SIZE;
@@ -732,17 +746,17 @@ fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src,
       reg_size = 2;
    }
 
+   const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
+                              .group(reg_size * 8, 0)
+                              .at(block, inst->next);
+
    for (int i = 0; i < count / reg_size; i++) {
       fs_inst *spill_inst =
-         new(mem_ctx) fs_inst(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
-                              reg_size * 8, reg_null_f, src);
+         ibld.emit(SHADER_OPCODE_GEN4_SCRATCH_WRITE, bld.null_reg_f(), src);
       src.reg_offset += reg_size;
       spill_inst->offset = spill_offset + i * reg_size * REG_SIZE;
-      spill_inst->ir = inst->ir;
-      spill_inst->annotation = inst->annotation;
       spill_inst->mlen = 1 + reg_size; /* header, value */
       spill_inst->base_mrf = spill_base_mrf;
-      inst->insert_after(block, spill_inst);
    }
 }
 
@@ -839,7 +853,7 @@ fs_visitor::spill_reg(int spill_reg)
     */
    if (!spilled_any_registers) {
       bool mrf_used[BRW_MAX_MRF];
-      get_used_mrfs(mrf_used);
+      get_used_mrfs(this, mrf_used);
 
       for (int i = spill_base_mrf; i < BRW_MAX_MRF; i++) {
          if (mrf_used[i]) {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
index 52aa5590c2e..8660ec08b8f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
@@ -37,6 +37,8 @@
  */
 #define MAX_MOVS 8 /**< The maximum number of MOVs to attempt to match. */
 
+using namespace brw;
+
 /**
  * Scans forwards from an IF counting consecutive MOV instructions in the
  * "then" and "else" blocks of the if statement.
@@ -153,9 +155,6 @@ fs_visitor::opt_peephole_sel()
       if (movs == 0)
          continue;
 
-      fs_inst *sel_inst[MAX_MOVS] = { NULL };
-      fs_inst *mov_imm_inst[MAX_MOVS] = { NULL };
-
       enum brw_predicate predicate;
       bool predicate_inverse;
       if (devinfo->gen == 6 && if_inst->conditional_mod) {
@@ -188,9 +187,21 @@ fs_visitor::opt_peephole_sel()
             movs = i;
             break;
          }
+      }
+
+      if (movs == 0)
+         continue;
+
+      const fs_builder ibld = bld.at(block, if_inst);
 
+      /* Emit a CMP if our IF used the embedded comparison */
+      if (devinfo->gen == 6 && if_inst->conditional_mod)
+         ibld.CMP(ibld.null_reg_d(), if_inst->src[0], if_inst->src[1],
+                  if_inst->conditional_mod);
+
+      for (int i = 0; i < movs; i++) {
          if (then_mov[i]->src[0].equals(else_mov[i]->src[0])) {
-            sel_inst[i] = MOV(then_mov[i]->dst, then_mov[i]->src[0]);
+            ibld.MOV(then_mov[i]->dst, then_mov[i]->src[0]);
          } else {
             /* Only the last source register can be a constant, so if the MOV
              * in the "then" clause uses a constant, we need to put it in a
@@ -200,29 +211,13 @@ fs_visitor::opt_peephole_sel()
             if (src0.file == IMM) {
                src0 = vgrf(glsl_type::float_type);
                src0.type = then_mov[i]->src[0].type;
-               mov_imm_inst[i] = MOV(src0, then_mov[i]->src[0]);
+               ibld.MOV(src0, then_mov[i]->src[0]);
             }
 
-            sel_inst[i] = SEL(then_mov[i]->dst, src0, else_mov[i]->src[0]);
-            sel_inst[i]->predicate = predicate;
-            sel_inst[i]->predicate_inverse = predicate_inverse;
+            set_predicate_inv(predicate, predicate_inverse,
+                              ibld.SEL(then_mov[i]->dst, src0,
+                                       else_mov[i]->src[0]));
          }
-      }
-
-      if (movs == 0)
-         continue;
-
-      /* Emit a CMP if our IF used the embedded comparison */
-      if (devinfo->gen == 6 && if_inst->conditional_mod) {
-         fs_inst *cmp_inst = CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
-                                 if_inst->conditional_mod);
-         if_inst->insert_before(block, cmp_inst);
-      }
-
-      for (int i = 0; i < movs; i++) {
-         if (mov_imm_inst[i])
-            if_inst->insert_before(block, mov_imm_inst[i]);
-         if_inst->insert_before(block, sel_inst[i]);
 
          then_mov[i]->remove(then_block);
          else_mov[i]->remove(else_block);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index e1f47d4ec44..9a4bad6bcf5 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -47,6 +47,7 @@
 #include "glsl/ir_optimization.h"
 #include "program/sampler.h"
 
+using namespace brw;
 
 fs_reg *
 fs_visitor::emit_vs_system_value(int location)
@@ -76,1371 +77,6 @@ fs_visitor::emit_vs_system_value(int location)
    return reg;
 }
 
-void
-fs_visitor::visit(ir_variable *ir)
-{
-   fs_reg *reg = NULL;
-
-   if (variable_storage(ir))
-      return;
-
-   if (ir->data.mode == ir_var_shader_in) {
-      assert(ir->data.location != -1);
-      if (stage == MESA_SHADER_VERTEX) {
-         reg = new(this->mem_ctx)
-            fs_reg(ATTR, ir->data.location,
-                   brw_type_for_base_type(ir->type->get_scalar_type()));
-      } else if (ir->data.location == VARYING_SLOT_POS) {
-         reg = emit_fragcoord_interpolation(ir->data.pixel_center_integer,
-                                            ir->data.origin_upper_left);
-      } else if (ir->data.location == VARYING_SLOT_FACE) {
-	 reg = emit_frontfacing_interpolation();
-      } else {
-         reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
-         emit_general_interpolation(*reg, ir->name, ir->type,
-                                    (glsl_interp_qualifier) ir->data.interpolation,
-                                    ir->data.location, ir->data.centroid,
-                                    ir->data.sample);
-      }
-      assert(reg);
-      hash_table_insert(this->variable_ht, reg, ir);
-      return;
-   } else if (ir->data.mode == ir_var_shader_out) {
-      reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
-
-      if (stage == MESA_SHADER_VERTEX) {
-	 int vector_elements =
-	    ir->type->is_array() ? ir->type->fields.array->vector_elements
-				 : ir->type->vector_elements;
-
-	 for (int i = 0; i < (type_size(ir->type) + 3) / 4; i++) {
-	    int output = ir->data.location + i;
-	    this->outputs[output] = *reg;
-	    this->outputs[output].reg_offset = i * 4;
-	    this->output_components[output] = vector_elements;
-	 }
-
-      } else if (ir->data.index > 0) {
-	 assert(ir->data.location == FRAG_RESULT_DATA0);
-	 assert(ir->data.index == 1);
-	 this->dual_src_output = *reg;
-         this->do_dual_src = true;
-      } else if (ir->data.location == FRAG_RESULT_COLOR) {
-	 /* Writing gl_FragColor outputs to all color regions. */
-         assert(stage == MESA_SHADER_FRAGMENT);
-         brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-	 for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
-	    this->outputs[i] = *reg;
-	    this->output_components[i] = 4;
-	 }
-      } else if (ir->data.location == FRAG_RESULT_DEPTH) {
-	 this->frag_depth = *reg;
-      } else if (ir->data.location == FRAG_RESULT_SAMPLE_MASK) {
-         this->sample_mask = *reg;
-      } else {
-	 /* gl_FragData or a user-defined FS output */
-	 assert(ir->data.location >= FRAG_RESULT_DATA0 &&
-		ir->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
-
-	 int vector_elements =
-	    ir->type->is_array() ? ir->type->fields.array->vector_elements
-				 : ir->type->vector_elements;
-
-	 /* General color output. */
-	 for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
-	    int output = ir->data.location - FRAG_RESULT_DATA0 + i;
-	    this->outputs[output] = offset(*reg, vector_elements * i);
-	    this->output_components[output] = vector_elements;
-	 }
-      }
-   } else if (ir->data.mode == ir_var_uniform) {
-      int param_index = uniforms;
-
-      /* Thanks to the lower_ubo_reference pass, we will see only
-       * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
-       * variables, so no need for them to be in variable_ht.
-       *
-       * Some uniforms, such as samplers and atomic counters, have no actual
-       * storage, so we should ignore them.
-       */
-      if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
-         return;
-
-      if (dispatch_width == 16) {
-	 if (!variable_storage(ir)) {
-	    fail("Failed to find uniform '%s' in SIMD16\n", ir->name);
-	 }
-	 return;
-      }
-
-      param_size[param_index] = type_size(ir->type);
-      if (!strncmp(ir->name, "gl_", 3)) {
-	 setup_builtin_uniform_values(ir);
-      } else {
-	 setup_uniform_values(ir);
-      }
-
-      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
-      reg->type = brw_type_for_base_type(ir->type);
-
-   } else if (ir->data.mode == ir_var_system_value) {
-      switch (ir->data.location) {
-      case SYSTEM_VALUE_BASE_VERTEX:
-      case SYSTEM_VALUE_VERTEX_ID:
-      case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
-      case SYSTEM_VALUE_INSTANCE_ID:
-         reg = emit_vs_system_value(ir->data.location);
-         break;
-      case SYSTEM_VALUE_SAMPLE_POS:
-	 reg = emit_samplepos_setup();
-         break;
-      case SYSTEM_VALUE_SAMPLE_ID:
-	 reg = emit_sampleid_setup();
-         break;
-      case SYSTEM_VALUE_SAMPLE_MASK_IN:
-         assert(devinfo->gen >= 7);
-         reg = new(mem_ctx)
-            fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
-                          BRW_REGISTER_TYPE_D));
-         break;
-      }
-   }
-
-   if (!reg)
-      reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
-
-   hash_table_insert(this->variable_ht, reg, ir);
-}
-
-void
-fs_visitor::visit(ir_dereference_variable *ir)
-{
-   fs_reg *reg = variable_storage(ir->var);
-
-   if (!reg) {
-      fail("Failed to find variable storage for %s\n", ir->var->name);
-      this->result = fs_reg(reg_null_d);
-      return;
-   }
-   this->result = *reg;
-}
-
-void
-fs_visitor::visit(ir_dereference_record *ir)
-{
-   const glsl_type *struct_type = ir->record->type;
-
-   ir->record->accept(this);
-
-   unsigned int off = 0;
-   for (unsigned int i = 0; i < struct_type->length; i++) {
-      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
-	 break;
-      off += type_size(struct_type->fields.structure[i].type);
-   }
-   this->result = offset(this->result, off);
-   this->result.type = brw_type_for_base_type(ir->type);
-}
-
-void
-fs_visitor::visit(ir_dereference_array *ir)
-{
-   ir_constant *constant_index;
-   fs_reg src;
-   int element_size = type_size(ir->type);
-
-   constant_index = ir->array_index->as_constant();
-
-   ir->array->accept(this);
-   src = this->result;
-   src.type = brw_type_for_base_type(ir->type);
-
-   if (constant_index) {
-      if (src.file == ATTR) {
-         /* Attribute arrays get loaded as one vec4 per element.  In that case
-          * offset the source register.
-          */
-         src.reg += constant_index->value.i[0];
-      } else {
-         assert(src.file == UNIFORM || src.file == GRF || src.file == HW_REG);
-         src = offset(src, constant_index->value.i[0] * element_size);
-      }
-   } else {
-      /* Variable index array dereference.  We attach the variable index
-       * component to the reg as a pointer to a register containing the
-       * offset.  Currently only uniform arrays are supported in this patch,
-       * and that reladdr pointer is resolved by
-       * move_uniform_array_access_to_pull_constants().  All other array types
-       * are lowered by lower_variable_index_to_cond_assign().
-       */
-      ir->array_index->accept(this);
-
-      fs_reg index_reg;
-      index_reg = vgrf(glsl_type::int_type);
-      emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));
-
-      if (src.reladdr) {
-         emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
-      }
-
-      src.reladdr = ralloc(mem_ctx, fs_reg);
-      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
-   }
-   this->result = src;
-}
-
-fs_inst *
-fs_visitor::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
-                     const fs_reg &a)
-{
-   if (devinfo->gen < 6) {
-      /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
-      fs_reg y_times_a           = vgrf(glsl_type::float_type);
-      fs_reg one_minus_a         = vgrf(glsl_type::float_type);
-      fs_reg x_times_one_minus_a = vgrf(glsl_type::float_type);
-
-      emit(MUL(y_times_a, y, a));
-
-      fs_reg negative_a = a;
-      negative_a.negate = !a.negate;
-      emit(ADD(one_minus_a, negative_a, fs_reg(1.0f)));
-      emit(MUL(x_times_one_minus_a, x, one_minus_a));
-
-      return emit(ADD(dst, x_times_one_minus_a, y_times_a));
-   } else {
-      /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
-       * we need to reorder the operands.
-       */
-      return emit(LRP(dst, a, y, x));
-   }
-}
-
-void
-fs_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
-                        const fs_reg &src0, const fs_reg &src1)
-{
-   assert(conditionalmod == BRW_CONDITIONAL_GE ||
-          conditionalmod == BRW_CONDITIONAL_L);
-
-   fs_inst *inst;
-
-   if (devinfo->gen >= 6) {
-      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
-      inst->conditional_mod = conditionalmod;
-   } else {
-      emit(CMP(reg_null_d, src0, src1, conditionalmod));
-
-      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
-      inst->predicate = BRW_PREDICATE_NORMAL;
-   }
-}
-
-void
-fs_visitor::emit_uniformize(const fs_reg &dst, const fs_reg &src)
-{
-   const fs_reg chan_index = vgrf(glsl_type::uint_type);
-
-   emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, component(chan_index, 0))
-      ->force_writemask_all = true;
-   emit(SHADER_OPCODE_BROADCAST, component(dst, 0),
-        src, component(chan_index, 0))
-      ->force_writemask_all = true;
-}
-
-bool
-fs_visitor::try_emit_saturate(ir_expression *ir)
-{
-   if (ir->operation != ir_unop_saturate)
-      return false;
-
-   ir_rvalue *sat_val = ir->operands[0];
-
-   fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
-
-   sat_val->accept(this);
-   fs_reg src = this->result;
-
-   fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
-
-   /* If the last instruction from our accept() generated our
-    * src, just set the saturate flag instead of emmitting a separate mov.
-    */
-   fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
-   if (modify && modify->regs_written == modify->dst.width / 8 &&
-       modify->can_do_saturate()) {
-      modify->saturate = true;
-      this->result = src;
-      return true;
-   }
-
-   return false;
-}
-
-bool
-fs_visitor::try_emit_line(ir_expression *ir)
-{
-   /* LINE's src0 must be of type float. */
-   if (ir->type != glsl_type::float_type)
-      return false;
-
-   ir_rvalue *nonmul = ir->operands[1];
-   ir_expression *mul = ir->operands[0]->as_expression();
-
-   if (!mul || mul->operation != ir_binop_mul) {
-      nonmul = ir->operands[0];
-      mul = ir->operands[1]->as_expression();
-
-      if (!mul || mul->operation != ir_binop_mul)
-         return false;
-   }
-
-   ir_constant *const_add = nonmul->as_constant();
-   if (!const_add)
-      return false;
-
-   int add_operand_vf = brw_float_to_vf(const_add->value.f[0]);
-   if (add_operand_vf == -1)
-      return false;
-
-   ir_rvalue *non_const_mul = mul->operands[1];
-   ir_constant *const_mul = mul->operands[0]->as_constant();
-   if (!const_mul) {
-      const_mul = mul->operands[1]->as_constant();
-
-      if (!const_mul)
-         return false;
-
-      non_const_mul = mul->operands[0];
-   }
-
-   int mul_operand_vf = brw_float_to_vf(const_mul->value.f[0]);
-   if (mul_operand_vf == -1)
-      return false;
-
-   non_const_mul->accept(this);
-   fs_reg src1 = this->result;
-
-   fs_reg src0 = vgrf(ir->type);
-   emit(BRW_OPCODE_MOV, src0,
-        fs_reg((uint8_t)mul_operand_vf, 0, 0, (uint8_t)add_operand_vf));
-
-   this->result = vgrf(ir->type);
-   emit(BRW_OPCODE_LINE, this->result, src0, src1);
-   return true;
-}
-
-bool
-fs_visitor::try_emit_mad(ir_expression *ir)
-{
-   /* 3-src instructions were introduced in gen6. */
-   if (devinfo->gen < 6)
-      return false;
-
-   /* MAD can only handle floating-point data. */
-   if (ir->type != glsl_type::float_type)
-      return false;
-
-   ir_rvalue *nonmul;
-   ir_expression *mul;
-   bool mul_negate, mul_abs;
-
-   for (int i = 0; i < 2; i++) {
-      mul_negate = false;
-      mul_abs = false;
-
-      mul = ir->operands[i]->as_expression();
-      nonmul = ir->operands[1 - i];
-
-      if (mul && mul->operation == ir_unop_abs) {
-         mul = mul->operands[0]->as_expression();
-         mul_abs = true;
-      } else if (mul && mul->operation == ir_unop_neg) {
-         mul = mul->operands[0]->as_expression();
-         mul_negate = true;
-      }
-
-      if (mul && mul->operation == ir_binop_mul)
-         break;
-   }
-
-   if (!mul || mul->operation != ir_binop_mul)
-      return false;
-
-   nonmul->accept(this);
-   fs_reg src0 = this->result;
-
-   mul->operands[0]->accept(this);
-   fs_reg src1 = this->result;
-   src1.negate ^= mul_negate;
-   src1.abs = mul_abs;
-   if (mul_abs)
-      src1.negate = false;
-
-   mul->operands[1]->accept(this);
-   fs_reg src2 = this->result;
-   src2.abs = mul_abs;
-   if (mul_abs)
-      src2.negate = false;
-
-   this->result = vgrf(ir->type);
-   emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
-
-   return true;
-}
-
-bool
-fs_visitor::try_emit_b2f_of_comparison(ir_expression *ir)
-{
-   /* On platforms that do not natively generate 0u and ~0u for Boolean
-    * results, b2f expressions that look like
-    *
-    *     f = b2f(expr cmp 0)
-    *
-    * will generate better code by pretending the expression is
-    *
-    *     f = ir_triop_csel(0.0, 1.0, expr cmp 0)
-    *
-    * This is because the last instruction of "expr" can generate the
-    * condition code for the "cmp 0".  This avoids having to do the "-(b & 1)"
-    * trick to generate 0u or ~0u for the Boolean result.  This means code like
-    *
-    *     mov(16)         g16<1>F         1F
-    *     mul.ge.f0(16)   null            g6<8,8,1>F      g14<8,8,1>F
-    *     (+f0) sel(16)   m6<1>F          g16<8,8,1>F     0F
-    *
-    * will be generated instead of
-    *
-    *     mul(16)         g2<1>F          g12<8,8,1>F     g4<8,8,1>F
-    *     cmp.ge.f0(16)   g2<1>D          g4<8,8,1>F      0F
-    *     and(16)         g4<1>D          g2<8,8,1>D      1D
-    *     and(16)         m6<1>D          -g4<8,8,1>D     0x3f800000UD
-    *
-    * When the comparison is != 0.0 using the knowledge that the false case
-    * already results in zero would allow better code generation by possibly
-    * avoiding a load-immediate instruction.
-    */
-   ir_expression *cmp = ir->operands[0]->as_expression();
-   if (cmp == NULL)
-      return false;
-
-   if (cmp->operation == ir_binop_nequal) {
-      for (unsigned i = 0; i < 2; i++) {
-         ir_constant *c = cmp->operands[i]->as_constant();
-         if (c == NULL || !c->is_zero())
-            continue;
-
-         ir_expression *expr = cmp->operands[i ^ 1]->as_expression();
-         if (expr != NULL) {
-            fs_reg op[2];
-
-            for (unsigned j = 0; j < 2; j++) {
-               cmp->operands[j]->accept(this);
-               op[j] = this->result;
-
-               resolve_ud_negate(&op[j]);
-            }
-
-            emit_bool_to_cond_code_of_reg(cmp, op);
-
-            /* In this case we know when the condition is true, op[i ^ 1]
-             * contains zero.  Invert the predicate, use op[i ^ 1] as src0,
-             * and immediate 1.0f as src1.
-             */
-            this->result = vgrf(ir->type);
-            op[i ^ 1].type = BRW_REGISTER_TYPE_F;
-
-            fs_inst *inst = emit(SEL(this->result, op[i ^ 1], fs_reg(1.0f)));
-            inst->predicate = BRW_PREDICATE_NORMAL;
-            inst->predicate_inverse = true;
-            return true;
-         }
-      }
-   }
-
-   emit_bool_to_cond_code(cmp);
-
-   fs_reg temp = vgrf(ir->type);
-   emit(MOV(temp, fs_reg(1.0f)));
-
-   this->result = vgrf(ir->type);
-   fs_inst *inst = emit(SEL(this->result, temp, fs_reg(0.0f)));
-   inst->predicate = BRW_PREDICATE_NORMAL;
-
-   return true;
-}
-
-static int
-pack_pixel_offset(float x)
-{
-   /* Clamp upper end of the range to +7/16. See explanation in non-constant
-    * offset case below. */
-   int n = MIN2((int)(x * 16), 7);
-   return n & 0xf;
-}
-
-void
-fs_visitor::emit_interpolate_expression(ir_expression *ir)
-{
-   /* in SIMD16 mode, the pixel interpolator returns coords interleaved
-    * 8 channels at a time, same as the barycentric coords presented in
-    * the FS payload. this requires a bit of extra work to support.
-    */
-   no16("interpolate_at_* not yet supported in SIMD16 mode.");
-
-   assert(stage == MESA_SHADER_FRAGMENT);
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-
-   ir_dereference * deref = ir->operands[0]->as_dereference();
-   ir_swizzle * swiz = NULL;
-   if (!deref) {
-      /* the api does not allow a swizzle here, but the varying packing code
-       * may have pushed one into here.
-       */
-      swiz = ir->operands[0]->as_swizzle();
-      assert(swiz);
-      deref = swiz->val->as_dereference();
-   }
-   assert(deref);
-   ir_variable * var = deref->variable_referenced();
-   assert(var);
-
-   /* 1. collect interpolation factors */
-
-   fs_reg dst_xy = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1));
-
-   /* for most messages, we need one reg of ignored data; the hardware requires mlen==1
-    * even when there is no payload. in the per-slot offset case, we'll replace this with
-    * the proper source data. */
-   fs_reg src = vgrf(glsl_type::float_type);
-   int mlen = 1;     /* one reg unless overriden */
-   int reg_width = dispatch_width / 8;
-   fs_inst *inst;
-
-   switch (ir->operation) {
-   case ir_unop_interpolate_at_centroid:
-      inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_xy, src, fs_reg(0u));
-      break;
-
-   case ir_binop_interpolate_at_sample: {
-      ir_constant *sample_num = ir->operands[1]->as_constant();
-      assert(sample_num || !"nonconstant sample number should have been lowered.");
-
-      unsigned msg_data = sample_num->value.i[0] << 4;
-      inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src, fs_reg(msg_data));
-      break;
-   }
-
-   case ir_binop_interpolate_at_offset: {
-      ir_constant *const_offset = ir->operands[1]->as_constant();
-      if (const_offset) {
-         unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) |
-                            (pack_pixel_offset(const_offset->value.f[1]) << 4);
-         inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
-                     fs_reg(msg_data));
-      } else {
-         /* pack the operands: hw wants offsets as 4 bit signed ints */
-         ir->operands[1]->accept(this);
-         src = vgrf(glsl_type::ivec2_type);
-         fs_reg src2 = src;
-         for (int i = 0; i < 2; i++) {
-            fs_reg temp = vgrf(glsl_type::float_type);
-            emit(MUL(temp, this->result, fs_reg(16.0f)));
-            emit(MOV(src2, temp));  /* float to int */
-
-            /* Clamp the upper end of the range to +7/16. ARB_gpu_shader5 requires
-             * that we support a maximum offset of +0.5, which isn't representable
-             * in a S0.4 value -- if we didn't clamp it, we'd end up with -8/16,
-             * which is the opposite of what the shader author wanted.
-             *
-             * This is legal due to ARB_gpu_shader5's quantization rules:
-             *
-             * "Not all values of <offset> may be supported; x and y offsets may
-             * be rounded to fixed-point values with the number of fraction bits
-             * given by the implementation-dependent constant
-             * FRAGMENT_INTERPOLATION_OFFSET_BITS"
-             */
-
-            fs_inst *inst = emit(BRW_OPCODE_SEL, src2, src2, fs_reg(7));
-            inst->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */
-
-            src2 = offset(src2, 1);
-            this->result = offset(this->result, 1);
-         }
-
-         mlen = 2 * reg_width;
-         inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
-                     fs_reg(0u));
-      }
-      break;
-   }
-
-   default:
-      unreachable("not reached");
-   }
-
-   inst->mlen = mlen;
-   inst->regs_written = 2 * reg_width; /* 2 floats per slot returned */
-   inst->pi_noperspective = var->determine_interpolation_mode(key->flat_shade) ==
-         INTERP_QUALIFIER_NOPERSPECTIVE;
-
-   /* 2. emit linterp */
-
-   fs_reg res = vgrf(ir->type);
-   this->result = res;
-
-   for (int i = 0; i < ir->type->vector_elements; i++) {
-      int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i;
-      emit(FS_OPCODE_LINTERP, res, dst_xy,
-           fs_reg(interp_reg(var->data.location, ch)));
-      res = offset(res, 1);
-   }
-}
-
-void
-fs_visitor::visit(ir_expression *ir)
-{
-   unsigned int operand;
-   fs_reg op[3], temp;
-   fs_inst *inst;
-   struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
-
-   assert(ir->get_num_operands() <= 3);
-
-   if (try_emit_saturate(ir))
-      return;
-
-   /* Deal with the real oddball stuff first */
-   switch (ir->operation) {
-   case ir_binop_add:
-      if (devinfo->gen <= 5 && try_emit_line(ir))
-         return;
-      if (try_emit_mad(ir))
-         return;
-      break;
-
-   case ir_triop_csel:
-      ir->operands[1]->accept(this);
-      op[1] = this->result;
-      ir->operands[2]->accept(this);
-      op[2] = this->result;
-
-      emit_bool_to_cond_code(ir->operands[0]);
-
-      this->result = vgrf(ir->type);
-      inst = emit(SEL(this->result, op[1], op[2]));
-      inst->predicate = BRW_PREDICATE_NORMAL;
-      return;
-
-   case ir_unop_b2f:
-      if (devinfo->gen <= 5 && try_emit_b2f_of_comparison(ir))
-         return;
-      break;
-
-   case ir_unop_interpolate_at_centroid:
-   case ir_binop_interpolate_at_offset:
-   case ir_binop_interpolate_at_sample:
-      emit_interpolate_expression(ir);
-      return;
-
-   default:
-      break;
-   }
-
-   for (operand = 0; operand < ir->get_num_operands(); operand++) {
-      ir->operands[operand]->accept(this);
-      if (this->result.file == BAD_FILE) {
-	 fail("Failed to get tree for expression operand:\n");
-	 ir->operands[operand]->fprint(stderr);
-         fprintf(stderr, "\n");
-      }
-      assert(this->result.file == GRF ||
-             this->result.file == UNIFORM || this->result.file == ATTR);
-      op[operand] = this->result;
-
-      /* Matrix expression operands should have been broken down to vector
-       * operations already.
-       */
-      assert(!ir->operands[operand]->type->is_matrix());
-      /* And then those vector operands should have been broken down to scalar.
-       */
-      assert(!ir->operands[operand]->type->is_vector());
-   }
-
-   /* Storage for our result.  If our result goes into an assignment, it will
-    * just get copy-propagated out, so no worries.
-    */
-   this->result = vgrf(ir->type);
-
-   switch (ir->operation) {
-   case ir_unop_logic_not:
-      emit(NOT(this->result, op[0]));
-      break;
-   case ir_unop_neg:
-      op[0].negate = !op[0].negate;
-      emit(MOV(this->result, op[0]));
-      break;
-   case ir_unop_abs:
-      op[0].abs = true;
-      op[0].negate = false;
-      emit(MOV(this->result, op[0]));
-      break;
-   case ir_unop_sign:
-      if (ir->type->is_float()) {
-         /* AND(val, 0x80000000) gives the sign bit.
-          *
-          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
-          * zero.
-          */
-         emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
-
-         op[0].type = BRW_REGISTER_TYPE_UD;
-         this->result.type = BRW_REGISTER_TYPE_UD;
-         emit(AND(this->result, op[0], fs_reg(0x80000000u)));
-
-         inst = emit(OR(this->result, this->result, fs_reg(0x3f800000u)));
-         inst->predicate = BRW_PREDICATE_NORMAL;
-
-         this->result.type = BRW_REGISTER_TYPE_F;
-      } else {
-         /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
-          *               -> non-negative val generates 0x00000000.
-          *  Predicated OR sets 1 if val is positive.
-          */
-         emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));
-
-         emit(ASR(this->result, op[0], fs_reg(31)));
-
-         inst = emit(OR(this->result, this->result, fs_reg(1)));
-         inst->predicate = BRW_PREDICATE_NORMAL;
-      }
-      break;
-   case ir_unop_rcp:
-      emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
-      break;
-
-   case ir_unop_exp2:
-      emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
-      break;
-   case ir_unop_log2:
-      emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
-      break;
-   case ir_unop_exp:
-   case ir_unop_log:
-      unreachable("not reached: should be handled by ir_explog_to_explog2");
-   case ir_unop_sin:
-      emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
-      break;
-   case ir_unop_cos:
-      emit_math(SHADER_OPCODE_COS, this->result, op[0]);
-      break;
-
-   case ir_unop_dFdx:
-      /* Select one of the two opcodes based on the glHint value. */
-      if (fs_key->high_quality_derivatives)
-         emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
-      else
-         emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
-      break;
-
-   case ir_unop_dFdx_coarse:
-      emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
-      break;
-
-   case ir_unop_dFdx_fine:
-      emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
-      break;
-
-   case ir_unop_dFdy:
-      /* Select one of the two opcodes based on the glHint value. */
-      if (fs_key->high_quality_derivatives)
-         emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
-      else
-         emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
-      break;
-
-   case ir_unop_dFdy_coarse:
-      emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
-      break;
-
-   case ir_unop_dFdy_fine:
-      emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
-      break;
-
-   case ir_binop_add:
-      emit(ADD(this->result, op[0], op[1]));
-      break;
-   case ir_binop_sub:
-      unreachable("not reached: should be handled by ir_sub_to_add_neg");
-
-   case ir_binop_mul:
-      if (devinfo->gen < 8 && ir->type->is_integer()) {
-	 /* For integer multiplication, the MUL uses the low 16 bits
-	  * of one of the operands (src0 on gen6, src1 on gen7).  The
-	  * MACH accumulates in the contribution of the upper 16 bits
-	  * of that operand.
-          */
-         if (ir->operands[0]->is_uint16_constant()) {
-            if (devinfo->gen < 7)
-               emit(MUL(this->result, op[0], op[1]));
-            else
-               emit(MUL(this->result, op[1], op[0]));
-         } else if (ir->operands[1]->is_uint16_constant()) {
-            if (devinfo->gen < 7)
-               emit(MUL(this->result, op[1], op[0]));
-            else
-               emit(MUL(this->result, op[0], op[1]));
-         } else {
-            if (devinfo->gen >= 7)
-               no16("SIMD16 explicit accumulator operands unsupported\n");
-
-            struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
-                                        this->result.type);
-
-            emit(MUL(acc, op[0], op[1]));
-            emit(MACH(reg_null_d, op[0], op[1]));
-            emit(MOV(this->result, fs_reg(acc)));
-         }
-      } else {
-	 emit(MUL(this->result, op[0], op[1]));
-      }
-      break;
-   case ir_binop_imul_high: {
-      if (devinfo->gen >= 7)
-         no16("SIMD16 explicit accumulator operands unsupported\n");
-
-      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
-                                  this->result.type);
-
-      fs_inst *mul = emit(MUL(acc, op[0], op[1]));
-      emit(MACH(this->result, op[0], op[1]));
-
-      /* Until Gen8, integer multiplies read 32-bits from one source, and
-       * 16-bits from the other, and relying on the MACH instruction to
-       * generate the high bits of the result.
-       *
-       * On Gen8, the multiply instruction does a full 32x32-bit multiply,
-       * but in order to do a 64x64-bit multiply we have to simulate the
-       * previous behavior and then use a MACH instruction.
-       *
-       * FINISHME: Don't use source modifiers on src1.
-       */
-      if (devinfo->gen >= 8) {
-         assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
-                mul->src[1].type == BRW_REGISTER_TYPE_UD);
-         if (mul->src[1].type == BRW_REGISTER_TYPE_D) {
-            mul->src[1].type = BRW_REGISTER_TYPE_W;
-            mul->src[1].stride = 2;
-         } else {
-            mul->src[1].type = BRW_REGISTER_TYPE_UW;
-            mul->src[1].stride = 2;
-         }
-      }
-
-      break;
-   }
-   case ir_binop_div:
-      /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
-      assert(ir->type->is_integer());
-      emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
-      break;
-   case ir_binop_carry: {
-      if (devinfo->gen >= 7)
-         no16("SIMD16 explicit accumulator operands unsupported\n");
-
-      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
-                                  BRW_REGISTER_TYPE_UD);
-
-      emit(ADDC(reg_null_ud, op[0], op[1]));
-      emit(MOV(this->result, fs_reg(acc)));
-      break;
-   }
-   case ir_binop_borrow: {
-      if (devinfo->gen >= 7)
-         no16("SIMD16 explicit accumulator operands unsupported\n");
-
-      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
-                                  BRW_REGISTER_TYPE_UD);
-
-      emit(SUBB(reg_null_ud, op[0], op[1]));
-      emit(MOV(this->result, fs_reg(acc)));
-      break;
-   }
-   case ir_binop_mod:
-      /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
-      assert(ir->type->is_integer());
-      emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
-      break;
-
-   case ir_binop_less:
-   case ir_binop_greater:
-   case ir_binop_lequal:
-   case ir_binop_gequal:
-   case ir_binop_equal:
-   case ir_binop_all_equal:
-   case ir_binop_nequal:
-   case ir_binop_any_nequal:
-      if (devinfo->gen <= 5) {
-         resolve_bool_comparison(ir->operands[0], &op[0]);
-         resolve_bool_comparison(ir->operands[1], &op[1]);
-      }
-
-      emit(CMP(this->result, op[0], op[1],
-               brw_conditional_for_comparison(ir->operation)));
-      break;
-
-   case ir_binop_logic_xor:
-      emit(XOR(this->result, op[0], op[1]));
-      break;
-
-   case ir_binop_logic_or:
-      emit(OR(this->result, op[0], op[1]));
-      break;
-
-   case ir_binop_logic_and:
-      emit(AND(this->result, op[0], op[1]));
-      break;
-
-   case ir_binop_dot:
-   case ir_unop_any:
-      unreachable("not reached: should be handled by brw_fs_channel_expressions");
-
-   case ir_unop_noise:
-      unreachable("not reached: should be handled by lower_noise");
-
-   case ir_quadop_vector:
-      unreachable("not reached: should be handled by lower_quadop_vector");
-
-   case ir_binop_vector_extract:
-      unreachable("not reached: should be handled by lower_vec_index_to_cond_assign()");
-
-   case ir_triop_vector_insert:
-      unreachable("not reached: should be handled by lower_vector_insert()");
-
-   case ir_binop_ldexp:
-      unreachable("not reached: should be handled by ldexp_to_arith()");
-
-   case ir_unop_sqrt:
-      emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
-      break;
-
-   case ir_unop_rsq:
-      emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
-      break;
-
-   case ir_unop_bitcast_i2f:
-   case ir_unop_bitcast_u2f:
-      op[0].type = BRW_REGISTER_TYPE_F;
-      this->result = op[0];
-      break;
-   case ir_unop_i2u:
-   case ir_unop_bitcast_f2u:
-      op[0].type = BRW_REGISTER_TYPE_UD;
-      this->result = op[0];
-      break;
-   case ir_unop_u2i:
-   case ir_unop_bitcast_f2i:
-      op[0].type = BRW_REGISTER_TYPE_D;
-      this->result = op[0];
-      break;
-   case ir_unop_i2f:
-   case ir_unop_u2f:
-   case ir_unop_f2i:
-   case ir_unop_f2u:
-      emit(MOV(this->result, op[0]));
-      break;
-
-   case ir_unop_b2i:
-      emit(AND(this->result, op[0], fs_reg(1)));
-      break;
-   case ir_unop_b2f:
-      if (devinfo->gen <= 5) {
-         resolve_bool_comparison(ir->operands[0], &op[0]);
-      }
-      op[0].type = BRW_REGISTER_TYPE_D;
-      this->result.type = BRW_REGISTER_TYPE_D;
-      emit(AND(this->result, op[0], fs_reg(0x3f800000u)));
-      this->result.type = BRW_REGISTER_TYPE_F;
-      break;
-
-   case ir_unop_f2b:
-      emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
-      break;
-   case ir_unop_i2b:
-      emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
-      break;
-
-   case ir_unop_trunc:
-      emit(RNDZ(this->result, op[0]));
-      break;
-   case ir_unop_ceil: {
-         fs_reg tmp = vgrf(ir->type);
-         op[0].negate = !op[0].negate;
-         emit(RNDD(tmp, op[0]));
-         tmp.negate = true;
-         emit(MOV(this->result, tmp));
-      }
-      break;
-   case ir_unop_floor:
-      emit(RNDD(this->result, op[0]));
-      break;
-   case ir_unop_fract:
-      emit(FRC(this->result, op[0]));
-      break;
-   case ir_unop_round_even:
-      emit(RNDE(this->result, op[0]));
-      break;
-
-   case ir_binop_min:
-   case ir_binop_max:
-      resolve_ud_negate(&op[0]);
-      resolve_ud_negate(&op[1]);
-      emit_minmax(ir->operation == ir_binop_min ?
-                  BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
-                  this->result, op[0], op[1]);
-      break;
-   case ir_unop_pack_snorm_2x16:
-   case ir_unop_pack_snorm_4x8:
-   case ir_unop_pack_unorm_2x16:
-   case ir_unop_pack_unorm_4x8:
-   case ir_unop_unpack_snorm_2x16:
-   case ir_unop_unpack_snorm_4x8:
-   case ir_unop_unpack_unorm_2x16:
-   case ir_unop_unpack_unorm_4x8:
-   case ir_unop_unpack_half_2x16:
-   case ir_unop_pack_half_2x16:
-      unreachable("not reached: should be handled by lower_packing_builtins");
-   case ir_unop_unpack_half_2x16_split_x:
-      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
-      break;
-   case ir_unop_unpack_half_2x16_split_y:
-      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
-      break;
-   case ir_binop_pow:
-      emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
-      break;
-
-   case ir_unop_bitfield_reverse:
-      emit(BFREV(this->result, op[0]));
-      break;
-   case ir_unop_bit_count:
-      emit(CBIT(this->result, op[0]));
-      break;
-   case ir_unop_find_msb:
-      temp = vgrf(glsl_type::uint_type);
-      emit(FBH(temp, op[0]));
-
-      /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
-       * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
-       * subtract the result from 31 to convert the MSB count into an LSB count.
-       */
-
-      /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
-      emit(MOV(this->result, temp));
-      emit(CMP(reg_null_d, this->result, fs_reg(-1), BRW_CONDITIONAL_NZ));
-
-      temp.negate = true;
-      inst = emit(ADD(this->result, temp, fs_reg(31)));
-      inst->predicate = BRW_PREDICATE_NORMAL;
-      break;
-   case ir_unop_find_lsb:
-      emit(FBL(this->result, op[0]));
-      break;
-   case ir_unop_saturate:
-      inst = emit(MOV(this->result, op[0]));
-      inst->saturate = true;
-      break;
-   case ir_triop_bitfield_extract:
-      /* Note that the instruction's argument order is reversed from GLSL
-       * and the IR.
-       */
-      emit(BFE(this->result, op[2], op[1], op[0]));
-      break;
-   case ir_binop_bfm:
-      emit(BFI1(this->result, op[0], op[1]));
-      break;
-   case ir_triop_bfi:
-      emit(BFI2(this->result, op[0], op[1], op[2]));
-      break;
-   case ir_quadop_bitfield_insert:
-      unreachable("not reached: should be handled by "
-              "lower_instructions::bitfield_insert_to_bfm_bfi");
-
-   case ir_unop_bit_not:
-      emit(NOT(this->result, op[0]));
-      break;
-   case ir_binop_bit_and:
-      emit(AND(this->result, op[0], op[1]));
-      break;
-   case ir_binop_bit_xor:
-      emit(XOR(this->result, op[0], op[1]));
-      break;
-   case ir_binop_bit_or:
-      emit(OR(this->result, op[0], op[1]));
-      break;
-
-   case ir_binop_lshift:
-      emit(SHL(this->result, op[0], op[1]));
-      break;
-
-   case ir_binop_rshift:
-      if (ir->type->base_type == GLSL_TYPE_INT)
-	 emit(ASR(this->result, op[0], op[1]));
-      else
-	 emit(SHR(this->result, op[0], op[1]));
-      break;
-   case ir_binop_pack_half_2x16_split:
-      emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
-      break;
-   case ir_binop_ubo_load: {
-      /* This IR node takes a constant uniform block and a constant or
-       * variable byte offset within the block and loads a vector from that.
-       */
-      ir_constant *const_uniform_block = ir->operands[0]->as_constant();
-      ir_constant *const_offset = ir->operands[1]->as_constant();
-      fs_reg surf_index;
-      uint32_t binding, set, index, set_index;
-
-      if (const_uniform_block) {
-         /* The block index is a constant, so just emit the binding table entry
-          * as an immediate.
-          */
-         index = const_uniform_block->value.u[0];
-         set = shader->base.UniformBlocks[index].Set;
-         set_index = shader->base.UniformBlocks[index].Binding;
-         binding = stage_prog_data->bind_map[set].index[set_index];
-         surf_index = fs_reg(binding);
-      } else {
-         assert(0 && "need more info from the ir for this.");
-
-         /* The block index is not a constant. Evaluate the index expression
-          * per-channel and add the base UBO index; we have to select a value
-          * from any live channel.
-          */
-         surf_index = vgrf(glsl_type::uint_type);
-         emit(ADD(surf_index, op[0],
-                  fs_reg(stage_prog_data->binding_table.ubo_start)));
-         emit_uniformize(surf_index, surf_index);
-
-         /* Assume this may touch any UBO. It would be nice to provide
-          * a tighter bound, but the array information is already lowered away.
-          */
-         brw_mark_surface_used(prog_data,
-                               stage_prog_data->binding_table.ubo_start +
-                               shader_prog->NumUniformBlocks - 1);
-      }
-
-      if (const_offset) {
-         fs_reg packed_consts = vgrf(glsl_type::float_type);
-         packed_consts.type = result.type;
-
-         fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
-         emit(new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
-                                   packed_consts, surf_index, const_offset_reg));
-
-         for (int i = 0; i < ir->type->vector_elements; i++) {
-            packed_consts.set_smear(const_offset->value.u[0] % 16 / 4 + i);
-
-            /* The std140 packing rules don't allow vectors to cross 16-byte
-             * boundaries, and a reg is 32 bytes.
-             */
-            assert(packed_consts.subreg_offset < 32);
-
-            /* UBO bools are any nonzero value.  We consider bools to be
-             * values with the low bit set to 1.  Convert them using CMP.
-             */
-            if (ir->type->base_type == GLSL_TYPE_BOOL) {
-               emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
-            } else {
-               emit(MOV(result, packed_consts));
-            }
-
-            result = offset(result, 1);
-         }
-      } else {
-         /* Turn the byte offset into a dword offset. */
-         fs_reg base_offset = vgrf(glsl_type::int_type);
-         emit(SHR(base_offset, op[1], fs_reg(2)));
-
-         for (int i = 0; i < ir->type->vector_elements; i++) {
-            emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index,
-                                            base_offset, i));
-
-            if (ir->type->base_type == GLSL_TYPE_BOOL)
-               emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));
-
-            result = offset(result, 1);
-         }
-      }
-
-      result.reg_offset = 0;
-      break;
-   }
-
-   case ir_triop_fma:
-      /* Note that the instruction's argument order is reversed from GLSL
-       * and the IR.
-       */
-      emit(MAD(this->result, op[2], op[1], op[0]));
-      break;
-
-   case ir_triop_lrp:
-      emit_lrp(this->result, op[0], op[1], op[2]);
-      break;
-
-   case ir_triop_csel:
-   case ir_unop_interpolate_at_centroid:
-   case ir_binop_interpolate_at_offset:
-   case ir_binop_interpolate_at_sample:
-      unreachable("already handled above");
-      break;
-
-   case ir_unop_d2f:
-   case ir_unop_f2d:
-   case ir_unop_d2i:
-   case ir_unop_i2d:
-   case ir_unop_d2u:
-   case ir_unop_u2d:
-   case ir_unop_d2b:
-   case ir_unop_pack_double_2x32:
-   case ir_unop_unpack_double_2x32:
-   case ir_unop_frexp_sig:
-   case ir_unop_frexp_exp:
-      unreachable("fp64 todo");
-      break;
-   }
-}
-
-void
-fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
-				   const glsl_type *type, bool predicated)
-{
-   switch (type->base_type) {
-   case GLSL_TYPE_FLOAT:
-   case GLSL_TYPE_UINT:
-   case GLSL_TYPE_INT:
-   case GLSL_TYPE_BOOL:
-      for (unsigned int i = 0; i < type->components(); i++) {
-	 l.type = brw_type_for_base_type(type);
-	 r.type = brw_type_for_base_type(type);
-
-	 if (predicated || !l.equals(r)) {
-	    fs_inst *inst = emit(MOV(l, r));
-	    inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
-	 }
-
-	 l = offset(l, 1);
-	 r = offset(r, 1);
-      }
-      break;
-   case GLSL_TYPE_ARRAY:
-      for (unsigned int i = 0; i < type->length; i++) {
-	 emit_assignment_writes(l, r, type->fields.array, predicated);
-      }
-      break;
-
-   case GLSL_TYPE_STRUCT:
-      for (unsigned int i = 0; i < type->length; i++) {
-	 emit_assignment_writes(l, r, type->fields.structure[i].type,
-				predicated);
-      }
-      break;
-
-   case GLSL_TYPE_SAMPLER:
-   case GLSL_TYPE_IMAGE:
-   case GLSL_TYPE_ATOMIC_UINT:
-      break;
-
-   case GLSL_TYPE_DOUBLE:
-   case GLSL_TYPE_VOID:
-   case GLSL_TYPE_ERROR:
-   case GLSL_TYPE_INTERFACE:
-   case GLSL_TYPE_FUNCTION:
-      unreachable("not reached");
-   }
-}
-
-/* If the RHS processing resulted in an instruction generating a
- * temporary value, and it would be easy to rewrite the instruction to
- * generate its result right into the LHS instead, do so.  This ends
- * up reliably removing instructions where it can be tricky to do so
- * later without real UD chain information.
- */
-bool
-fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
-                                   fs_reg dst,
-                                   fs_reg src,
-                                   fs_inst *pre_rhs_inst,
-                                   fs_inst *last_rhs_inst)
-{
-   /* Only attempt if we're doing a direct assignment. */
-   if (ir->condition ||
-       !(ir->lhs->type->is_scalar() ||
-        (ir->lhs->type->is_vector() &&
-         ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
-      return false;
-
-   /* Make sure the last instruction generated our source reg. */
-   fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
-						    last_rhs_inst,
-						    src);
-   if (!modify)
-      return false;
-
-   /* If last_rhs_inst wrote a different number of components than our LHS,
-    * we can't safely rewrite it.
-    */
-   if (alloc.sizes[dst.reg] != modify->regs_written)
-      return false;
-
-   /* Success!  Rewrite the instruction. */
-   modify->dst = dst;
-
-   return true;
-}
-
-void
-fs_visitor::visit(ir_assignment *ir)
-{
-   fs_reg l, r;
-   fs_inst *inst;
-
-   /* FINISHME: arrays on the lhs */
-   ir->lhs->accept(this);
-   l = this->result;
-
-   fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
-
-   ir->rhs->accept(this);
-   r = this->result;
-
-   fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
-
-   assert(l.file != BAD_FILE);
-   assert(r.file != BAD_FILE);
-
-   if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
-      return;
-
-   if (ir->condition) {
-      emit_bool_to_cond_code(ir->condition);
-   }
-
-   if (ir->lhs->type->is_scalar() ||
-       ir->lhs->type->is_vector()) {
-      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
-	 if (ir->write_mask & (1 << i)) {
-	    inst = emit(MOV(l, r));
-	    if (ir->condition)
-	       inst->predicate = BRW_PREDICATE_NORMAL;
-	    r = offset(r, 1);
-	 }
-	 l = offset(l, 1);
-      }
-   } else {
-      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
-   }
-}
-
 fs_inst *
 fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
                               fs_reg coordinate, int coord_components,
@@ -1458,7 +94,7 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
 
    if (shadow_c.file != BAD_FILE) {
       for (int i = 0; i < coord_components; i++) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
 	 coordinate = offset(coordinate, 1);
       }
 
@@ -1466,7 +102,7 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
        * the unused slots must be zeroed.
        */
       for (int i = coord_components; i < 3; i++) {
-         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f));
       }
       mlen += 3;
 
@@ -1474,25 +110,25 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
 	 /* There's no plain shadow compare message, so we use shadow
 	  * compare with a bias of 0.0.
 	  */
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
 	 mlen++;
       } else if (op == ir_txb || op == ir_txl) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen), lod);
 	 mlen++;
       } else {
          unreachable("Should not get here.");
       }
 
-      emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
+      bld.MOV(fs_reg(MRF, base_mrf + mlen), shadow_c);
       mlen++;
    } else if (op == ir_tex) {
       for (int i = 0; i < coord_components; i++) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
 	 coordinate = offset(coordinate, 1);
       }
       /* zero the others. */
       for (int i = coord_components; i<3; i++) {
-         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f));
       }
       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
       mlen += 3;
@@ -1500,7 +136,7 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
       fs_reg &dPdx = lod;
 
       for (int i = 0; i < coord_components; i++) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
 	 coordinate = offset(coordinate, 1);
       }
       /* the slots for u and v are always present, but r is optional */
@@ -1521,20 +157,20 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
        *        m5     m6     m7     m8     m9     m10
        */
       for (int i = 0; i < grad_components; i++) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen), dPdx);
 	 dPdx = offset(dPdx, 1);
       }
       mlen += MAX2(grad_components, 2);
 
       for (int i = 0; i < grad_components; i++) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen), dPdy);
 	 dPdy = offset(dPdy, 1);
       }
       mlen += MAX2(grad_components, 2);
    } else if (op == ir_txs) {
       /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
       simd16 = true;
-      emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
+      bld.MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
       mlen += 2;
    } else {
       /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
@@ -1544,8 +180,8 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
       assert(op == ir_txb || op == ir_txl || op == ir_txf);
 
       for (int i = 0; i < coord_components; i++) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
-                  coordinate));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
+                 coordinate);
 	 coordinate = offset(coordinate, 1);
       }
 
@@ -1553,13 +189,13 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
        * be necessary for TXF (ld), but seems wise to do for all messages.
        */
       for (int i = coord_components; i < 3; i++) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f));
       }
 
       /* lod/bias appears after u/v/r. */
       mlen += 6;
 
-      emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
+      bld.MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod);
       mlen++;
 
       /* The unused upper half. */
@@ -1587,7 +223,7 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
       unreachable("not reached");
    }
 
-   fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
+   fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler));
    inst->base_mrf = base_mrf;
    inst->mlen = mlen;
    inst->header_size = 1;
@@ -1595,7 +231,7 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
 
    if (simd16) {
       for (int i = 0; i < 4; i++) {
-	 emit(MOV(orig_dst, dst));
+         bld.MOV(orig_dst, dst);
 	 orig_dst = offset(orig_dst, 1);
 	 dst = offset(dst, 2);
       }
@@ -1621,7 +257,7 @@ fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
 
    /* Copy the coordinates. */
    for (int i = 0; i < vector_elements; i++) {
-      emit(MOV(retype(offset(message, i), coordinate.type), coordinate));
+      bld.MOV(retype(offset(message, i), coordinate.type), coordinate);
       coordinate = offset(coordinate, 1);
    }
 
@@ -1630,20 +266,20 @@ fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
    /* Messages other than sample and ld require all three components */
    if (has_lod || shadow_c.file != BAD_FILE) {
       for (int i = vector_elements; i < 3; i++) {
-         emit(MOV(offset(message, i), fs_reg(0.0f)));
+         bld.MOV(offset(message, i), fs_reg(0.0f));
       }
    }
 
    if (has_lod) {
       fs_reg msg_lod = retype(offset(message, 3), op == ir_txf ?
                               BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
-      emit(MOV(msg_lod, lod));
+      bld.MOV(msg_lod, lod);
       msg_end = offset(msg_lod, 1);
    }
 
    if (shadow_c.file != BAD_FILE) {
       fs_reg msg_ref = offset(message, 3 + has_lod);
-      emit(MOV(msg_ref, shadow_c));
+      bld.MOV(msg_ref, shadow_c);
       msg_end = offset(msg_ref, 1);
    }
 
@@ -1658,7 +294,7 @@ fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
    default: unreachable("not reached");
    }
 
-   fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
+   fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler));
    inst->base_mrf = message.reg - 1;
    inst->mlen = msg_end.reg - inst->base_mrf;
    inst->header_size = 1;
@@ -1698,7 +334,7 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
    }
 
    for (int i = 0; i < vector_elements; i++) {
-      emit(MOV(retype(offset(msg_coords, i), coordinate.type), coordinate));
+      bld.MOV(retype(offset(msg_coords, i), coordinate.type), coordinate);
       coordinate = offset(coordinate, 1);
    }
    fs_reg msg_end = offset(msg_coords, vector_elements);
@@ -1706,7 +342,7 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
 
    if (shadow_c.file != BAD_FILE) {
       fs_reg msg_shadow = msg_lod;
-      emit(MOV(msg_shadow, shadow_c));
+      bld.MOV(msg_shadow, shadow_c);
       msg_lod = offset(msg_shadow, 1);
       msg_end = msg_lod;
    }
@@ -1717,13 +353,13 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
       opcode = SHADER_OPCODE_TEX;
       break;
    case ir_txb:
-      emit(MOV(msg_lod, lod));
+      bld.MOV(msg_lod, lod);
       msg_end = offset(msg_lod, 1);
 
       opcode = FS_OPCODE_TXB;
       break;
    case ir_txl:
-      emit(MOV(msg_lod, lod));
+      bld.MOV(msg_lod, lod);
       msg_end = offset(msg_lod, 1);
 
       opcode = SHADER_OPCODE_TXL;
@@ -1740,11 +376,11 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
        */
       msg_end = msg_lod;
       for (int i = 0; i < grad_components; i++) {
-         emit(MOV(msg_end, lod));
+         bld.MOV(msg_end, lod);
          lod = offset(lod, 1);
          msg_end = offset(msg_end, 1);
 
-         emit(MOV(msg_end, lod2));
+         bld.MOV(msg_end, lod2);
          lod2 = offset(lod2, 1);
          msg_end = offset(msg_end, 1);
       }
@@ -1754,21 +390,21 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
    }
    case ir_txs:
       msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
-      emit(MOV(msg_lod, lod));
+      bld.MOV(msg_lod, lod);
       msg_end = offset(msg_lod, 1);
 
       opcode = SHADER_OPCODE_TXS;
       break;
    case ir_query_levels:
       msg_lod = msg_end;
-      emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
+      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
       msg_end = offset(msg_lod, 1);
 
       opcode = SHADER_OPCODE_TXS;
       break;
    case ir_txf:
       msg_lod = offset(msg_coords, 3);
-      emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod));
+      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
       msg_end = offset(msg_lod, 1);
 
       opcode = SHADER_OPCODE_TXF;
@@ -1776,9 +412,9 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
    case ir_txf_ms:
       msg_lod = offset(msg_coords, 3);
       /* lod */
-      emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
+      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
       /* sample index */
-      emit(MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index));
+      bld.MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index);
       msg_end = offset(msg_lod, 2);
 
       opcode = SHADER_OPCODE_TXF_CMS;
@@ -1793,7 +429,7 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
       unreachable("not reached");
    }
 
-   fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
+   fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler));
    inst->base_mrf = message.reg;
    inst->mlen = msg_end.reg - message.reg;
    inst->header_size = header_size;
@@ -1851,7 +487,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
    }
 
    if (shadow_c.file != BAD_FILE) {
-      emit(MOV(sources[length], shadow_c));
+      bld.MOV(sources[length], shadow_c);
       length++;
    }
 
@@ -1874,11 +510,11 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
    case ir_lod:
       break;
    case ir_txb:
-      emit(MOV(sources[length], lod));
+      bld.MOV(sources[length], lod);
       length++;
       break;
    case ir_txl:
-      emit(MOV(sources[length], lod));
+      bld.MOV(sources[length], lod);
       length++;
       break;
    case ir_txd: {
@@ -1888,7 +524,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
        */
       for (int i = 0; i < coord_components; i++) {
-	 emit(MOV(sources[length], coordinate));
+         bld.MOV(sources[length], coordinate);
 	 coordinate = offset(coordinate, 1);
 	 length++;
 
@@ -1896,11 +532,11 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
           * only derivatives for (u, v, r).
           */
          if (i < grad_components) {
-            emit(MOV(sources[length], lod));
+            bld.MOV(sources[length], lod);
             lod = offset(lod, 1);
             length++;
 
-            emit(MOV(sources[length], lod2));
+            bld.MOV(sources[length], lod2);
             lod2 = offset(lod2, 1);
             length++;
          }
@@ -1910,11 +546,11 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
       break;
    }
    case ir_txs:
-      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod));
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
       length++;
       break;
    case ir_query_levels:
-      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u)));
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u));
       length++;
       break;
    case ir_txf:
@@ -1922,23 +558,23 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
        * On Gen9 they are u, v, lod, r
        */
 
-      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
       coordinate = offset(coordinate, 1);
       length++;
 
       if (devinfo->gen >= 9) {
          if (coord_components >= 2) {
-            emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
+            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
             coordinate = offset(coordinate, 1);
          }
          length++;
       }
 
-      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod));
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
       length++;
 
       for (int i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
-	 emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
 	 coordinate = offset(coordinate, 1);
 	 length++;
       }
@@ -1946,18 +582,18 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
       coordinate_done = true;
       break;
    case ir_txf_ms:
-      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index));
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
       length++;
 
       /* data from the multisample control surface */
-      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs));
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
       length++;
 
       /* there is no offsetting for this message; just copy in the integer
        * texture coordinates
        */
       for (int i = 0; i < coord_components; i++) {
-         emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
          coordinate = offset(coordinate, 1);
          length++;
       }
@@ -1971,19 +607,19 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
 
          /* More crazy intermixing */
          for (int i = 0; i < 2; i++) { /* u, v */
-            emit(MOV(sources[length], coordinate));
+            bld.MOV(sources[length], coordinate);
             coordinate = offset(coordinate, 1);
             length++;
          }
 
          for (int i = 0; i < 2; i++) { /* offu, offv */
-            emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value));
+            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value);
             offset_value = offset(offset_value, 1);
             length++;
          }
 
          if (coord_components == 3) { /* r if present */
-            emit(MOV(sources[length], coordinate));
+            bld.MOV(sources[length], coordinate);
             coordinate = offset(coordinate, 1);
             length++;
          }
@@ -1996,7 +632,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
    /* Set up the coordinate (except for cases where it was done above) */
    if (!coordinate_done) {
       for (int i = 0; i < coord_components; i++) {
-         emit(MOV(sources[length], coordinate));
+         bld.MOV(sources[length], coordinate);
          coordinate = offset(coordinate, 1);
          length++;
       }
@@ -2010,7 +646,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
 
    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
                                BRW_REGISTER_TYPE_F, dispatch_width);
-   emit(LOAD_PAYLOAD(src_payload, sources, length, header_size));
+   bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
 
    /* Generate the SEND */
    enum opcode opcode;
@@ -2033,7 +669,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
    default:
       unreachable("not reached");
    }
-   fs_inst *inst = emit(opcode, dst, src_payload, sampler);
+   fs_inst *inst = bld.emit(opcode, dst, src_payload, sampler);
    inst->base_mrf = -1;
    inst->mlen = mlen;
    inst->header_size = header_size;
@@ -2051,7 +687,6 @@ fs_reg
 fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
                              bool is_rect, uint32_t sampler, int texunit)
 {
-   fs_inst *inst = NULL;
    bool needs_gl_clamp = true;
    fs_reg scale_x, scale_y;
 
@@ -2110,10 +745,10 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
       fs_reg src = coordinate;
       coordinate = dst;
 
-      emit(MUL(dst, src, scale_x));
+      bld.MUL(dst, src, scale_x);
       dst = offset(dst, 1);
       src = offset(src, 1);
-      emit(MUL(dst, src, scale_y));
+      bld.MUL(dst, src, scale_y);
    } else if (is_rect) {
       /* On gen6+, the sampler handles the rectangle coordinates
        * natively, without needing rescaling.  But that means we have
@@ -2127,8 +762,8 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
 	    fs_reg chan = coordinate;
 	    chan = offset(chan, i);
 
-	    inst = emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f));
-	    inst->conditional_mod = BRW_CONDITIONAL_GE;
+            set_condmod(BRW_CONDITIONAL_GE,
+                        bld.emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f)));
 
 	    /* Our parameter comes in as 1.0/width or 1.0/height,
 	     * because that's what people normally want for doing
@@ -2137,11 +772,11 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
 	     * parameter type, so just invert back.
 	     */
 	    fs_reg limit = vgrf(glsl_type::float_type);
-	    emit(MOV(limit, i == 0 ? scale_x : scale_y));
-	    emit(SHADER_OPCODE_RCP, limit, limit);
+            bld.MOV(limit, i == 0 ? scale_x : scale_y);
+            bld.emit(SHADER_OPCODE_RCP, limit, limit);
 
-	    inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
-	    inst->conditional_mod = BRW_CONDITIONAL_L;
+            set_condmod(BRW_CONDITIONAL_L,
+                        bld.emit(BRW_OPCODE_SEL, chan, chan, limit));
 	 }
       }
    }
@@ -2151,9 +786,7 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
 	 if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
 	    fs_reg chan = coordinate;
 	    chan = offset(chan, i);
-
-	    fs_inst *inst = emit(MOV(chan, chan));
-	    inst->saturate = true;
+            set_saturate(true, bld.MOV(chan, chan));
 	 }
       }
    }
@@ -2173,13 +806,13 @@ fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
    /* parameters are: u, v, r; missing parameters are treated as zero */
    for (int i = 0; i < components; i++) {
       sources[i] = vgrf(glsl_type::float_type);
-      emit(MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate));
+      bld.MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate);
       coordinate = offset(coordinate, 1);
    }
 
-   emit(LOAD_PAYLOAD(payload, sources, components, 0));
+   bld.LOAD_PAYLOAD(payload, sources, components, 0);
 
-   fs_inst *inst = emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
+   fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
    inst->base_mrf = -1;
    inst->mlen = components * reg_width;
    inst->header_size = 0;
@@ -2219,7 +852,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
          this->result = res;
 
          for (int i=0; i<4; i++) {
-            emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)));
+            bld.MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f));
             res = offset(res, 1);
          }
          return;
@@ -2276,7 +909,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
    if (op == ir_txs && is_cube_array) {
       fs_reg depth = offset(dst, 2);
       fs_reg fixed_depth = vgrf(glsl_type::int_type);
-      emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
+      bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
 
       fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
       int components = inst->regs_written / (dst.width / 8);
@@ -2287,167 +920,12 @@ fs_visitor::emit_texture(ir_texture_opcode op,
             fixed_payload[i] = offset(dst, i);
          }
       }
-      emit(LOAD_PAYLOAD(dst, fixed_payload, components, 0));
+      bld.LOAD_PAYLOAD(dst, fixed_payload, components, 0);
    }
 
    swizzle_result(op, dest_type->vector_elements, dst, sampler);
 }
 
-void
-fs_visitor::visit(ir_texture *ir)
-{
-   uint32_t sampler;
-
-   ir_dereference_variable *deref_var = ir->sampler->as_dereference_variable();
-   assert(deref_var);
-   ir_variable *var = deref_var->var;
-
-   sampler = stage_prog_data->bind_map[var->data.set].index[var->data.index];
-
-   ir_rvalue *nonconst_sampler_index =
-      _mesa_get_sampler_array_nonconst_index(ir->sampler);
-
-   /* Handle non-constant sampler array indexing */
-   fs_reg sampler_reg;
-   if (nonconst_sampler_index) {
-      /* The highest sampler which may be used by this operation is
-       * the last element of the array. Mark it here, because the generator
-       * doesn't have enough information to determine the bound.
-       */
-      uint32_t array_size = ir->sampler->as_dereference_array()
-         ->array->type->array_size();
-
-      uint32_t max_used = sampler + array_size - 1;
-      if (ir->op == ir_tg4 && devinfo->gen < 8) {
-         max_used += stage_prog_data->binding_table.gather_texture_start;
-      } else {
-         max_used += stage_prog_data->binding_table.texture_start;
-      }
-
-      brw_mark_surface_used(prog_data, max_used);
-
-      /* Emit code to evaluate the actual indexing expression */
-      nonconst_sampler_index->accept(this);
-      fs_reg temp = vgrf(glsl_type::uint_type);
-      emit(ADD(temp, this->result, fs_reg(sampler)));
-      emit_uniformize(temp, temp);
-
-      sampler_reg = temp;
-   } else {
-      /* Single sampler, or constant array index; the indexing expression
-       * is just an immediate.
-       */
-      sampler_reg = fs_reg(sampler);
-   }
-
-   /* FINISHME: We're failing to recompile our programs when the sampler is
-    * updated.  This only matters for the texture rectangle scale parameters
-    * (pre-gen6, or gen6+ with GL_CLAMP).
-    */
-   int texunit = prog->SamplerUnits[sampler];
-
-   /* Should be lowered by do_lower_texture_projection */
-   assert(!ir->projector);
-
-   /* Should be lowered */
-   assert(!ir->offset || !ir->offset->type->is_array());
-
-   /* Generate code to compute all the subexpression trees.  This has to be
-    * done before loading any values into MRFs for the sampler message since
-    * generating these values may involve SEND messages that need the MRFs.
-    */
-   fs_reg coordinate;
-   int coord_components = 0;
-   if (ir->coordinate) {
-      coord_components = ir->coordinate->type->vector_elements;
-      ir->coordinate->accept(this);
-      coordinate = this->result;
-   }
-
-   fs_reg shadow_comparitor;
-   if (ir->shadow_comparitor) {
-      ir->shadow_comparitor->accept(this);
-      shadow_comparitor = this->result;
-   }
-
-   fs_reg offset_value;
-   if (ir->offset) {
-      ir_constant *const_offset = ir->offset->as_constant();
-      if (const_offset) {
-         /* Store the header bitfield in an IMM register.  This allows us to
-          * use offset_value.file to distinguish between no offset, a constant
-          * offset, and a non-constant offset.
-          */
-         offset_value =
-            fs_reg(brw_texture_offset(const_offset->value.i,
-                                      const_offset->type->vector_elements));
-      } else {
-         ir->offset->accept(this);
-         offset_value = this->result;
-      }
-   }
-
-   fs_reg lod, lod2, sample_index, mcs;
-   int grad_components = 0;
-   switch (ir->op) {
-   case ir_tex:
-   case ir_lod:
-   case ir_tg4:
-   case ir_query_levels:
-      break;
-   case ir_txb:
-      ir->lod_info.bias->accept(this);
-      lod = this->result;
-      break;
-   case ir_txd:
-      ir->lod_info.grad.dPdx->accept(this);
-      lod = this->result;
-
-      ir->lod_info.grad.dPdy->accept(this);
-      lod2 = this->result;
-
-      grad_components = ir->lod_info.grad.dPdx->type->vector_elements;
-      break;
-   case ir_txf:
-   case ir_txl:
-   case ir_txs:
-      ir->lod_info.lod->accept(this);
-      lod = this->result;
-      break;
-   case ir_txf_ms:
-      ir->lod_info.sample_index->accept(this);
-      sample_index = this->result;
-
-      if (devinfo->gen >= 7 &&
-          key_tex->compressed_multisample_layout_mask & (1 << sampler)) {
-         mcs = emit_mcs_fetch(coordinate, ir->coordinate->type->vector_elements,
-                              sampler_reg);
-      } else {
-         mcs = fs_reg(0u);
-      }
-      break;
-   default:
-      unreachable("Unrecognized texture opcode");
-   };
-
-   int gather_component = 0;
-   if (ir->op == ir_tg4)
-      gather_component = ir->lod_info.component->as_constant()->value.i[0];
-
-   bool is_rect =
-      ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT;
-
-   bool is_cube_array =
-      ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
-      ir->sampler->type->sampler_array;
-
-   emit_texture(ir->op, ir->type, coordinate, coord_components,
-                shadow_comparitor, lod, lod2, grad_components,
-                sample_index, offset_value, mcs,
-                gather_component, is_cube_array, is_rect, sampler,
-                sampler_reg, texunit);
-}
-
 /**
  * Apply workarounds for Gen6 gather with UINT/SINT
  */
@@ -2462,16 +940,16 @@ fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
    for (int i = 0; i < 4; i++) {
       fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
       /* Convert from UNORM to UINT */
-      emit(MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1))));
-      emit(MOV(dst, dst_f));
+      bld.MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1)));
+      bld.MOV(dst, dst_f);
 
       if (wa & WA_SIGN) {
          /* Reinterpret the UINT value as a signed INT value by
           * shifting the sign bit into place, then shifting back
           * preserving sign.
           */
-         emit(SHL(dst, dst, fs_reg(32 - width)));
-         emit(ASR(dst, dst, fs_reg(32 - width)));
+         bld.SHL(dst, dst, fs_reg(32 - width));
+         bld.ASR(dst, dst, fs_reg(32 - width));
       }
 
       dst = offset(dst, 1);
@@ -2535,461 +1013,18 @@ fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
 	 l = offset(l, i);
 
 	 if (swiz == SWIZZLE_ZERO) {
-	    emit(MOV(l, fs_reg(0.0f)));
+            bld.MOV(l, fs_reg(0.0f));
 	 } else if (swiz == SWIZZLE_ONE) {
-	    emit(MOV(l, fs_reg(1.0f)));
+            bld.MOV(l, fs_reg(1.0f));
 	 } else {
-            emit(MOV(l, offset(orig_val,
-                               GET_SWZ(key_tex->swizzles[sampler], i))));
+            bld.MOV(l, offset(orig_val,
+                              GET_SWZ(key_tex->swizzles[sampler], i)));
 	 }
       }
       this->result = swizzled_result;
    }
 }
 
-void
-fs_visitor::visit(ir_swizzle *ir)
-{
-   ir->val->accept(this);
-   fs_reg val = this->result;
-
-   if (ir->type->vector_elements == 1) {
-      this->result = offset(this->result, ir->mask.x);
-      return;
-   }
-
-   fs_reg result = vgrf(ir->type);
-   this->result = result;
-
-   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
-      fs_reg channel = val;
-      int swiz = 0;
-
-      switch (i) {
-      case 0:
-	 swiz = ir->mask.x;
-	 break;
-      case 1:
-	 swiz = ir->mask.y;
-	 break;
-      case 2:
-	 swiz = ir->mask.z;
-	 break;
-      case 3:
-	 swiz = ir->mask.w;
-	 break;
-      }
-
-      emit(MOV(result, offset(channel, swiz)));
-      result = offset(result, 1);
-   }
-}
-
-void
-fs_visitor::visit(ir_discard *ir)
-{
-   /* We track our discarded pixels in f0.1.  By predicating on it, we can
-    * update just the flag bits that aren't yet discarded.  If there's no
-    * condition, we emit a CMP of g0 != g0, so all currently executing
-    * channels will get turned off.
-    */
-   fs_inst *cmp;
-   if (ir->condition) {
-      emit_bool_to_cond_code(ir->condition);
-      cmp = (fs_inst *) this->instructions.get_tail();
-      cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);
-   } else {
-      fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
-                                      BRW_REGISTER_TYPE_UW));
-      cmp = emit(CMP(reg_null_f, some_reg, some_reg, BRW_CONDITIONAL_NZ));
-   }
-   cmp->predicate = BRW_PREDICATE_NORMAL;
-   cmp->flag_subreg = 1;
-
-   if (devinfo->gen >= 6) {
-      emit_discard_jump();
-   }
-}
-
-void
-fs_visitor::visit(ir_constant *ir)
-{
-   /* Set this->result to reg at the bottom of the function because some code
-    * paths will cause this visitor to be applied to other fields.  This will
-    * cause the value stored in this->result to be modified.
-    *
-    * Make reg constant so that it doesn't get accidentally modified along the
-    * way.  Yes, I actually had this problem. :(
-    */
-   const fs_reg reg = vgrf(ir->type);
-   fs_reg dst_reg = reg;
-
-   if (ir->type->is_array()) {
-      const unsigned size = type_size(ir->type->fields.array);
-
-      for (unsigned i = 0; i < ir->type->length; i++) {
-	 ir->array_elements[i]->accept(this);
-	 fs_reg src_reg = this->result;
-
-	 dst_reg.type = src_reg.type;
-	 for (unsigned j = 0; j < size; j++) {
-	    emit(MOV(dst_reg, src_reg));
-	    src_reg = offset(src_reg, 1);
-	    dst_reg = offset(dst_reg, 1);
-	 }
-      }
-   } else if (ir->type->is_record()) {
-      foreach_in_list(ir_constant, field, &ir->components) {
-	 const unsigned size = type_size(field->type);
-
-	 field->accept(this);
-	 fs_reg src_reg = this->result;
-
-	 dst_reg.type = src_reg.type;
-	 for (unsigned j = 0; j < size; j++) {
-	    emit(MOV(dst_reg, src_reg));
-	    src_reg = offset(src_reg, 1);
-	    dst_reg = offset(dst_reg, 1);
-	 }
-      }
-   } else {
-      const unsigned size = type_size(ir->type);
-
-      for (unsigned i = 0; i < size; i++) {
-	 switch (ir->type->base_type) {
-	 case GLSL_TYPE_FLOAT:
-	    emit(MOV(dst_reg, fs_reg(ir->value.f[i])));
-	    break;
-	 case GLSL_TYPE_UINT:
-	    emit(MOV(dst_reg, fs_reg(ir->value.u[i])));
-	    break;
-	 case GLSL_TYPE_INT:
-	    emit(MOV(dst_reg, fs_reg(ir->value.i[i])));
-	    break;
-	 case GLSL_TYPE_BOOL:
-            emit(MOV(dst_reg, fs_reg(ir->value.b[i] != 0 ? ~0 : 0)));
-	    break;
-	 default:
-	    unreachable("Non-float/uint/int/bool constant");
-	 }
-	 dst_reg = offset(dst_reg, 1);
-      }
-   }
-
-   this->result = reg;
-}
-
-void
-fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
-{
-   ir_expression *expr = ir->as_expression();
-
-   if (!expr || expr->operation == ir_binop_ubo_load) {
-      ir->accept(this);
-
-      fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1)));
-      inst->conditional_mod = BRW_CONDITIONAL_NZ;
-      return;
-   }
-
-   fs_reg op[3];
-
-   assert(expr->get_num_operands() <= 3);
-   for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
-      assert(expr->operands[i]->type->is_scalar());
-
-      expr->operands[i]->accept(this);
-      op[i] = this->result;
-
-      resolve_ud_negate(&op[i]);
-   }
-
-   emit_bool_to_cond_code_of_reg(expr, op);
-}
-
-void
-fs_visitor::emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3])
-{
-   fs_inst *inst;
-
-   switch (expr->operation) {
-   case ir_unop_logic_not:
-      inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
-      inst->conditional_mod = BRW_CONDITIONAL_Z;
-      break;
-
-   case ir_binop_logic_xor:
-      if (devinfo->gen <= 5) {
-         fs_reg temp = vgrf(expr->type);
-         emit(XOR(temp, op[0], op[1]));
-         inst = emit(AND(reg_null_d, temp, fs_reg(1)));
-      } else {
-         inst = emit(XOR(reg_null_d, op[0], op[1]));
-      }
-      inst->conditional_mod = BRW_CONDITIONAL_NZ;
-      break;
-
-   case ir_binop_logic_or:
-      if (devinfo->gen <= 5) {
-         fs_reg temp = vgrf(expr->type);
-         emit(OR(temp, op[0], op[1]));
-         inst = emit(AND(reg_null_d, temp, fs_reg(1)));
-      } else {
-         inst = emit(OR(reg_null_d, op[0], op[1]));
-      }
-      inst->conditional_mod = BRW_CONDITIONAL_NZ;
-      break;
-
-   case ir_binop_logic_and:
-      if (devinfo->gen <= 5) {
-         fs_reg temp = vgrf(expr->type);
-         emit(AND(temp, op[0], op[1]));
-         inst = emit(AND(reg_null_d, temp, fs_reg(1)));
-      } else {
-         inst = emit(AND(reg_null_d, op[0], op[1]));
-      }
-      inst->conditional_mod = BRW_CONDITIONAL_NZ;
-      break;
-
-   case ir_unop_f2b:
-      if (devinfo->gen >= 6) {
-         emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
-      } else {
-         inst = emit(MOV(reg_null_f, op[0]));
-         inst->conditional_mod = BRW_CONDITIONAL_NZ;
-      }
-      break;
-
-   case ir_unop_i2b:
-      if (devinfo->gen >= 6) {
-         emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
-      } else {
-         inst = emit(MOV(reg_null_d, op[0]));
-         inst->conditional_mod = BRW_CONDITIONAL_NZ;
-      }
-      break;
-
-   case ir_binop_greater:
-   case ir_binop_gequal:
-   case ir_binop_less:
-   case ir_binop_lequal:
-   case ir_binop_equal:
-   case ir_binop_all_equal:
-   case ir_binop_nequal:
-   case ir_binop_any_nequal:
-      if (devinfo->gen <= 5) {
-         resolve_bool_comparison(expr->operands[0], &op[0]);
-         resolve_bool_comparison(expr->operands[1], &op[1]);
-      }
-
-      emit(CMP(reg_null_d, op[0], op[1],
-               brw_conditional_for_comparison(expr->operation)));
-      break;
-
-   case ir_triop_csel: {
-      /* Expand the boolean condition into the flag register. */
-      inst = emit(MOV(reg_null_d, op[0]));
-      inst->conditional_mod = BRW_CONDITIONAL_NZ;
-
-      /* Select which boolean to return. */
-      fs_reg temp = vgrf(expr->operands[1]->type);
-      inst = emit(SEL(temp, op[1], op[2]));
-      inst->predicate = BRW_PREDICATE_NORMAL;
-
-      /* Expand the result to a condition code. */
-      inst = emit(MOV(reg_null_d, temp));
-      inst->conditional_mod = BRW_CONDITIONAL_NZ;
-      break;
-   }
-
-   default:
-      unreachable("not reached");
-   }
-}
-
-/**
- * Emit a gen6 IF statement with the comparison folded into the IF
- * instruction.
- */
-void
-fs_visitor::emit_if_gen6(ir_if *ir)
-{
-   ir_expression *expr = ir->condition->as_expression();
-
-   if (expr && expr->operation != ir_binop_ubo_load) {
-      fs_reg op[3];
-      fs_inst *inst;
-      fs_reg temp;
-
-      assert(expr->get_num_operands() <= 3);
-      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
-	 assert(expr->operands[i]->type->is_scalar());
-
-	 expr->operands[i]->accept(this);
-	 op[i] = this->result;
-      }
-
-      switch (expr->operation) {
-      case ir_unop_logic_not:
-         emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_Z));
-         return;
-
-      case ir_binop_logic_xor:
-         emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
-         return;
-
-      case ir_binop_logic_or:
-         temp = vgrf(glsl_type::bool_type);
-         emit(OR(temp, op[0], op[1]));
-         emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
-         return;
-
-      case ir_binop_logic_and:
-         temp = vgrf(glsl_type::bool_type);
-         emit(AND(temp, op[0], op[1]));
-         emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
-         return;
-
-      case ir_unop_f2b:
-	 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
-	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
-	 return;
-
-      case ir_unop_i2b:
-	 emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
-	 return;
-
-      case ir_binop_greater:
-      case ir_binop_gequal:
-      case ir_binop_less:
-      case ir_binop_lequal:
-      case ir_binop_equal:
-      case ir_binop_all_equal:
-      case ir_binop_nequal:
-      case ir_binop_any_nequal:
-         if (devinfo->gen <= 5) {
-            resolve_bool_comparison(expr->operands[0], &op[0]);
-            resolve_bool_comparison(expr->operands[1], &op[1]);
-         }
-
-	 emit(IF(op[0], op[1],
-                 brw_conditional_for_comparison(expr->operation)));
-	 return;
-
-      case ir_triop_csel: {
-         /* Expand the boolean condition into the flag register. */
-         fs_inst *inst = emit(MOV(reg_null_d, op[0]));
-         inst->conditional_mod = BRW_CONDITIONAL_NZ;
-
-         /* Select which boolean to use as the result. */
-         fs_reg temp = vgrf(expr->operands[1]->type);
-         inst = emit(SEL(temp, op[1], op[2]));
-         inst->predicate = BRW_PREDICATE_NORMAL;
-
-	 emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
-	 return;
-      }
-
-      default:
-	 unreachable("not reached");
-      }
-   }
-
-   ir->condition->accept(this);
-   emit(IF(this->result, fs_reg(0), BRW_CONDITIONAL_NZ));
-}
-
-bool
-fs_visitor::try_opt_frontfacing_ternary(ir_if *ir)
-{
-   ir_dereference_variable *deref = ir->condition->as_dereference_variable();
-   if (!deref || strcmp(deref->var->name, "gl_FrontFacing") != 0)
-      return false;
-
-   if (ir->then_instructions.length() != 1 ||
-       ir->else_instructions.length() != 1)
-      return false;
-
-   ir_assignment *then_assign =
-         ((ir_instruction *)ir->then_instructions.head)->as_assignment();
-   ir_assignment *else_assign =
-         ((ir_instruction *)ir->else_instructions.head)->as_assignment();
-
-   if (!then_assign || then_assign->condition ||
-       !else_assign || else_assign->condition ||
-       then_assign->write_mask != else_assign->write_mask ||
-       !then_assign->lhs->equals(else_assign->lhs))
-      return false;
-
-   ir_constant *then_rhs = then_assign->rhs->as_constant();
-   ir_constant *else_rhs = else_assign->rhs->as_constant();
-
-   if (!then_rhs || !else_rhs)
-      return false;
-
-   if (then_rhs->type->base_type != GLSL_TYPE_FLOAT)
-      return false;
-
-   if ((then_rhs->is_one() && else_rhs->is_negative_one()) ||
-       (else_rhs->is_one() && then_rhs->is_negative_one())) {
-      then_assign->lhs->accept(this);
-      fs_reg dst = this->result;
-      dst.type = BRW_REGISTER_TYPE_D;
-      fs_reg tmp = vgrf(glsl_type::int_type);
-
-      if (devinfo->gen >= 6) {
-         /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
-         fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
-
-         /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
-          *
-          *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
-          *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
-          *
-          * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
-          */
-
-         if (then_rhs->is_negative_one()) {
-            assert(else_rhs->is_one());
-            g0.negate = true;
-         }
-
-         tmp.type = BRW_REGISTER_TYPE_W;
-         tmp.subreg_offset = 2;
-         tmp.stride = 2;
-
-         fs_inst *or_inst = emit(OR(tmp, g0, fs_reg(0x3f80)));
-         or_inst->src[1].type = BRW_REGISTER_TYPE_UW;
-
-         tmp.type = BRW_REGISTER_TYPE_D;
-         tmp.subreg_offset = 0;
-         tmp.stride = 1;
-      } else {
-         /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
-         fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
-
-         /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
-          *
-          *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
-          *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
-          *
-          * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
-          */
-
-         if (then_rhs->is_negative_one()) {
-            assert(else_rhs->is_one());
-            g1_6.negate = true;
-         }
-
-         emit(OR(tmp, g1_6, fs_reg(0x3f800000)));
-      }
-      emit(AND(dst, tmp, fs_reg(0xbf800000)));
-      return true;
-   }
-
-   return false;
-}
-
 /**
  * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
  *
@@ -3056,21 +1091,21 @@ fs_visitor::try_replace_with_sel()
       if (src0.file == IMM) {
          src0 = vgrf(glsl_type::float_type);
          src0.type = then_mov->src[0].type;
-         emit(MOV(src0, then_mov->src[0]));
+         bld.MOV(src0, then_mov->src[0]);
       }
 
-      fs_inst *sel;
       if (if_inst->conditional_mod) {
          /* Sandybridge-specific IF with embedded comparison */
-         emit(CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
-                  if_inst->conditional_mod));
-         sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
-         sel->predicate = BRW_PREDICATE_NORMAL;
+         bld.CMP(bld.null_reg_d(), if_inst->src[0], if_inst->src[1],
+                 if_inst->conditional_mod);
+         set_predicate(BRW_PREDICATE_NORMAL,
+                       bld.emit(BRW_OPCODE_SEL, then_mov->dst,
+                                src0, else_mov->src[0]));
       } else {
          /* Separate CMP and IF instructions */
-         sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
-         sel->predicate = if_inst->predicate;
-         sel->predicate_inverse = if_inst->predicate_inverse;
+         set_predicate_inv(if_inst->predicate, if_inst->predicate_inverse,
+                           bld.emit(BRW_OPCODE_SEL, then_mov->dst,
+                                    src0, else_mov->src[0]));
       }
 
       return true;
@@ -3080,178 +1115,6 @@ fs_visitor::try_replace_with_sel()
 }
 
 void
-fs_visitor::visit(ir_if *ir)
-{
-   if (try_opt_frontfacing_ternary(ir))
-      return;
-
-   /* Don't point the annotation at the if statement, because then it plus
-    * the then and else blocks get printed.
-    */
-   this->base_ir = ir->condition;
-
-   if (devinfo->gen == 6) {
-      emit_if_gen6(ir);
-   } else {
-      emit_bool_to_cond_code(ir->condition);
-
-      emit(IF(BRW_PREDICATE_NORMAL));
-   }
-
-   foreach_in_list(ir_instruction, ir_, &ir->then_instructions) {
-      this->base_ir = ir_;
-      ir_->accept(this);
-   }
-
-   if (!ir->else_instructions.is_empty()) {
-      emit(BRW_OPCODE_ELSE);
-
-      foreach_in_list(ir_instruction, ir_, &ir->else_instructions) {
-	 this->base_ir = ir_;
-	 ir_->accept(this);
-      }
-   }
-
-   emit(BRW_OPCODE_ENDIF);
-
-   if (!try_replace_with_sel() && devinfo->gen < 6) {
-      no16("Can't support (non-uniform) control flow on SIMD16\n");
-   }
-}
-
-void
-fs_visitor::visit(ir_loop *ir)
-{
-   if (devinfo->gen < 6) {
-      no16("Can't support (non-uniform) control flow on SIMD16\n");
-   }
-
-   this->base_ir = NULL;
-   emit(BRW_OPCODE_DO);
-
-   foreach_in_list(ir_instruction, ir_, &ir->body_instructions) {
-      this->base_ir = ir_;
-      ir_->accept(this);
-   }
-
-   this->base_ir = NULL;
-   emit(BRW_OPCODE_WHILE);
-}
-
-void
-fs_visitor::visit(ir_loop_jump *ir)
-{
-   switch (ir->mode) {
-   case ir_loop_jump::jump_break:
-      emit(BRW_OPCODE_BREAK);
-      break;
-   case ir_loop_jump::jump_continue:
-      emit(BRW_OPCODE_CONTINUE);
-      break;
-   }
-}
-
-void
-fs_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
-{
-   ir_dereference *deref = static_cast<ir_dereference *>(
-      ir->actual_parameters.get_head());
-   ir_variable *location = deref->variable_referenced();
-   unsigned surf_index = (stage_prog_data->binding_table.abo_start +
-                          location->data.binding);
-
-   /* Calculate the surface offset */
-   fs_reg offset = vgrf(glsl_type::uint_type);
-   ir_dereference_array *deref_array = deref->as_dereference_array();
-
-   if (deref_array) {
-      deref_array->array_index->accept(this);
-
-      fs_reg tmp = vgrf(glsl_type::uint_type);
-      emit(MUL(tmp, this->result, fs_reg(ATOMIC_COUNTER_SIZE)));
-      emit(ADD(offset, tmp, fs_reg(location->data.atomic.offset)));
-   } else {
-      offset = fs_reg(location->data.atomic.offset);
-   }
-
-   /* Emit the appropriate machine instruction */
-   const char *callee = ir->callee->function_name();
-   ir->return_deref->accept(this);
-   fs_reg dst = this->result;
-
-   if (!strcmp("__intrinsic_atomic_read", callee)) {
-      emit_untyped_surface_read(surf_index, dst, offset);
-
-   } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
-      emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
-                          fs_reg(), fs_reg());
-
-   } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
-      emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
-                          fs_reg(), fs_reg());
-   }
-}
-
-void
-fs_visitor::visit(ir_call *ir)
-{
-   const char *callee = ir->callee->function_name();
-
-   if (!strcmp("__intrinsic_atomic_read", callee) ||
-       !strcmp("__intrinsic_atomic_increment", callee) ||
-       !strcmp("__intrinsic_atomic_predecrement", callee)) {
-      visit_atomic_counter_intrinsic(ir);
-   } else {
-      unreachable("Unsupported intrinsic.");
-   }
-}
-
-void
-fs_visitor::visit(ir_return *)
-{
-   unreachable("FINISHME");
-}
-
-void
-fs_visitor::visit(ir_function *ir)
-{
-   /* Ignore function bodies other than main() -- we shouldn't see calls to
-    * them since they should all be inlined before we get to ir_to_mesa.
-    */
-   if (strcmp(ir->name, "main") == 0) {
-      const ir_function_signature *sig;
-      exec_list empty;
-
-      sig = ir->matching_signature(NULL, &empty, false);
-
-      assert(sig);
-
-      foreach_in_list(ir_instruction, ir_, &sig->body) {
-	 this->base_ir = ir_;
-	 ir_->accept(this);
-      }
-   }
-}
-
-void
-fs_visitor::visit(ir_function_signature *)
-{
-   unreachable("not reached");
-}
-
-void
-fs_visitor::visit(ir_emit_vertex *)
-{
-   unreachable("not reached");
-}
-
-void
-fs_visitor::visit(ir_end_primitive *)
-{
-   unreachable("not reached");
-}
-
-void
 fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
                                 fs_reg dst, fs_reg offset, fs_reg src0,
                                 fs_reg src1)
@@ -3263,17 +1126,16 @@ fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
 
    sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
    /* Initialize the sample mask in the message header. */
-   emit(MOV(sources[0], fs_reg(0u)))
-      ->force_writemask_all = true;
+   bld.exec_all().MOV(sources[0], fs_reg(0u));
 
    if (stage == MESA_SHADER_FRAGMENT) {
       if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
-         emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
-            ->force_writemask_all = true;
+         bld.exec_all()
+            .MOV(component(sources[0], 7), brw_flag_reg(0, 1));
       } else {
-         emit(MOV(component(sources[0], 7),
-                  retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
-            ->force_writemask_all = true;
+         bld.exec_all()
+            .MOV(component(sources[0], 7),
+                 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
       }
    } else {
       /* The execution mask is part of the side-band information sent together with
@@ -3282,37 +1144,37 @@ fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
        * the atomic operation.
        */
       assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
-      emit(MOV(component(sources[0], 7),
-               fs_reg(0xffffu)))->force_writemask_all = true;
+      bld.exec_all()
+         .MOV(component(sources[0], 7), fs_reg(0xffffu));
    }
    length++;
 
    /* Set the atomic operation offset. */
    sources[1] = vgrf(glsl_type::uint_type);
-   emit(MOV(sources[1], offset));
+   bld.MOV(sources[1], offset);
    length++;
 
    /* Set the atomic operation arguments. */
    if (src0.file != BAD_FILE) {
       sources[length] = vgrf(glsl_type::uint_type);
-      emit(MOV(sources[length], src0));
+      bld.MOV(sources[length], src0);
       length++;
    }
 
    if (src1.file != BAD_FILE) {
       sources[length] = vgrf(glsl_type::uint_type);
-      emit(MOV(sources[length], src1));
+      bld.MOV(sources[length], src1);
       length++;
    }
 
    int mlen = 1 + (length - 1) * reg_width;
    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
                                BRW_REGISTER_TYPE_UD, dispatch_width);
-   emit(LOAD_PAYLOAD(src_payload, sources, length, 1));
+   bld.LOAD_PAYLOAD(src_payload, sources, length, 1);
 
    /* Emit the instruction. */
-   fs_inst *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
-                        fs_reg(surf_index), fs_reg(atomic_op));
+   fs_inst *inst = bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
+                            fs_reg(surf_index), fs_reg(atomic_op));
    inst->mlen = mlen;
 }
 
@@ -3326,17 +1188,17 @@ fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
 
    sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
    /* Initialize the sample mask in the message header. */
-   emit(MOV(sources[0], fs_reg(0u)))
-      ->force_writemask_all = true;
+   bld.exec_all()
+      .MOV(sources[0], fs_reg(0u));
 
    if (stage == MESA_SHADER_FRAGMENT) {
       if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
-         emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
-            ->force_writemask_all = true;
+         bld.exec_all()
+            .MOV(component(sources[0], 7), brw_flag_reg(0, 1));
       } else {
-         emit(MOV(component(sources[0], 7),
-                  retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
-            ->force_writemask_all = true;
+         bld.exec_all()
+            .MOV(component(sources[0], 7),
+                 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
       }
    } else {
       /* The execution mask is part of the side-band information sent together with
@@ -3345,48 +1207,25 @@ fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
        * the atomic operation.
        */
       assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
-      emit(MOV(component(sources[0], 7),
-               fs_reg(0xffffu)))->force_writemask_all = true;
+      bld.exec_all()
+         .MOV(component(sources[0], 7), fs_reg(0xffffu));
    }
 
    /* Set the surface read offset. */
    sources[1] = vgrf(glsl_type::uint_type);
-   emit(MOV(sources[1], offset));
+   bld.MOV(sources[1], offset);
 
    int mlen = 1 + reg_width;
    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
                                BRW_REGISTER_TYPE_UD, dispatch_width);
-   fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2, 1));
+   fs_inst *inst = bld.LOAD_PAYLOAD(src_payload, sources, 2, 1);
 
    /* Emit the instruction. */
-   inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
-               fs_reg(surf_index), fs_reg(1));
+   inst = bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
+                   fs_reg(surf_index), fs_reg(1));
    inst->mlen = mlen;
 }
 
-fs_inst *
-fs_visitor::emit(fs_inst *inst)
-{
-   if (dispatch_width == 16 && inst->exec_size == 8)
-      inst->force_uncompressed = true;
-
-   inst->annotation = this->current_annotation;
-   inst->ir = this->base_ir;
-
-   this->instructions.push_tail(inst);
-
-   return inst;
-}
-
-void
-fs_visitor::emit(exec_list list)
-{
-   foreach_in_list_safe(fs_inst, inst, &list) {
-      inst->exec_node::remove();
-      emit(inst);
-   }
-}
-
 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
 void
 fs_visitor::emit_dummy_fs()
@@ -3396,12 +1235,12 @@ fs_visitor::emit_dummy_fs()
    /* Everyone's favorite color. */
    const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
    for (int i = 0; i < 4; i++) {
-      emit(MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F,
-                      dispatch_width), fs_reg(color[i])));
+      bld.MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F,
+                     dispatch_width), fs_reg(color[i]));
    }
 
    fs_inst *write;
-   write = emit(FS_OPCODE_FB_WRITE);
+   write = bld.emit(FS_OPCODE_FB_WRITE);
    write->eot = true;
    if (devinfo->gen >= 6) {
       write->base_mrf = 2;
@@ -3454,19 +1293,19 @@ fs_visitor::emit_interpolation_setup_gen4()
 {
    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
 
-   this->current_annotation = "compute pixel centers";
+   fs_builder abld = bld.annotate("compute pixel centers");
    this->pixel_x = vgrf(glsl_type::uint_type);
    this->pixel_y = vgrf(glsl_type::uint_type);
    this->pixel_x.type = BRW_REGISTER_TYPE_UW;
    this->pixel_y.type = BRW_REGISTER_TYPE_UW;
-   emit(ADD(this->pixel_x,
+   abld.ADD(this->pixel_x,
             fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
-            fs_reg(brw_imm_v(0x10101010))));
-   emit(ADD(this->pixel_y,
+            fs_reg(brw_imm_v(0x10101010)));
+   abld.ADD(this->pixel_y,
             fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
-            fs_reg(brw_imm_v(0x11001100))));
+            fs_reg(brw_imm_v(0x11001100)));
 
-   this->current_annotation = "compute pixel deltas from v0";
+   abld = bld.annotate("compute pixel deltas from v0");
 
    this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
       vgrf(glsl_type::vec2_type);
@@ -3475,27 +1314,27 @@ fs_visitor::emit_interpolation_setup_gen4()
    const fs_reg ystart(negate(brw_vec1_grf(1, 1)));
 
    if (devinfo->has_pln && dispatch_width == 16) {
-      emit(ADD(half(offset(delta_xy, 0), 0), half(this->pixel_x, 0), xstart));
-      emit(ADD(half(offset(delta_xy, 0), 1), half(this->pixel_y, 0), ystart));
-      emit(ADD(half(offset(delta_xy, 1), 0), half(this->pixel_x, 1), xstart))
-         ->force_sechalf = true;
-      emit(ADD(half(offset(delta_xy, 1), 1), half(this->pixel_y, 1), ystart))
-         ->force_sechalf = true;
+      for (unsigned i = 0; i < 2; i++) {
+         abld.half(i).ADD(half(offset(delta_xy, i), 0),
+                          half(this->pixel_x, i), xstart);
+         abld.half(i).ADD(half(offset(delta_xy, i), 1),
+                          half(this->pixel_y, i), ystart);
+      }
    } else {
-      emit(ADD(offset(delta_xy, 0), this->pixel_x, xstart));
-      emit(ADD(offset(delta_xy, 1), this->pixel_y, ystart));
+      abld.ADD(offset(delta_xy, 0), this->pixel_x, xstart);
+      abld.ADD(offset(delta_xy, 1), this->pixel_y, ystart);
    }
 
-   this->current_annotation = "compute pos.w and 1/pos.w";
+   abld = bld.annotate("compute pos.w and 1/pos.w");
    /* Compute wpos.w.  It's always in our setup, since it's needed to
     * interpolate the other attributes.
     */
    this->wpos_w = vgrf(glsl_type::float_type);
-   emit(FS_OPCODE_LINTERP, wpos_w, delta_xy, interp_reg(VARYING_SLOT_POS, 3));
+   abld.emit(FS_OPCODE_LINTERP, wpos_w, delta_xy,
+             interp_reg(VARYING_SLOT_POS, 3));
    /* Compute the pixel 1/W value from wpos.w. */
    this->pixel_w = vgrf(glsl_type::float_type);
-   emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
-   this->current_annotation = NULL;
+   abld.emit(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
 }
 
 /** Emits the interpolation for the varying inputs. */
@@ -3504,8 +1343,8 @@ fs_visitor::emit_interpolation_setup_gen6()
 {
    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
 
-   this->current_annotation = "compute pixel centers";
-   if (brw->gen >= 8 || dispatch_width == 8) {
+   fs_builder abld = bld.annotate("compute pixel centers");
+   if (devinfo->gen >= 8 || dispatch_width == 8) {
       /* The "Register Region Restrictions" page says for BDW (and newer,
        * presumably):
        *
@@ -3518,15 +1357,15 @@ fs_visitor::emit_interpolation_setup_gen6()
        */
       fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
                           BRW_REGISTER_TYPE_UW, dispatch_width * 2);
-      emit(ADD(int_pixel_xy,
+      abld.exec_all()
+          .ADD(int_pixel_xy,
                fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
-               fs_reg(brw_imm_v(0x11001010))))
-         ->force_writemask_all = true;
+               fs_reg(brw_imm_v(0x11001010)));
 
       this->pixel_x = vgrf(glsl_type::float_type);
       this->pixel_y = vgrf(glsl_type::float_type);
-      emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy);
-      emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy);
+      abld.emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy);
+      abld.emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy);
    } else {
       /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
        *
@@ -3540,12 +1379,12 @@ fs_visitor::emit_interpolation_setup_gen6()
       fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
       int_pixel_x.type = BRW_REGISTER_TYPE_UW;
       int_pixel_y.type = BRW_REGISTER_TYPE_UW;
-      emit(ADD(int_pixel_x,
+      abld.ADD(int_pixel_x,
                fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
-               fs_reg(brw_imm_v(0x10101010))));
-      emit(ADD(int_pixel_y,
+               fs_reg(brw_imm_v(0x10101010)));
+      abld.ADD(int_pixel_y,
                fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
-               fs_reg(brw_imm_v(0x11001100))));
+               fs_reg(brw_imm_v(0x11001100)));
 
       /* As of gen6, we can no longer mix float and int sources.  We have
        * to turn the integer pixel centers into floats for their actual
@@ -3553,21 +1392,19 @@ fs_visitor::emit_interpolation_setup_gen6()
        */
       this->pixel_x = vgrf(glsl_type::float_type);
       this->pixel_y = vgrf(glsl_type::float_type);
-      emit(MOV(this->pixel_x, int_pixel_x));
-      emit(MOV(this->pixel_y, int_pixel_y));
+      abld.MOV(this->pixel_x, int_pixel_x);
+      abld.MOV(this->pixel_y, int_pixel_y);
    }
 
-   this->current_annotation = "compute pos.w";
+   abld = bld.annotate("compute pos.w");
    this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
    this->wpos_w = vgrf(glsl_type::float_type);
-   emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
+   abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
 
    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
       uint8_t reg = payload.barycentric_coord_reg[i];
       this->delta_xy[i] = fs_reg(brw_vec16_grf(reg, 0));
    }
-
-   this->current_annotation = NULL;
 }
 
 void
@@ -3581,7 +1418,7 @@ fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
       fs_reg tmp = vgrf(glsl_type::vec4_type);
       assert(color.type == BRW_REGISTER_TYPE_F);
       for (unsigned i = 0; i < components; i++) {
-         inst = emit(MOV(offset(tmp, i), offset(color, i)));
+         inst = bld.MOV(offset(tmp, i), offset(color, i));
          inst->saturate = true;
       }
       color = tmp;
@@ -3627,7 +1464,7 @@ fs_visitor::emit_alpha_test()
 {
    assert(stage == MESA_SHADER_FRAGMENT);
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-   this->current_annotation = "Alpha test";
+   const fs_builder abld = bld.annotate("Alpha test");
 
    fs_inst *cmp;
    if (key->alpha_test_func == GL_ALWAYS)
@@ -3637,30 +1474,29 @@ fs_visitor::emit_alpha_test()
       /* f0.1 = 0 */
       fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
                                       BRW_REGISTER_TYPE_UW));
-      cmp = emit(CMP(reg_null_f, some_reg, some_reg,
-                     BRW_CONDITIONAL_NEQ));
+      cmp = abld.CMP(bld.null_reg_f(), some_reg, some_reg,
+                     BRW_CONDITIONAL_NEQ);
    } else {
       /* RT0 alpha */
       fs_reg color = offset(outputs[0], 3);
 
       /* f0.1 &= func(color, ref) */
-      cmp = emit(CMP(reg_null_f, color, fs_reg(key->alpha_test_ref),
-                     cond_for_alpha_func(key->alpha_test_func)));
+      cmp = abld.CMP(bld.null_reg_f(), color, fs_reg(key->alpha_test_ref),
+                     cond_for_alpha_func(key->alpha_test_func));
    }
    cmp->predicate = BRW_PREDICATE_NORMAL;
    cmp->flag_subreg = 1;
 }
 
 fs_inst *
-fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
+fs_visitor::emit_single_fb_write(const fs_builder &bld,
+                                 fs_reg color0, fs_reg color1,
                                  fs_reg src0_alpha, unsigned components,
                                  unsigned exec_size, bool use_2nd_half)
 {
    assert(stage == MESA_SHADER_FRAGMENT);
    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-
-   this->current_annotation = "FB write header";
    int header_size = 2, payload_header_size;
 
    /* We can potentially have a message length of up to 15, so we have to set
@@ -3691,22 +1527,23 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
 
    if (payload.aa_dest_stencil_reg) {
       sources[length] = fs_reg(GRF, alloc.allocate(1));
-      emit(MOV(sources[length],
-               fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))));
+      bld.exec_all().annotate("FB write stencil/AA alpha")
+         .MOV(sources[length],
+              fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
       length++;
    }
 
    prog_data->uses_omask =
       prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
    if (prog_data->uses_omask) {
-      this->current_annotation = "FB write oMask";
       assert(this->sample_mask.file != BAD_FILE);
       /* Hand over gl_SampleMask. Only lower 16 bits are relevant.  Since
        * it's unsinged single words, one vgrf is always 16-wide.
        */
       sources[length] = fs_reg(GRF, alloc.allocate(1),
                                BRW_REGISTER_TYPE_UW, 16);
-      emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
+      bld.exec_all().annotate("FB write oMask")
+         .emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
       length++;
    }
 
@@ -3752,7 +1589,11 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
       if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
 	 /* Hand over gl_FragDepth. */
 	 assert(this->frag_depth.file != BAD_FILE);
-         sources[length] = this->frag_depth;
+         if (exec_size < dispatch_width) {
+            sources[length] = half(this->frag_depth, use_2nd_half);
+         } else {
+            sources[length] = this->frag_depth;
+         }
       } else {
 	 /* Pass through the payload depth. */
          sources[length] = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0));
@@ -3763,28 +1604,29 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
    if (payload.dest_depth_reg)
       sources[length++] = fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0));
 
+   const fs_builder ubld = bld.group(exec_size, use_2nd_half);
    fs_inst *load;
    fs_inst *write;
    if (devinfo->gen >= 7) {
       /* Send from the GRF */
       fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F, exec_size);
-      load = emit(LOAD_PAYLOAD(payload, sources, length, payload_header_size));
+      load = ubld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
       payload.reg = alloc.allocate(load->regs_written);
       load->dst = payload;
-      write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
+      write = ubld.emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
       write->base_mrf = -1;
    } else {
       /* Send from the MRF */
-      load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F, exec_size),
-                               sources, length, payload_header_size));
+      load = ubld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F, exec_size),
+                               sources, length, payload_header_size);
 
       /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
        * will do this for us if we just give it a COMPR4 destination.
        */
-      if (brw->gen < 6 && exec_size == 16)
+      if (devinfo->gen < 6 && exec_size == 16)
          load->dst.reg |= BRW_MRF_COMPR4;
 
-      write = emit(FS_OPCODE_FB_WRITE);
+      write = ubld.emit(FS_OPCODE_FB_WRITE);
       write->exec_size = exec_size;
       write->base_mrf = 1;
    }
@@ -3807,10 +1649,10 @@ fs_visitor::emit_fb_writes()
 
    fs_inst *inst = NULL;
    if (do_dual_src) {
-      this->current_annotation = ralloc_asprintf(this->mem_ctx,
-						 "FB dual-source write");
-      inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
-                                  reg_undef, 4, 8);
+      const fs_builder abld = bld.annotate("FB dual-source write");
+
+      inst = emit_single_fb_write(abld, this->outputs[0],
+                                  this->dual_src_output, reg_undef, 4, 8);
       inst->target = 0;
 
       /* SIMD16 dual source blending requires to send two SIMD8 dual source
@@ -3831,8 +1673,9 @@ fs_visitor::emit_fb_writes()
        * m + 3: a1
        */
       if (dispatch_width == 16) {
-         inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
-                                     reg_undef, 4, 8, true);
+         inst = emit_single_fb_write(abld, this->outputs[0],
+                                     this->dual_src_output, reg_undef, 4, 8,
+                                     true);
          inst->target = 0;
       }
 
@@ -3843,14 +1686,14 @@ fs_visitor::emit_fb_writes()
          if (this->outputs[target].file == BAD_FILE)
             continue;
 
-         this->current_annotation = ralloc_asprintf(this->mem_ctx,
-                                                    "FB write target %d",
-                                                    target);
+         const fs_builder abld = bld.annotate(
+            ralloc_asprintf(this->mem_ctx, "FB write target %d", target));
+
          fs_reg src0_alpha;
          if (devinfo->gen >= 6 && key->replicate_alpha && target != 0)
             src0_alpha = offset(outputs[0], 3);
 
-         inst = emit_single_fb_write(this->outputs[target], reg_undef,
+         inst = emit_single_fb_write(abld, this->outputs[target], reg_undef,
                                      src0_alpha,
                                      this->output_components[target],
                                      dispatch_width);
@@ -3863,19 +1706,17 @@ fs_visitor::emit_fb_writes()
        * alpha out the pipeline to our null renderbuffer to support
        * alpha-testing, alpha-to-coverage, and so on.
        */
-      inst = emit_single_fb_write(reg_undef, reg_undef, reg_undef, 0,
+      inst = emit_single_fb_write(bld, reg_undef, reg_undef, reg_undef, 0,
                                   dispatch_width);
       inst->target = 0;
    }
 
    inst->eot = true;
-   this->current_annotation = NULL;
 }
 
 void
-fs_visitor::setup_uniform_clipplane_values()
+fs_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
 {
-   gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
    const struct brw_vue_prog_key *key =
       (const struct brw_vue_prog_key *) this->key;
 
@@ -3889,7 +1730,7 @@ fs_visitor::setup_uniform_clipplane_values()
    }
 }
 
-void fs_visitor::compute_clip_distance()
+void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes)
 {
    struct brw_vue_prog_data *vue_prog_data =
       (struct brw_vue_prog_data *) prog_data;
@@ -3918,9 +1759,9 @@ void fs_visitor::compute_clip_distance()
    if (outputs[clip_vertex].file == BAD_FILE)
       return;
 
-   setup_uniform_clipplane_values();
+   setup_uniform_clipplane_values(clip_planes);
 
-   current_annotation = "user clip distances";
+   const fs_builder abld = bld.annotate("user clip distances");
 
    this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type);
    this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type);
@@ -3930,16 +1771,16 @@ void fs_visitor::compute_clip_distance()
       fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
       output.reg_offset = i & 3;
 
-      emit(MUL(output, outputs[clip_vertex], u));
+      abld.MUL(output, outputs[clip_vertex], u);
       for (int j = 1; j < 4; j++) {
          u.reg = userplane[i].reg + j;
-         emit(MAD(output, output, offset(outputs[clip_vertex], j), u));
+         abld.MAD(output, output, offset(outputs[clip_vertex], j), u);
       }
    }
 }
 
 void
-fs_visitor::emit_urb_writes()
+fs_visitor::emit_urb_writes(gl_clip_plane *clip_planes)
 {
    int slot, urb_offset, length;
    struct brw_vs_prog_data *vs_prog_data =
@@ -3954,18 +1795,17 @@ fs_visitor::emit_urb_writes()
 
    /* Lower legacy ff and ClipVertex clipping to clip distances */
    if (key->base.userclip_active && !prog->UsesClipDistanceOut)
-      compute_clip_distance();
+      compute_clip_distance(clip_planes);
 
    /* If we don't have any valid slots to write, just do a minimal urb write
     * send to terminate the shader. */
    if (vue_map->slots_valid == 0) {
 
       fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
-      fs_inst *inst = emit(MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
-                                                      BRW_REGISTER_TYPE_UD))));
-      inst->force_writemask_all = true;
+      bld.exec_all().MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
+                                                BRW_REGISTER_TYPE_UD)));
 
-      inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
+      fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
       inst->eot = true;
       inst->mlen = 1;
       inst->offset = 1;
@@ -3994,7 +1834,7 @@ fs_visitor::emit_urb_writes()
          }
 
          zero = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
-         emit(MOV(zero, fs_reg(0u)));
+         bld.MOV(zero, fs_reg(0u));
 
          sources[length++] = zero;
          if (vue_map->slots_valid & VARYING_BIT_LAYER)
@@ -4049,8 +1889,7 @@ fs_visitor::emit_urb_writes()
             for (int i = 0; i < 4; i++) {
                reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
                src = offset(this->outputs[varying], i);
-               fs_inst *inst = emit(MOV(reg, src));
-               inst->saturate = true;
+               set_saturate(true, bld.MOV(reg, src));
                sources[length++] = reg;
             }
          } else {
@@ -4060,7 +1899,7 @@ fs_visitor::emit_urb_writes()
          break;
       }
 
-      current_annotation = "URB write";
+      const fs_builder abld = bld.annotate("URB write");
 
       /* If we've queued up 8 registers of payload (2 VUE slots), if this is
        * the last slot or if we need to flush (see BAD_FILE varying case
@@ -4073,22 +1912,14 @@ fs_visitor::emit_urb_writes()
          fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
          fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
                                  BRW_REGISTER_TYPE_F, dispatch_width);
-
-         /* We need WE_all on the MOV for the message header (the URB handles)
-          * so do a MOV to a dummy register and set force_writemask_all on the
-          * MOV.  LOAD_PAYLOAD will preserve that.
-          */
-         fs_reg dummy = fs_reg(GRF, alloc.allocate(1),
-                               BRW_REGISTER_TYPE_UD);
-         fs_inst *inst = emit(MOV(dummy, fs_reg(retype(brw_vec8_grf(1, 0),
-                                                       BRW_REGISTER_TYPE_UD))));
-         inst->force_writemask_all = true;
-         payload_sources[0] = dummy;
+         payload_sources[0] =
+            fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 
          memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
-         emit(LOAD_PAYLOAD(payload, payload_sources, length + 1, 1));
+         abld.LOAD_PAYLOAD(payload, payload_sources, length + 1, 1);
 
-         inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
+         fs_inst *inst =
+            abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
          inst->eot = last;
          inst->mlen = length + 1;
          inst->offset = urb_offset;
@@ -4100,21 +1931,9 @@ fs_visitor::emit_urb_writes()
 }
 
 void
-fs_visitor::resolve_ud_negate(fs_reg *reg)
-{
-   if (reg->type != BRW_REGISTER_TYPE_UD ||
-       !reg->negate)
-      return;
-
-   fs_reg temp = vgrf(glsl_type::uint_type);
-   emit(MOV(temp, *reg));
-   *reg = temp;
-}
-
-void
 fs_visitor::emit_cs_terminate()
 {
-   assert(brw->gen >= 7);
+   assert(devinfo->gen >= 7);
 
    /* We are getting the thread ID from the compute shader header */
    assert(stage == MESA_SHADER_COMPUTE);
@@ -4125,94 +1944,53 @@ fs_visitor::emit_cs_terminate()
     */
    struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
    fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
-   fs_inst *inst = emit(MOV(payload, g0));
-   inst->force_writemask_all = true;
+   bld.exec_all().MOV(payload, g0);
 
    /* Send a message to the thread spawner to terminate the thread. */
-   inst = emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload);
+   fs_inst *inst = bld.exec_all()
+                      .emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload);
    inst->eot = true;
 }
 
-/**
- * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
- *
- * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
- * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
- */
 void
-fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
+fs_visitor::emit_barrier()
 {
-   assert(devinfo->gen <= 5);
+   assert(devinfo->gen >= 7);
 
-   if (rvalue->type != glsl_type::bool_type)
-      return;
+   /* We are getting the barrier ID from the compute shader header */
+   assert(stage == MESA_SHADER_COMPUTE);
 
-   fs_reg and_result = vgrf(glsl_type::bool_type);
-   fs_reg neg_result = vgrf(glsl_type::bool_type);
-   emit(AND(and_result, *reg, fs_reg(1)));
-   emit(MOV(neg_result, negate(and_result)));
-   *reg = neg_result;
-}
+   fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
 
-fs_visitor::fs_visitor(struct brw_context *brw,
-                       void *mem_ctx,
-                       const struct brw_wm_prog_key *key,
-                       struct brw_wm_prog_data *prog_data,
-                       struct gl_shader_program *shader_prog,
-                       struct gl_fragment_program *fp,
-                       unsigned dispatch_width)
-   : backend_visitor(brw, shader_prog, &fp->Base, &prog_data->base,
-                     MESA_SHADER_FRAGMENT),
-     reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
-     reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
-     reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
-     key(key), prog_data(&prog_data->base),
-     dispatch_width(dispatch_width), promoted_constants(0)
-{
-   this->mem_ctx = mem_ctx;
-   init();
-}
+   /* Clear the message payload */
+   bld.exec_all().MOV(payload, fs_reg(0u));
 
-fs_visitor::fs_visitor(struct brw_context *brw,
-                       void *mem_ctx,
-                       const struct brw_vs_prog_key *key,
-                       struct brw_vs_prog_data *prog_data,
-                       struct gl_shader_program *shader_prog,
-                       struct gl_vertex_program *cp,
-                       unsigned dispatch_width)
-   : backend_visitor(brw, shader_prog, &cp->Base, &prog_data->base.base,
-                     MESA_SHADER_VERTEX),
-     reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
-     reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
-     reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
-     key(key), prog_data(&prog_data->base.base),
-     dispatch_width(dispatch_width), promoted_constants(0)
-{
-   this->mem_ctx = mem_ctx;
-   init();
+   /* Copy bits 27:24 of r0.2 (barrier id) to the message payload reg.2 */
+   fs_reg r0_2 = fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD));
+   bld.exec_all().AND(component(payload, 2), r0_2, fs_reg(0x0f000000u));
+
+   /* Emit a gateway "barrier" message using the payload we set up, followed
+    * by a wait instruction.
+    */
+   bld.exec_all().emit(SHADER_OPCODE_BARRIER, reg_undef, payload);
 }
 
-fs_visitor::fs_visitor(struct brw_context *brw,
+fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
                        void *mem_ctx,
-                       const struct brw_cs_prog_key *key,
-                       struct brw_cs_prog_data *prog_data,
+                       gl_shader_stage stage,
+                       const void *key,
+                       struct brw_stage_prog_data *prog_data,
                        struct gl_shader_program *shader_prog,
-                       struct gl_compute_program *cp,
-                       unsigned dispatch_width)
-   : backend_visitor(brw, shader_prog, &cp->Base, &prog_data->base,
-                     MESA_SHADER_COMPUTE),
-     reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
-     reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
-     reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
-     key(key), prog_data(&prog_data->base),
-     dispatch_width(dispatch_width)
-{
-   this->mem_ctx = mem_ctx;
-   init();
-}
-
-void
-fs_visitor::init()
+                       struct gl_program *prog,
+                       unsigned dispatch_width,
+                       int shader_time_index)
+   : backend_shader(compiler, log_data, mem_ctx,
+                    shader_prog, prog, prog_data, stage),
+     key(key), prog_data(prog_data),
+     dispatch_width(dispatch_width),
+     shader_time_index(shader_time_index),
+     promoted_constants(0),
+     bld(fs_builder(this, dispatch_width).at_end())
 {
    switch (stage) {
    case MESA_SHADER_FRAGMENT:
@@ -4232,9 +2010,6 @@ fs_visitor::init()
    this->failed = false;
    this->simd16_unsupported = false;
    this->no16_msg = NULL;
-   this->variable_ht = hash_table_ctor(0,
-                                       hash_table_pointer_hash,
-                                       hash_table_pointer_compare);
 
    this->nir_locals = NULL;
    this->nir_globals = NULL;
@@ -4247,9 +2022,6 @@ fs_visitor::init()
    this->first_non_payload_grf = 0;
    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
 
-   this->current_annotation = NULL;
-   this->base_ir = NULL;
-
    this->virtual_grf_start = NULL;
    this->virtual_grf_end = NULL;
    this->live_intervals = NULL;
@@ -4269,5 +2041,4 @@ fs_visitor::init()
 
 fs_visitor::~fs_visitor()
 {
-   hash_table_dtor(this->variable_ht);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
index a323e4d9031..0b8bfc3d9bd 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
@@ -47,11 +47,12 @@ brw_upload_gs_pull_constants(struct brw_context *brw)
       return;
 
    /* BRW_NEW_GS_PROG_DATA */
-   const struct brw_stage_prog_data *prog_data = &brw->gs.prog_data->base.base;
+   const struct brw_vue_prog_data *prog_data = &brw->gs.prog_data->base;
+   const bool dword_pitch = prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
 
    /* _NEW_PROGRAM_CONSTANTS */
    brw_upload_pull_constants(brw, BRW_NEW_GS_CONSTBUF, &gp->program.Base,
-                             stage_state, prog_data, false);
+                             stage_state, &prog_data->base, dword_pitch);
 }
 
 const struct brw_tracked_state brw_gs_pull_constants = {
@@ -77,8 +78,11 @@ brw_upload_gs_ubo_surfaces(struct brw_context *brw)
       return;
 
    /* BRW_NEW_GS_PROG_DATA */
+   struct brw_vue_prog_data *prog_data = &brw->gs.prog_data->base;
+   bool dword_pitch = prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
+
    brw_upload_ubo_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_GEOMETRY],
-			   &brw->gs.base, &brw->gs.prog_data->base.base, false);
+			   &brw->gs.base, &prog_data->base, dword_pitch);
 }
 
 const struct brw_tracked_state brw_gs_ubo_surfaces = {
diff --git a/src/mesa/drivers/dri/i965/brw_inst.h b/src/mesa/drivers/dri/i965/brw_inst.h
index e347c518348..7a8c210118c 100644
--- a/src/mesa/drivers/dri/i965/brw_inst.h
+++ b/src/mesa/drivers/dri/i965/brw_inst.h
@@ -322,6 +322,9 @@ FJ(gen4_jump_count, 111,  96, devinfo->gen < 6)
 FC(gen4_pop_count,  115, 112, devinfo->gen < 6)
 /** @} */
 
+/* Message descriptor bits */
+#define MD(x) ((x) + 96)
+
 /**
  * Fields for SEND messages:
  *  @{
@@ -347,6 +350,7 @@ FF(header_present,
    /* 6:   */ 115, 115,
    /* 7:   */ 115, 115,
    /* 8:   */ 115, 115)
+F(gateway_notify, MD(16), MD(15))
 FF(function_control,
    /* 4:   */ 111,  96,
    /* 4.5: */ 111,  96,
@@ -354,6 +358,13 @@ FF(function_control,
    /* 6:   */ 114,  96,
    /* 7:   */ 114,  96,
    /* 8:   */ 114,  96)
+FF(gateway_subfuncid,
+   /* 4:   */ MD(1), MD(0),
+   /* 4.5: */ MD(1), MD(0),
+   /* 5:   */ MD(1), MD(0), /* 2:0, but bit 2 is reserved MBZ */
+   /* 6:   */ MD(2), MD(0),
+   /* 7:   */ MD(2), MD(0),
+   /* 8:   */ MD(2), MD(0))
 FF(sfid,
    /* 4:   */ 123, 120, /* called msg_target */
    /* 4.5  */ 123, 120,
@@ -364,9 +375,6 @@ FF(sfid,
 FC(base_mrf,   27,  24, devinfo->gen < 6);
 /** @} */
 
-/* Message descriptor bits */
-#define MD(x) (x + 96)
-
 /**
  * URB message function control bits:
  *  @{
diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h
index f3dfe790f34..96dc20da3cf 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h
@@ -131,14 +131,15 @@ horiz_offset(fs_reg reg, unsigned delta)
 static inline fs_reg
 offset(fs_reg reg, unsigned delta)
 {
-   assert(reg.stride > 0);
    switch (reg.file) {
    case BAD_FILE:
       break;
    case GRF:
    case MRF:
    case ATTR:
-      return byte_offset(reg, delta * reg.width * reg.stride * type_sz(reg.type));
+      return byte_offset(reg,
+                         delta * MAX2(reg.width * reg.stride, 1) *
+                         type_sz(reg.type));
    case UNIFORM:
       reg.reg_offset += delta;
       break;
@@ -155,6 +156,7 @@ component(fs_reg reg, unsigned idx)
    assert(idx < reg.width);
    reg.subreg_offset = idx * type_sz(reg.type);
    reg.width = 1;
+   reg.stride = 0;
    return reg;
 }
 
@@ -254,9 +256,62 @@ public:
    uint8_t exec_size;
 
    bool eot:1;
-   bool force_uncompressed:1;
    bool force_sechalf:1;
    bool pi_noperspective:1;   /**< Pixel interpolator noperspective flag */
 };
 
+/**
+ * Set second-half quarter control on \p inst.
+ */
+static inline fs_inst *
+set_sechalf(fs_inst *inst)
+{
+   inst->force_sechalf = true;
+   return inst;
+}
+
+/**
+ * Make the execution of \p inst dependent on the evaluation of a possibly
+ * inverted predicate.
+ */
+static inline fs_inst *
+set_predicate_inv(enum brw_predicate pred, bool inverse,
+                  fs_inst *inst)
+{
+   inst->predicate = pred;
+   inst->predicate_inverse = inverse;
+   return inst;
+}
+
+/**
+ * Make the execution of \p inst dependent on the evaluation of a predicate.
+ */
+static inline fs_inst *
+set_predicate(enum brw_predicate pred, fs_inst *inst)
+{
+   return set_predicate_inv(pred, false, inst);
+}
+
+/**
+ * Write the result of evaluating the condition given by \p mod to a flag
+ * register.
+ */
+static inline fs_inst *
+set_condmod(enum brw_conditional_mod mod, fs_inst *inst)
+{
+   inst->conditional_mod = mod;
+   return inst;
+}
+
+/**
+ * Clamp the result of \p inst to the saturation range of its destination
+ * datatype.
+ */
+static inline fs_inst *
+set_saturate(bool saturate, fs_inst *inst)
+{
+   inst->saturate = saturate;
+   return inst;
+}
+
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
index a56fdd6fce9..fceacae0e51 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
@@ -190,6 +190,50 @@ public:
    }
 };
 
+/**
+ * Make the execution of \p inst dependent on the evaluation of a possibly
+ * inverted predicate.
+ */
+inline vec4_instruction *
+set_predicate_inv(enum brw_predicate pred, bool inverse,
+                  vec4_instruction *inst)
+{
+   inst->predicate = pred;
+   inst->predicate_inverse = inverse;
+   return inst;
+}
+
+/**
+ * Make the execution of \p inst dependent on the evaluation of a predicate.
+ */
+inline vec4_instruction *
+set_predicate(enum brw_predicate pred, vec4_instruction *inst)
+{
+   return set_predicate_inv(pred, false, inst);
+}
+
+/**
+ * Write the result of evaluating the condition given by \p mod to a flag
+ * register.
+ */
+inline vec4_instruction *
+set_condmod(enum brw_conditional_mod mod, vec4_instruction *inst)
+{
+   inst->conditional_mod = mod;
+   return inst;
+}
+
+/**
+ * Clamp the result of \p inst to the saturation range of its destination
+ * datatype.
+ */
+inline vec4_instruction *
+set_saturate(bool saturate, vec4_instruction *inst)
+{
+   inst->saturate = saturate;
+   return inst;
+}
+
 } /* namespace brw */
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
index 0424003ffd5..7a5f9834423 100644
--- a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
+++ b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
@@ -89,19 +89,18 @@ txs_type(const glsl_type *type)
 ir_visitor_status
 lower_texture_grad_visitor::visit_leave(ir_texture *ir)
 {
-   /* Only lower textureGrad with shadow samplers */
-   if (ir->op != ir_txd || !ir->shadow_comparitor)
+   /* Only lower textureGrad with cube maps or shadow samplers */
+   if (ir->op != ir_txd ||
+      (ir->sampler->type->sampler_dimensionality != GLSL_SAMPLER_DIM_CUBE &&
+       !ir->shadow_comparitor))
       return visit_continue;
 
-   /* Lower textureGrad() with samplerCubeShadow even if we have the sample_d_c
+   /* Lower textureGrad() with samplerCube* even if we have the sample_d_c
     * message.  GLSL provides gradients for the 'r' coordinate.  Unfortunately:
     *
     * From the Ivybridge PRM, Volume 4, Part 1, sample_d message description:
     * "The r coordinate contains the faceid, and the r gradients are ignored
     *  by hardware."
-    *
-    * We likely need to do a similar treatment for samplerCube and
-    * samplerCubeArray, but we have insufficient testing for that at the moment.
     */
    bool need_lowering = !has_sample_d_c ||
       ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE;
@@ -155,9 +154,20 @@ lower_texture_grad_visitor::visit_leave(ir_texture *ir)
 			       expr(ir_unop_sqrt, dot(dPdy, dPdy)));
    }
 
-   /* lambda_base = log2(rho).  We're ignoring GL state biases for now. */
+   /* lambda_base = log2(rho).  We're ignoring GL state biases for now.
+    *
+    * For cube maps the result of these formulas is giving us a value of rho
+    * that is twice the value we should use, so divide it by 2 or,
+    * alternatively, remove one unit from the result of the log2 computation.
+    */
    ir->op = ir_txl;
-   ir->lod_info.lod = expr(ir_unop_log2, rho);
+   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE) {
+      ir->lod_info.lod = expr(ir_binop_add,
+                              expr(ir_unop_log2, rho),
+                              new(mem_ctx) ir_constant(-1.0f));
+   } else {
+      ir->lod_info.lod = expr(ir_unop_log2, rho);
+   }
 
    progress = true;
    return visit_continue;
diff --git a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
index 06916e28cbd..49f2e3e498c 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
@@ -339,8 +339,13 @@ is_color_fast_clear_compatible(struct brw_context *brw,
                                mesa_format format,
                                const union gl_color_union *color)
 {
-   if (_mesa_is_format_integer_color(format))
+   if (_mesa_is_format_integer_color(format)) {
+      if (brw->gen >= 8) {
+         perf_debug("Integer fast clear not enabled for (%s)",
+                    _mesa_get_format_name(format));
+      }
       return false;
+   }
 
    for (int i = 0; i < 4; i++) {
       if (color->f[i] != 0.0 && color->f[i] != 1.0 &&
@@ -466,7 +471,8 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
        *      linear (untiled) memory is UNDEFINED."
        */
       if (irb->mt->tiling == I915_TILING_NONE) {
-         perf_debug("falling back to plain clear because buffers are untiled\n");
+         perf_debug("Falling back to plain clear because %dx%d buffer is untiled\n",
+                    irb->mt->logical_width0, irb->mt->logical_height0);
          clear_type = PLAIN_CLEAR;
       }
 
@@ -477,7 +483,8 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
       for (int i = 0; i < 4; i++) {
          if (_mesa_format_has_color_component(irb->mt->format, i) &&
              !color_mask[i]) {
-            perf_debug("falling back to plain clear because of color mask\n");
+            perf_debug("Falling back to plain clear on %dx%d buffer because of color mask\n",
+                       irb->mt->logical_width0, irb->mt->logical_height0);
             clear_type = PLAIN_CLEAR;
          }
       }
diff --git a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
index fc7018d15b9..d079197a2a9 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
@@ -414,6 +414,12 @@ brw_meta_stencil_blit(struct brw_context *brw,
    GLenum target;
 
    _mesa_meta_fb_tex_blit_begin(ctx, &blit);
+   /* XXX: Pretend to support stencil textures so _mesa_base_tex_format()
+    * returns a valid format.  When we properly support the extension, we
+    * should remove this.
+    */
+   assert(ctx->Extensions.ARB_texture_stencil8 == false);
+   ctx->Extensions.ARB_texture_stencil8 = true;
 
    _mesa_GenFramebuffers(1, &fbo);
    /* Force the surface to be configured for level zero. */
@@ -451,6 +457,7 @@ brw_meta_stencil_blit(struct brw_context *brw,
    _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
 
 error:
+   ctx->Extensions.ARB_texture_stencil8 = false;
    _mesa_meta_fb_tex_blit_end(ctx, target, &blit);
    _mesa_meta_end(ctx);
 
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 67a693b5ec1..5a4515b582d 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -39,6 +39,7 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 
+#include "main/framebuffer.h"
 #include "main/fbobject.h"
 #include "main/glformats.h"
 
@@ -46,12 +47,14 @@
 static void upload_drawing_rect(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
+   const struct gl_framebuffer *fb = ctx->DrawBuffer;
+   const unsigned int fb_width = _mesa_geometric_width(fb);
+   const unsigned int fb_height = _mesa_geometric_height(fb);
 
    BEGIN_BATCH(4);
    OUT_BATCH(_3DSTATE_DRAWING_RECTANGLE << 16 | (4 - 2));
    OUT_BATCH(0); /* xmin, ymin */
-   OUT_BATCH(((ctx->DrawBuffer->Width - 1) & 0xffff) |
-	    ((ctx->DrawBuffer->Height - 1) << 16));
+   OUT_BATCH(((fb_width - 1) & 0xffff) | ((fb_height - 1) << 16));
    OUT_BATCH(0);
    ADVANCE_BATCH();
 }
@@ -767,7 +770,7 @@ static void upload_polygon_stipple_offset(struct brw_context *brw)
     * works just fine, and there's no window system to worry about.
     */
    if (_mesa_is_winsys_fbo(ctx->DrawBuffer))
-      OUT_BATCH((32 - (ctx->DrawBuffer->Height & 31)) & 31);
+      OUT_BATCH((32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31);
    else
       OUT_BATCH(0);
    ADVANCE_BATCH();
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index e4119b1aa3f..b7bb2315b97 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -122,18 +122,9 @@ brw_create_nir(struct brw_context *brw,
    /* Get rid of split copies */
    nir_optimize(nir);
 
-   if (shader_prog) {
-      nir_assign_var_locations_scalar_direct_first(nir, &nir->uniforms,
-                                                   &nir->num_direct_uniforms,
-                                                   &nir->num_uniforms);
-   } else {
-      /* ARB programs generally create a giant array of "uniform" data, and allow
-       * indirect addressing without any boundaries.  In the absence of bounds
-       * analysis, it's all or nothing.  num_direct_uniforms is only useful when
-       * we have some direct and some indirect access; it doesn't matter here.
-       */
-      nir->num_direct_uniforms = 0;
-   }
+   nir_assign_var_locations_scalar_direct_first(nir, &nir->uniforms,
+                                                &nir->num_direct_uniforms,
+                                                &nir->num_uniforms);
    nir_assign_var_locations_scalar(&nir->inputs, &nir->num_inputs);
    nir_assign_var_locations_scalar(&nir->outputs, &nir->num_outputs);
 
@@ -176,6 +167,12 @@ brw_create_nir(struct brw_context *brw,
    nir_validate_shader(nir);
 
    if (unlikely(debug_enabled)) {
+      /* Re-index SSA defs so we print more sensible numbers. */
+      nir_foreach_overload(nir, overload) {
+         if (overload->impl)
+            nir_index_ssa_defs(overload->impl);
+      }
+
       fprintf(stderr, "NIR (SSA form) for %s shader:\n",
               _mesa_shader_stage_to_string(stage));
       nir_print_shader(nir, stderr);
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index b056fbfc427..ea128ccb670 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -88,7 +88,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx,
 	 return NULL;
    }
 
-   case MESA_GEOMETRY_PROGRAM: {
+   case GL_GEOMETRY_PROGRAM_NV: {
       struct brw_geometry_program *prog = CALLOC_STRUCT(brw_geometry_program);
       if (prog) {
          prog->id = get_new_program_id(brw->intelScreen);
@@ -287,18 +287,24 @@ void brwInitFragProgFuncs( struct dd_function_table *functions )
    functions->MemoryBarrier = brw_memory_barrier;
 }
 
+struct shader_times {
+   uint64_t time;
+   uint64_t written;
+   uint64_t reset;
+};
+
 void
 brw_init_shader_time(struct brw_context *brw)
 {
-   const int max_entries = 4096;
-   brw->shader_time.bo = drm_intel_bo_alloc(brw->bufmgr, "shader time",
-                                            max_entries * SHADER_TIME_STRIDE,
-                                            4096);
+   const int max_entries = 2048;
+   brw->shader_time.bo =
+      drm_intel_bo_alloc(brw->bufmgr, "shader time",
+                         max_entries * SHADER_TIME_STRIDE * 3, 4096);
    brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
    brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
    brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
                                           max_entries);
-   brw->shader_time.cumulative = rzalloc_array(brw, uint64_t,
+   brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
                                                max_entries);
    brw->shader_time.max_entries = max_entries;
 }
@@ -319,27 +325,6 @@ compare_time(const void *a, const void *b)
 }
 
 static void
-get_written_and_reset(struct brw_context *brw, int i,
-                      uint64_t *written, uint64_t *reset)
-{
-   enum shader_time_shader_type type = brw->shader_time.types[i];
-   assert(type == ST_VS || type == ST_GS || type == ST_FS8 ||
-          type == ST_FS16 || type == ST_CS);
-
-   /* Find where we recorded written and reset. */
-   int wi, ri;
-
-   for (wi = i; brw->shader_time.types[wi] != type + 1; wi++)
-      ;
-
-   for (ri = i; brw->shader_time.types[ri] != type + 2; ri++)
-      ;
-
-   *written = brw->shader_time.cumulative[wi];
-   *reset = brw->shader_time.cumulative[ri];
-}
-
-static void
 print_shader_time_line(const char *stage, const char *name,
                        int shader_num, uint64_t time, uint64_t total)
 {
@@ -374,26 +359,13 @@ brw_report_shader_time(struct brw_context *brw)
       sorted[i] = &scaled[i];
 
       switch (type) {
-      case ST_VS_WRITTEN:
-      case ST_VS_RESET:
-      case ST_GS_WRITTEN:
-      case ST_GS_RESET:
-      case ST_FS8_WRITTEN:
-      case ST_FS8_RESET:
-      case ST_FS16_WRITTEN:
-      case ST_FS16_RESET:
-      case ST_CS_WRITTEN:
-      case ST_CS_RESET:
-         /* We'll handle these when along with the time. */
-         scaled[i] = 0;
-         continue;
-
       case ST_VS:
       case ST_GS:
       case ST_FS8:
       case ST_FS16:
       case ST_CS:
-         get_written_and_reset(brw, i, &written, &reset);
+         written = brw->shader_time.cumulative[i].written;
+         reset = brw->shader_time.cumulative[i].reset;
          break;
 
       default:
@@ -405,7 +377,7 @@ brw_report_shader_time(struct brw_context *brw)
          break;
       }
 
-      uint64_t time = brw->shader_time.cumulative[i];
+      uint64_t time = brw->shader_time.cumulative[i].time;
       if (written) {
          scaled[i] = time / written * (written + reset);
       } else {
@@ -491,16 +463,19 @@ brw_collect_shader_time(struct brw_context *brw)
     * overhead compared to the cost of tracking the time in the first place.
     */
    drm_intel_bo_map(brw->shader_time.bo, true);
-
-   uint32_t *times = brw->shader_time.bo->virtual;
+   void *bo_map = brw->shader_time.bo->virtual;
 
    for (int i = 0; i < brw->shader_time.num_entries; i++) {
-      brw->shader_time.cumulative[i] += times[i * SHADER_TIME_STRIDE / 4];
+      uint32_t *times = bo_map + i * 3 * SHADER_TIME_STRIDE;
+
+      brw->shader_time.cumulative[i].time += times[SHADER_TIME_STRIDE * 0 / 4];
+      brw->shader_time.cumulative[i].written += times[SHADER_TIME_STRIDE * 1 / 4];
+      brw->shader_time.cumulative[i].reset += times[SHADER_TIME_STRIDE * 2 / 4];
    }
 
    /* Zero the BO out to clear it out for our next collection.
     */
-   memset(times, 0, brw->shader_time.bo->size);
+   memset(bo_map, 0, brw->shader_time.bo->size);
    drm_intel_bo_unmap(brw->shader_time.bo);
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index 667c9009304..aea4d9b77d3 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -66,10 +66,20 @@ brw_write_timestamp(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
 void
 brw_write_depth_count(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
 {
-   brw_emit_pipe_control_write(brw,
-                               PIPE_CONTROL_WRITE_DEPTH_COUNT
-                               | PIPE_CONTROL_DEPTH_STALL,
-                               query_bo, idx * sizeof(uint64_t), 0, 0);
+   uint32_t flags;
+
+   flags = (PIPE_CONTROL_WRITE_DEPTH_COUNT |
+            PIPE_CONTROL_DEPTH_STALL);
+
+   /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM
+    * command when loading the values into the predicate source registers for
+    * conditional rendering.
+    */
+   if (brw->predicate.supported)
+      flags |= PIPE_CONTROL_FLUSH_ENABLE;
+
+   brw_emit_pipe_control_write(brw, flags, query_bo,
+                               idx * sizeof(uint64_t), 0, 0);
 }
 
 /**
diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h
index c03a8aed796..c8b134103bb 100644
--- a/src/mesa/drivers/dri/i965/brw_reg.h
+++ b/src/mesa/drivers/dri/i965/brw_reg.h
@@ -765,6 +765,22 @@ brw_ip_reg(void)
 }
 
 static inline struct brw_reg
+brw_notification_reg(void)
+{
+   return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                  BRW_ARF_NOTIFICATION_COUNT,
+                  0,
+                  0,
+                  0,
+                  BRW_REGISTER_TYPE_UD,
+                  BRW_VERTICAL_STRIDE_0,
+                  BRW_WIDTH_1,
+                  BRW_HORIZONTAL_STRIDE_0,
+                  BRW_SWIZZLE_XXXX,
+                  WRITEMASK_X);
+}
+
+static inline struct brw_reg
 brw_acc_reg(unsigned width)
 {
    return brw_vecn_reg(width, BRW_ARCHITECTURE_REGISTER_FILE,
@@ -778,7 +794,11 @@ brw_flag_reg(int reg, int subreg)
                       BRW_ARF_FLAG + reg, subreg);
 }
 
-
+/**
+ * Return the mask register present in Gen4-5, or the related register present
+ * in Gen7.5 and later hardware referred to as "channel enable" register in
+ * the documentation.
+ */
 static inline struct brw_reg
 brw_mask_reg(unsigned subnr)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 34f75fdd814..ee0add5d765 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -399,10 +399,10 @@ schedule_node::set_latency_gen7(bool is_haswell)
 
 class instruction_scheduler {
 public:
-   instruction_scheduler(backend_visitor *v, int grf_count,
+   instruction_scheduler(backend_shader *s, int grf_count,
                          instruction_scheduler_mode mode)
    {
-      this->bv = v;
+      this->bs = s;
       this->mem_ctx = ralloc_context(NULL);
       this->grf_count = grf_count;
       this->instructions.make_empty();
@@ -455,7 +455,7 @@ public:
    int grf_count;
    int time;
    exec_list instructions;
-   backend_visitor *bv;
+   backend_shader *bs;
 
    instruction_scheduler_mode mode;
 
@@ -606,7 +606,7 @@ vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *b
 schedule_node::schedule_node(backend_instruction *inst,
                              instruction_scheduler *sched)
 {
-   const struct brw_device_info *devinfo = sched->bv->devinfo;
+   const struct brw_device_info *devinfo = sched->bs->devinfo;
 
    this->inst = inst;
    this->child_array_size = 0;
@@ -1384,7 +1384,7 @@ vec4_instruction_scheduler::issue_time(backend_instruction *inst)
 void
 instruction_scheduler::schedule_instructions(bblock_t *block)
 {
-   const struct brw_device_info *devinfo = bv->devinfo;
+   const struct brw_device_info *devinfo = bs->devinfo;
    backend_instruction *inst = block->end();
    time = 0;
 
@@ -1419,7 +1419,7 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
 
       if (debug) {
          fprintf(stderr, "clock %4d, scheduled: ", time);
-         bv->dump_instruction(chosen->inst);
+         bs->dump_instruction(chosen->inst);
       }
 
       /* Now that we've scheduled a new instruction, some of its
@@ -1435,7 +1435,7 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
 
          if (debug) {
             fprintf(stderr, "\tchild %d, %d parents: ", i, child->parent_count);
-            bv->dump_instruction(child->inst);
+            bs->dump_instruction(child->inst);
          }
 
          child->cand_generation = cand_generation;
@@ -1474,7 +1474,7 @@ instruction_scheduler::run(cfg_t *cfg)
    if (debug) {
       fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
               post_reg_alloc);
-      bv->dump_instructions();
+      bs->dump_instructions();
    }
 
    /* Populate the remaining GRF uses array to improve the pre-regalloc
@@ -1504,7 +1504,7 @@ instruction_scheduler::run(cfg_t *cfg)
    if (debug) {
       fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
               post_reg_alloc);
-      bv->dump_instructions();
+      bs->dump_instructions();
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index 014b43448ad..5d9892214a9 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -52,6 +52,12 @@ static void upload_sf_vp(struct brw_context *brw)
 			 sizeof(*sfv), 32, &brw->sf.vp_offset);
    memset(sfv, 0, sizeof(*sfv));
 
+   /* Accessing the fields Width and Height of gl_framebuffer to produce the
+    * values to program the viewport and scissor is fine as long as the
+    * gl_framebuffer has atleast one attachment.
+    */
+   assert(ctx->DrawBuffer->_HasAttachments);
+
    if (render_to_fbo) {
       y_scale = 1.0;
       y_bias = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index ebfb49acf8d..06393c8ff2b 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -32,16 +32,106 @@
 #include "glsl/glsl_parser_extras.h"
 #include "main/shaderapi.h"
 
+static void
+shader_debug_log_mesa(void *data, const char *fmt, ...)
+{
+   struct brw_context *brw = (struct brw_context *)data;
+   va_list args;
+
+   va_start(args, fmt);
+   GLuint msg_id = 0;
+   _mesa_gl_vdebug(&brw->ctx, &msg_id,
+                   MESA_DEBUG_SOURCE_SHADER_COMPILER,
+                   MESA_DEBUG_TYPE_OTHER,
+                   MESA_DEBUG_SEVERITY_NOTIFICATION, fmt, args);
+   va_end(args);
+}
+
+static void
+shader_perf_log_mesa(void *data, const char *fmt, ...)
+{
+   struct brw_context *brw = (struct brw_context *)data;
+
+   va_list args;
+   va_start(args, fmt);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
+      va_list args_copy;
+      va_copy(args_copy, args);
+      vfprintf(stderr, fmt, args_copy);
+      va_end(args_copy);
+   }
+
+   if (brw->perf_debug) {
+      GLuint msg_id = 0;
+      _mesa_gl_vdebug(&brw->ctx, &msg_id,
+                      MESA_DEBUG_SOURCE_SHADER_COMPILER,
+                      MESA_DEBUG_TYPE_PERFORMANCE,
+                      MESA_DEBUG_SEVERITY_MEDIUM, fmt, args);
+   }
+   va_end(args);
+}
+
 struct brw_compiler *
 brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
 {
    struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);
 
    compiler->devinfo = devinfo;
+   compiler->shader_debug_log = shader_debug_log_mesa;
+   compiler->shader_perf_log = shader_perf_log_mesa;
 
    brw_fs_alloc_reg_sets(compiler);
    brw_vec4_alloc_reg_set(compiler);
 
+   if (devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS))
+      compiler->scalar_vs = true;
+
+   nir_shader_compiler_options *nir_options =
+      rzalloc(compiler, nir_shader_compiler_options);
+   nir_options->native_integers = true;
+   /* In order to help allow for better CSE at the NIR level we tell NIR
+    * to split all ffma instructions during opt_algebraic and we then
+    * re-combine them as a later step.
+    */
+   nir_options->lower_ffma = true;
+   nir_options->lower_sub = true;
+
+   /* We want the GLSL compiler to emit code that uses condition codes */
+   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
+      compiler->glsl_compiler_options[i].MaxUnrollIterations = 32;
+      compiler->glsl_compiler_options[i].MaxIfDepth =
+         devinfo->gen < 6 ? 16 : UINT_MAX;
+
+      compiler->glsl_compiler_options[i].EmitCondCodes = true;
+      compiler->glsl_compiler_options[i].EmitNoNoise = true;
+      compiler->glsl_compiler_options[i].EmitNoMainReturn = true;
+      compiler->glsl_compiler_options[i].EmitNoIndirectInput = true;
+      compiler->glsl_compiler_options[i].EmitNoIndirectOutput =
+	 (i == MESA_SHADER_FRAGMENT);
+      compiler->glsl_compiler_options[i].EmitNoIndirectTemp =
+	 (i == MESA_SHADER_FRAGMENT);
+      compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false;
+      compiler->glsl_compiler_options[i].LowerClipDistance = true;
+   }
+
+   compiler->glsl_compiler_options[MESA_SHADER_VERTEX].OptimizeForAOS = true;
+   compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].OptimizeForAOS = true;
+
+   if (compiler->scalar_vs) {
+      /* If we're using the scalar backend for vertex shaders, we need to
+       * configure these accordingly.
+       */
+      compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectOutput = true;
+      compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectTemp = true;
+      compiler->glsl_compiler_options[MESA_SHADER_VERTEX].OptimizeForAOS = false;
+
+      compiler->glsl_compiler_options[MESA_SHADER_VERTEX].NirOptions = nir_options;
+   }
+
+   compiler->glsl_compiler_options[MESA_SHADER_FRAGMENT].NirOptions = nir_options;
+   compiler->glsl_compiler_options[MESA_SHADER_COMPUTE].NirOptions = nir_options;
+
    return compiler;
 }
 
@@ -97,7 +187,7 @@ is_scalar_shader_stage(struct brw_context *brw, int stage)
    case MESA_SHADER_FRAGMENT:
       return true;
    case MESA_SHADER_VERTEX:
-      return brw->scalar_vs;
+      return brw->intelScreen->compiler->scalar_vs;
    default:
       return false;
    }
@@ -632,6 +722,8 @@ brw_instruction_name(enum opcode op)
       return "gs_ff_sync_set_primitives";
    case CS_OPCODE_CS_TERMINATE:
       return "cs_terminate";
+   case SHADER_OPCODE_BARRIER:
+      return "barrier";
    }
 
    unreachable("not reached");
@@ -755,19 +847,22 @@ brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg)
    return false;
 }
 
-backend_visitor::backend_visitor(struct brw_context *brw,
-                                 struct gl_shader_program *shader_prog,
-                                 struct gl_program *prog,
-                                 struct brw_stage_prog_data *stage_prog_data,
-                                 gl_shader_stage stage)
-   : brw(brw),
-     devinfo(brw->intelScreen->devinfo),
-     ctx(&brw->ctx),
+backend_shader::backend_shader(const struct brw_compiler *compiler,
+                               void *log_data,
+                               void *mem_ctx,
+                               struct gl_shader_program *shader_prog,
+                               struct gl_program *prog,
+                               struct brw_stage_prog_data *stage_prog_data,
+                               gl_shader_stage stage)
+   : compiler(compiler),
+     log_data(log_data),
+     devinfo(compiler->devinfo),
      shader(shader_prog ?
         (struct brw_shader *)shader_prog->_LinkedShaders[stage] : NULL),
      shader_prog(shader_prog),
      prog(prog),
      stage_prog_data(stage_prog_data),
+     mem_ctx(mem_ctx),
      cfg(NULL),
      stage(stage)
 {
@@ -950,7 +1045,6 @@ backend_instruction::can_do_saturate() const
    case BRW_OPCODE_LINE:
    case BRW_OPCODE_LRP:
    case BRW_OPCODE_MAC:
-   case BRW_OPCODE_MACH:
    case BRW_OPCODE_MAD:
    case BRW_OPCODE_MATH:
    case BRW_OPCODE_MOV:
@@ -1060,6 +1154,7 @@ backend_instruction::has_side_effects() const
    case SHADER_OPCODE_MEMORY_FENCE:
    case SHADER_OPCODE_URB_WRITE_SIMD8:
    case FS_OPCODE_FB_WRITE:
+   case SHADER_OPCODE_BARRIER:
       return true;
    default:
       return false;
@@ -1148,13 +1243,13 @@ backend_instruction::remove(bblock_t *block)
 }
 
 void
-backend_visitor::dump_instructions()
+backend_shader::dump_instructions()
 {
    dump_instructions(NULL);
 }
 
 void
-backend_visitor::dump_instructions(const char *name)
+backend_shader::dump_instructions(const char *name)
 {
    FILE *file = stderr;
    if (name && geteuid() != 0) {
@@ -1183,7 +1278,7 @@ backend_visitor::dump_instructions(const char *name)
 }
 
 void
-backend_visitor::calculate_cfg()
+backend_shader::calculate_cfg()
 {
    if (this->cfg)
       return;
@@ -1191,7 +1286,7 @@ backend_visitor::calculate_cfg()
 }
 
 void
-backend_visitor::invalidate_cfg()
+backend_shader::invalidate_cfg()
 {
    ralloc_free(this->cfg);
    this->cfg = NULL;
@@ -1206,7 +1301,7 @@ backend_visitor::invalidate_cfg()
  * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES.
  */
 void
-backend_visitor::assign_common_binding_table_offsets(uint32_t next_binding_table_offset)
+backend_shader::assign_common_binding_table_offsets(uint32_t next_binding_table_offset)
 {
    int num_textures = _mesa_fls(prog->SamplersUsed);
 
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index 59a0eff824e..b2c1a0b8d69 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -86,6 +86,12 @@ struct brw_compiler {
        */
       int aligned_pairs_class;
    } fs_reg_sets[2];
+
+   void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
+   void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
+
+   bool scalar_vs;
+   struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];
 };
 
 enum PACKED register_file {
@@ -211,20 +217,23 @@ enum instruction_scheduler_mode {
    SCHEDULE_POST,
 };
 
-class backend_visitor : public ir_visitor {
+class backend_shader {
 protected:
 
-   backend_visitor(struct brw_context *brw,
-                   struct gl_shader_program *shader_prog,
-                   struct gl_program *prog,
-                   struct brw_stage_prog_data *stage_prog_data,
-                   gl_shader_stage stage);
+   backend_shader(const struct brw_compiler *compiler,
+                  void *log_data,
+                  void *mem_ctx,
+                  struct gl_shader_program *shader_prog,
+                  struct gl_program *prog,
+                  struct brw_stage_prog_data *stage_prog_data,
+                  gl_shader_stage stage);
 
 public:
 
-   struct brw_context * const brw;
+   const struct brw_compiler *compiler;
+   void *log_data; /* Passed to compiler->*_log functions */
+
    const struct brw_device_info * const devinfo;
-   struct gl_context * const ctx;
    struct brw_shader * const shader;
    struct gl_shader_program * const shader_prog;
    struct gl_program * const prog;
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 26fdae64ea4..987672f8815 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -229,11 +229,14 @@ void brw_destroy_caches( struct brw_context *brw );
 #define BRW_BATCH_STRUCT(brw, s) \
    intel_batchbuffer_data(brw, (s), sizeof(*(s)), RENDER_RING)
 
-void *brw_state_batch(struct brw_context *brw,
-		      enum aub_state_struct_type type,
-		      int size,
-		      int alignment,
-		      uint32_t *out_offset);
+void *__brw_state_batch(struct brw_context *brw,
+                        enum aub_state_struct_type type,
+                        int size,
+                        int alignment,
+                        int index,
+                        uint32_t *out_offset);
+#define brw_state_batch(brw, type, size, alignment, out_offset) \
+   __brw_state_batch(brw, type, size, alignment, 0, out_offset)
 
 /* brw_wm_surface_state.c */
 void gen4_init_vtable_surface_functions(struct brw_context *brw);
@@ -246,6 +249,7 @@ void brw_configure_w_tiled(const struct intel_mipmap_tree *mt,
                            unsigned *pitch, uint32_t *tiling,
                            unsigned *format);
 
+const char *brw_surface_format_name(unsigned format);
 uint32_t brw_format_for_mesa_format(mesa_format mesa_format);
 
 GLuint translate_tex_target(GLenum target);
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index 45dca69823f..a405a80ef6e 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -38,7 +38,8 @@ static void
 brw_track_state_batch(struct brw_context *brw,
 		      enum aub_state_struct_type type,
 		      uint32_t offset,
-		      int size)
+                      int size,
+                      int index)
 {
    struct intel_batchbuffer *batch = &brw->batch;
 
@@ -53,6 +54,7 @@ brw_track_state_batch(struct brw_context *brw,
    brw->state_batch_list[brw->state_batch_count].offset = offset;
    brw->state_batch_list[brw->state_batch_count].size = size;
    brw->state_batch_list[brw->state_batch_count].type = type;
+   brw->state_batch_list[brw->state_batch_count].index = index;
    brw->state_batch_count++;
 }
 
@@ -108,18 +110,20 @@ brw_annotate_aub(struct brw_context *brw)
  * margin (4096 bytes, even if the object is just a 20-byte surface
  * state), and more buffers to walk and count for aperture size checking.
  *
- * However, due to the restrictions inposed by the aperture size
+ * However, due to the restrictions imposed by the aperture size
  * checking performance hacks, we can't have the batch point at a
  * separate indirect state buffer, because once the batch points at
  * it, no more relocations can be added to it.  So, we sneak these
  * buffers in at the top of the batchbuffer.
  */
 void *
-brw_state_batch(struct brw_context *brw,
-		enum aub_state_struct_type type,
-		int size,
-		int alignment,
-		uint32_t *out_offset)
+__brw_state_batch(struct brw_context *brw,
+                  enum aub_state_struct_type type,
+                  int size,
+                  int alignment,
+                  int index,
+                  uint32_t *out_offset)
+
 {
    struct intel_batchbuffer *batch = &brw->batch;
    uint32_t offset;
@@ -140,7 +144,7 @@ brw_state_batch(struct brw_context *brw,
    batch->state_batch_offset = offset;
 
    if (unlikely(INTEL_DEBUG & (DEBUG_BATCH | DEBUG_AUB)))
-      brw_track_state_batch(brw, type, offset, size);
+      brw_track_state_batch(brw, type, offset, size, index);
 
    *out_offset = offset;
    return batch->map + (offset>>2);
diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index 530f5a8b76e..b6f4d598e1d 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2007 Intel Corporation
+ * Copyright © 2007-2015 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -31,6 +31,41 @@
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_eu.h"
+#include "brw_state.h"
+
+static const char *sampler_mip_filter[] = {
+   "NONE",
+   "NEAREST",
+   "RSVD",
+   "LINEAR"
+};
+
+static const char *sampler_mag_filter[] = {
+   "NEAREST",
+   "LINEAR",
+   "ANISOTROPIC",
+   "FLEXIBLE (GEN8+)",
+   "RSVD", "RSVD",
+   "MONO",
+   "RSVD"
+};
+
+static const char *sampler_addr_mode[] = {
+   "WRAP",
+   "MIRROR",
+   "CLAMP",
+   "CUBE",
+   "CLAMP_BORDER",
+   "MIRROR_ONCE",
+   "HALF_BORDER"
+};
+
+static const char *surface_tiling[] = {
+   "LINEAR",
+   "W-tiled",
+   "X-tiled",
+   "Y-tiled"
+};
 
 static void
 batch_out(struct brw_context *brw, const char *name, uint32_t offset,
@@ -50,6 +85,25 @@ batch_out(struct brw_context *brw, const char *name, uint32_t offset,
    va_end(va);
 }
 
+static void
+batch_out64(struct brw_context *brw, const char *name, uint32_t offset,
+            int index, char *fmt, ...)
+{
+   uint32_t *tmp = brw->batch.bo->virtual + offset;
+
+   /* Swap the dwords since we want to handle this as a 64b value, but the data
+    * is typically emitted as dwords.
+    */
+   uint64_t data = ((uint64_t)tmp[index + 1]) << 32 | tmp[index];
+   va_list va;
+
+   fprintf(stderr, "0x%08x:      0x%016" PRIx64 ": %8s: ",
+          offset + index * 4, data, name);
+   va_start(va, fmt);
+   vfprintf(stderr, fmt, va);
+   va_end(va);
+}
+
 static const char *
 get_965_surfacetype(unsigned int surfacetype)
 {
@@ -64,19 +118,6 @@ get_965_surfacetype(unsigned int surfacetype)
     }
 }
 
-static const char *
-get_965_surface_format(unsigned int surface_format)
-{
-    switch (surface_format) {
-    case 0x000: return "r32g32b32a32_float";
-    case 0x0c1: return "b8g8r8a8_unorm";
-    case 0x100: return "b5g6r5_unorm";
-    case 0x102: return "b5g5r5a1_unorm";
-    case 0x104: return "b4g4r4a4_unorm";
-    default: return "unknown";
-    }
-}
-
 static void dump_vs_state(struct brw_context *brw, uint32_t offset)
 {
    const char *name = "VS_STATE";
@@ -176,7 +217,7 @@ static void dump_surface_state(struct brw_context *brw, uint32_t offset)
 
    batch_out(brw, name, offset, 0, "%s %s\n",
 	     get_965_surfacetype(GET_FIELD(surf[0], BRW_SURFACE_TYPE)),
-	     get_965_surface_format(GET_FIELD(surf[0], BRW_SURFACE_FORMAT)));
+             brw_surface_format_name(GET_FIELD(surf[0], BRW_SURFACE_FORMAT)));
    batch_out(brw, name, offset, 1, "offset\n");
    batch_out(brw, name, offset, 2, "%dx%d size, %d mips\n",
 	     GET_FIELD(surf[2], BRW_SURFACE_WIDTH) + 1,
@@ -200,7 +241,7 @@ static void dump_gen7_surface_state(struct brw_context *brw, uint32_t offset)
 
    batch_out(brw, name, offset, 0, "%s %s %s\n",
              get_965_surfacetype(GET_FIELD(surf[0], BRW_SURFACE_TYPE)),
-             get_965_surface_format(GET_FIELD(surf[0], BRW_SURFACE_FORMAT)),
+             brw_surface_format_name(GET_FIELD(surf[0], BRW_SURFACE_FORMAT)),
              (surf[0] & GEN7_SURFACE_IS_ARRAY) ? "array" : "");
    batch_out(brw, name, offset, 1, "offset\n");
    batch_out(brw, name, offset, 2, "%dx%d size, %d mips, %d slices\n",
@@ -222,6 +263,87 @@ static void dump_gen7_surface_state(struct brw_context *brw, uint32_t offset)
    batch_out(brw, name, offset, 7, "\n");
 }
 
+static float q_to_float(uint32_t data, int integer_end, int integer_start,
+                        int fractional_end, int fractional_start)
+{
+   /* Convert the number to floating point. */
+   float n = GET_BITS(data, integer_start, fractional_end);
+
+   /* Multiply by 2^-n */
+   return n * exp2(-(fractional_end - fractional_start + 1));
+}
+
+static void
+dump_gen8_surface_state(struct brw_context *brw, uint32_t offset, int index)
+{
+   uint32_t *surf = brw->batch.bo->virtual + offset;
+   int aux_mode = surf[6] & INTEL_MASK(2, 0);
+   const char *aux_str;
+   char *name;
+
+   if (brw->gen >= 9 && (aux_mode == 1 || aux_mode == 5)) {
+      bool msrt = GET_BITS(surf[4], 5, 3) > 0;
+      bool compression = GET_FIELD(surf[7], GEN9_SURFACE_RT_COMPRESSION) == 1;
+      aux_str = ralloc_asprintf(NULL, "AUX_CCS_%c (%s, MULTISAMPLE_COUNT%c1)",
+                                (aux_mode == 1) ? 'D' : 'E',
+                                compression ? "Compressed RT" : "Uncompressed",
+                                msrt ? '>' : '=');
+   } else {
+      static const char *surface_aux_mode[] = { "AUX_NONE", "AUX_MCS",
+                                                "AUX_APPEND", "AUX_HIZ",
+                                                "RSVD", "RSVD"};
+      aux_str = ralloc_asprintf(NULL, "%s", surface_aux_mode[aux_mode]);
+   }
+
+   name = ralloc_asprintf(NULL, "SURF%03d", index);
+   batch_out(brw, name, offset, 0, "%s %s %s VALIGN%d HALIGN%d %s\n",
+             get_965_surfacetype(GET_FIELD(surf[0], BRW_SURFACE_TYPE)),
+             brw_surface_format_name(GET_FIELD(surf[0], BRW_SURFACE_FORMAT)),
+             (surf[0] & GEN7_SURFACE_IS_ARRAY) ? "array" : "",
+             1 << (GET_BITS(surf[0], 17, 16) + 1), /* VALIGN */
+             1 << (GET_BITS(surf[0], 15, 14) + 1), /* HALIGN */
+             surface_tiling[GET_BITS(surf[0], 13, 12)]);
+   batch_out(brw, name, offset, 1, "MOCS: 0x%x Base MIP: %.1f (%u mips) Surface QPitch: %d\n",
+             GET_FIELD(surf[1], GEN8_SURFACE_MOCS),
+             q_to_float(surf[1], 23, 20, 19, 19),
+             surf[5] & INTEL_MASK(3, 0),
+             GET_FIELD(surf[1], GEN8_SURFACE_QPITCH) << 2);
+   batch_out(brw, name, offset, 2, "%dx%d [%s]\n",
+             GET_FIELD(surf[2], GEN7_SURFACE_WIDTH) + 1,
+             GET_FIELD(surf[2], GEN7_SURFACE_HEIGHT) + 1,
+             aux_str);
+   batch_out(brw, name, offset, 3, "%d slices (depth), pitch: %d\n",
+             GET_FIELD(surf[3], BRW_SURFACE_DEPTH) + 1,
+             (surf[3] & INTEL_MASK(17, 0)) + 1);
+   batch_out(brw, name, offset, 4, "min array element: %d, array extent %d, MULTISAMPLE_%d\n",
+             GET_FIELD(surf[4], GEN7_SURFACE_MIN_ARRAY_ELEMENT),
+             GET_FIELD(surf[4], GEN7_SURFACE_RENDER_TARGET_VIEW_EXTENT) + 1,
+             1 << GET_BITS(surf[4], 5, 3));
+   batch_out(brw, name, offset, 5, "x,y offset: %d,%d, min LOD: %d\n",
+             GET_FIELD(surf[5], BRW_SURFACE_X_OFFSET),
+             GET_FIELD(surf[5], BRW_SURFACE_Y_OFFSET),
+             GET_FIELD(surf[5], GEN7_SURFACE_MIN_LOD));
+   batch_out(brw, name, offset, 6, "AUX pitch: %d qpitch: %d\n",
+             GET_FIELD(surf[6], GEN8_SURFACE_AUX_QPITCH) << 2,
+             GET_FIELD(surf[6], GEN8_SURFACE_AUX_PITCH) << 2);
+   if (brw->gen >= 9) {
+      batch_out(brw, name, offset, 7, "Clear color: R(%x)G(%x)B(%x)A(%x)\n",
+                surf[12], surf[13], surf[14], surf[15]);
+   } else {
+      batch_out(brw, name, offset, 7, "Clear color: %c%c%c%c\n",
+                GET_BITS(surf[7], 31, 31) ? 'R' : '-',
+                GET_BITS(surf[7], 30, 30) ? 'G' : '-',
+                GET_BITS(surf[7], 29, 29) ? 'B' : '-',
+                GET_BITS(surf[7], 28, 28) ? 'A' : '-');
+   }
+
+   for (int i = 8; i < 12; i++)
+      batch_out(brw, name, offset, i, "0x%08x\n", surf[i]);
+
+   ralloc_free((void *)aux_str);
+   ralloc_free(name);
+}
+
 static void
 dump_sdc(struct brw_context *brw, uint32_t offset)
 {
@@ -229,7 +351,7 @@ dump_sdc(struct brw_context *brw, uint32_t offset)
 
    if (brw->gen >= 5 && brw->gen <= 6) {
       struct gen5_sampler_default_color *sdc = (brw->batch.bo->virtual +
-						offset);
+                                                offset);
       batch_out(brw, name, offset, 0, "unorm rgba\n");
       batch_out(brw, name, offset, 1, "r %f\n", sdc->f[0]);
       batch_out(brw, name, offset, 2, "b %f\n", sdc->f[1]);
@@ -271,6 +393,45 @@ static void dump_sampler_state(struct brw_context *brw,
    }
 }
 
+static void gen7_dump_sampler_state(struct brw_context *brw,
+                                    uint32_t offset, uint32_t size)
+{
+   const uint32_t *samp = brw->batch.bo->virtual + offset;
+   char name[20];
+
+   for (int i = 0; i < size / 16; i++) {
+      sprintf(name, "SAMPLER_STATE %d", i);
+      batch_out(brw, name, offset, i,
+                "Disabled = %s, Base Mip: %u.%u, Mip/Mag/Min Filter: %s/%s/%s, LOD Bias: %d.%d\n",
+                GET_BITS(samp[0], 31, 31) ? "yes" : "no",
+                GET_BITS(samp[0], 26, 23),
+                GET_BITS(samp[0], 22, 22),
+                sampler_mip_filter[GET_FIELD(samp[0], BRW_SAMPLER_MIP_FILTER)],
+                sampler_mag_filter[GET_FIELD(samp[0], BRW_SAMPLER_MAG_FILTER)],
+                /* min filter defs are the same as mag */
+                sampler_mag_filter[GET_FIELD(samp[0], BRW_SAMPLER_MIN_FILTER)],
+                GET_BITS(samp[0], 13, 10),
+                GET_BITS(samp[0], 9, 1)
+               );
+      batch_out(brw, name, offset, i+1, "Min LOD: %u.%u, Max LOD: %u.%u\n",
+                GET_BITS(samp[1], 31, 28),
+                GET_BITS(samp[1], 27, 20),
+                GET_BITS(samp[1], 19, 16),
+                GET_BITS(samp[1], 15, 8)
+               );
+      batch_out(brw, name, offset, i+2, "Border Color\n"); /* FINISHME: gen8+ */
+      batch_out(brw, name, offset, i+3, "Max aniso: RATIO %d:1, TC[XYZ] Address Control: %s|%s|%s\n",
+                (GET_FIELD(samp[3], BRW_SAMPLER_MAX_ANISOTROPY) + 1) * 2,
+                sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCX_WRAP_MODE)],
+                sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCY_WRAP_MODE)],
+                sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCZ_WRAP_MODE)]
+               );
+
+      samp += 4;
+      offset += 4 * sizeof(uint32_t);
+   }
+}
+
 static void dump_sf_viewport_state(struct brw_context *brw,
 				   uint32_t offset)
 {
@@ -320,10 +481,17 @@ static void dump_sf_clip_viewport_state(struct brw_context *brw,
    batch_out(brw, name, offset, 3, "m30 = %f\n", vp->viewport.m30);
    batch_out(brw, name, offset, 4, "m31 = %f\n", vp->viewport.m31);
    batch_out(brw, name, offset, 5, "m32 = %f\n", vp->viewport.m32);
-   batch_out(brw, name, offset, 6, "guardband xmin = %f\n", vp->guardband.xmin);
-   batch_out(brw, name, offset, 7, "guardband xmax = %f\n", vp->guardband.xmax);
-   batch_out(brw, name, offset, 8, "guardband ymin = %f\n", vp->guardband.ymin);
-   batch_out(brw, name, offset, 9, "guardband ymax = %f\n", vp->guardband.ymax);
+   batch_out(brw, name, offset, 8, "guardband xmin = %f\n", vp->guardband.xmin);
+   batch_out(brw, name, offset, 9, "guardband xmax = %f\n", vp->guardband.xmax);
+   batch_out(brw, name, offset, 9, "guardband ymin = %f\n", vp->guardband.ymin);
+   batch_out(brw, name, offset, 10, "guardband ymax = %f\n", vp->guardband.ymax);
+   if (brw->gen >= 8) {
+      float *cc_vp = brw->batch.bo->virtual + offset;
+      batch_out(brw, name, offset, 12, "Min extents: %.2fx%.2f\n",
+                cc_vp[12], cc_vp[14]);
+      batch_out(brw, name, offset, 14, "Max extents: %.2fx%.2f\n",
+                cc_vp[13], cc_vp[15]);
+   }
 }
 
 
@@ -398,6 +566,92 @@ static void dump_blend_state(struct brw_context *brw, uint32_t offset)
 }
 
 static void
+gen8_dump_blend_state(struct brw_context *brw, uint32_t offset, uint32_t size)
+{
+   const uint32_t *blend = brw->batch.bo->virtual + offset;
+   const char *logicop[] =
+   {
+        "LOGICOP_CLEAR (BLACK)",
+        "LOGICOP_NOR",
+        "LOGICOP_AND_INVERTED",
+        "LOGICOP_COPY_INVERTED",
+        "LOGICOP_AND_REVERSE",
+        "LOGICOP_INVERT",
+        "LOGICOP_XOR",
+        "LOGICOP_NAND",
+        "LOGICOP_AND",
+        "LOGICOP_EQUIV",
+        "LOGICOP_NOOP",
+        "LOGICOP_OR_INVERTED",
+        "LOGICOP_COPY",
+        "LOGICOP_OR_REVERSE",
+        "LOGICOP_OR",
+        "LOGICOP_SET (WHITE)"
+   };
+
+   const char *blend_function[] =
+   { "ADD", "SUBTRACT", "REVERSE_SUBTRACT", "MIN", "MAX};" };
+
+   const char *blend_factor[0x1b] =
+   {
+      "RSVD",
+      "ONE",
+      "SRC_COLOR", "SRC_ALPHA",
+      "DST_ALPHA", "DST_COLOR",
+      "SRC_ALPHA_SATURATE",
+      "CONST_COLOR", "CONST_ALPHA",
+      "SRC1_COLOR", "SRC1_ALPHA",
+      "RSVD", "RSVD", "RSVD", "RSVD", "RSVD", "RSVD",
+      "ZERO",
+      "INV_SRC_COLOR", "INV_SRC_ALPHA",
+      "INV_DST_ALPHA", "INV_DST_COLOR",
+      "RSVD",
+      "INV_CONST_COLOR", "INV_CONST_ALPHA",
+      "INV_SRC1_COLOR", "INV_SRC1_ALPHA"
+   };
+
+   batch_out(brw, "BLEND", offset, 0, "Alpha blend/test\n");
+
+   if (((size) % 2) != 0)
+      fprintf(stderr, "Invalid blend state size %d\n", size);
+
+   for (int i = 1; i < size / 4; i += 2) {
+      char name[sizeof("BLEND_ENTRYXXX")];
+      sprintf(name, "BLEND_ENTRY%02d", (i - 1) / 2);
+      if (blend[i + 1] & GEN8_BLEND_LOGIC_OP_ENABLE) {
+         batch_out(brw, name, offset, i + 1, "%s\n",
+                   logicop[GET_FIELD(blend[i + 1],
+                                     GEN8_BLEND_LOGIC_OP_FUNCTION)]);
+      } else if (blend[i] & GEN8_BLEND_COLOR_BUFFER_BLEND_ENABLE) {
+         batch_out64(brw, name, offset, i,
+                   "\n\t\t\tColor Buffer Blend factor %s,%s,%s,%s (src,dst,src alpha, dst alpha)"
+                   "\n\t\t\tfunction %s,%s (color, alpha), Disables: %c%c%c%c\n",
+                   blend_factor[GET_FIELD(blend[i],
+                                          GEN8_BLEND_SRC_BLEND_FACTOR)],
+                   blend_factor[GET_FIELD(blend[i],
+                                          GEN8_BLEND_DST_BLEND_FACTOR)],
+                   blend_factor[GET_FIELD(blend[i],
+                                          GEN8_BLEND_SRC_ALPHA_BLEND_FACTOR)],
+                   blend_factor[GET_FIELD(blend[i],
+                                          GEN8_BLEND_DST_ALPHA_BLEND_FACTOR)],
+                   blend_function[GET_FIELD(blend[i],
+                                            GEN8_BLEND_COLOR_BLEND_FUNCTION)],
+                   blend_function[GET_FIELD(blend[i],
+                                            GEN8_BLEND_ALPHA_BLEND_FUNCTION)],
+                   blend[i] & GEN8_BLEND_WRITE_DISABLE_RED ? 'R' : '-',
+                   blend[i] & GEN8_BLEND_WRITE_DISABLE_GREEN ? 'G' : '-',
+                   blend[i] & GEN8_BLEND_WRITE_DISABLE_BLUE ? 'B' : '-',
+                   blend[i] & GEN8_BLEND_WRITE_DISABLE_ALPHA ? 'A' : '-'
+                   );
+      } else if (!blend[i] && (blend[i + 1] == 0xb)) {
+         batch_out64(brw, name, offset, i, "NOP blend state\n");
+      } else {
+         batch_out64(brw, name, offset, i, "????\n");
+      }
+   }
+}
+
+static void
 dump_scissor(struct brw_context *brw, uint32_t offset)
 {
    const char *name = "SCISSOR";
@@ -555,20 +809,29 @@ dump_state_batch(struct brw_context *brw)
 	    dump_cc_state_gen4(brw, offset);
 	 break;
       case AUB_TRACE_BLEND_STATE:
-	 dump_blend_state(brw, offset);
+         if (brw->gen >= 8)
+            gen8_dump_blend_state(brw, offset, size);
+         else
+            dump_blend_state(brw, offset);
 	 break;
       case AUB_TRACE_BINDING_TABLE:
 	 dump_binding_table(brw, offset, size);
 	 break;
       case AUB_TRACE_SURFACE_STATE:
-	 if (brw->gen < 7) {
-	    dump_surface_state(brw, offset);
-	 } else {
+         if (brw->gen >= 8) {
+            dump_gen8_surface_state(brw, offset,
+                                    brw->state_batch_list[i].index);
+         } else if (brw->gen >= 7) {
 	    dump_gen7_surface_state(brw, offset);
-	 }
+         } else {
+            dump_surface_state(brw, offset);
+         }
 	 break;
       case AUB_TRACE_SAMPLER_STATE:
-         dump_sampler_state(brw, offset, size);
+         if (brw->gen >= 7)
+            gen7_dump_sampler_state(brw, offset, size);
+         else
+            dump_sampler_state(brw, offset, size);
 	 break;
       case AUB_TRACE_SAMPLER_DEFAULT_COLOR:
 	 dump_sdc(brw, offset);
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 84b0861aaad..08d1ac28885 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -41,6 +41,7 @@
 #include "brw_gs.h"
 #include "brw_wm.h"
 #include "brw_cs.h"
+#include "main/framebuffer.h"
 
 static const struct brw_tracked_state *gen4_atoms[] =
 {
@@ -660,6 +661,7 @@ brw_upload_pipeline_state(struct brw_context *brw,
    int i;
    static int dirty_count = 0;
    struct brw_state_flags state = brw->state.pipelines[pipeline];
+   unsigned int fb_samples = _mesa_geometric_samples(ctx->DrawBuffer);
 
    brw_select_pipeline(brw, pipeline);
 
@@ -696,8 +698,8 @@ brw_upload_pipeline_state(struct brw_context *brw,
       brw->ctx.NewDriverState |= BRW_NEW_META_IN_PROGRESS;
    }
 
-   if (brw->num_samples != ctx->DrawBuffer->Visual.samples) {
-      brw->num_samples = ctx->DrawBuffer->Visual.samples;
+   if (brw->num_samples != fb_samples) {
+      brw->num_samples = fb_samples;
       brw->ctx.NewDriverState |= BRW_NEW_NUM_SAMPLES;
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.c b/src/mesa/drivers/dri/i965/brw_surface_formats.c
index 016f87a4c2a..05016067bba 100644
--- a/src/mesa/drivers/dri/i965/brw_surface_formats.c
+++ b/src/mesa/drivers/dri/i965/brw_surface_formats.c
@@ -39,13 +39,14 @@ struct surface_format_info {
    int input_vb;
    int streamed_output_vb;
    int color_processing;
+   const char *name;
 };
 
 /* This macro allows us to write the table almost as it appears in the PRM,
  * while restructuring it to turn it into the C code we want.
  */
 #define SF(sampl, filt, shad, ck, rt, ab, vb, so, color, sf) \
-   [sf] = { true, sampl, filt, shad, ck, rt, ab, vb, so, color },
+   [BRW_SURFACEFORMAT_##sf] = { true, sampl, filt, shad, ck, rt, ab, vb, so, color, #sf},
 
 #define Y 0
 #define x 999
@@ -73,6 +74,7 @@ struct surface_format_info {
  * VB    - Input Vertex Buffer
  * SO    - Steamed Output Vertex Buffers (transform feedback)
  * color - Color Processing
+ * sf    - Surface Format
  *
  * See page 88 of the Sandybridge PRM VOL4_Part1 PDF.
  *
@@ -85,230 +87,236 @@ struct surface_format_info {
  */
 const struct surface_format_info surface_formats[] = {
 /* smpl filt shad CK  RT  AB  VB  SO  color */
-   SF( Y, 50,  x,  x,  Y,  Y,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32B32A32_FLOAT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32B32A32_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32B32A32_UINT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32B32A32_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32B32A32_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R64G64_FLOAT)
-   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R32G32B32X32_FLOAT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32B32A32_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32B32A32_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R32G32B32A32_SFIXED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R64G64_PASSTHRU)
-   SF( Y, 50,  x,  x,  x,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32B32_FLOAT)
-   SF( Y,  x,  x,  x,  x,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32B32_SINT)
-   SF( Y,  x,  x,  x,  x,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32B32_UINT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32B32_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32B32_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32B32_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32B32_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R32G32B32_SFIXED)
-   SF( Y,  Y,  x,  x,  Y, 45,  Y,  x, 60, BRW_SURFACEFORMAT_R16G16B16A16_UNORM)
-   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16A16_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16A16_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16A16_UINT)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16A16_FLOAT)
-   SF( Y, 50,  x,  x,  Y,  Y,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32_FLOAT)
-   SF( Y, 70,  x,  x,  Y,  Y,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32_FLOAT_LD)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32_UINT)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R32_FLOAT_X8X24_TYPELESS)
-   SF( Y,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_X32_TYPELESS_G8X24_UINT)
-   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L32A32_FLOAT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R64_FLOAT)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R16G16B16X16_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R16G16B16X16_FLOAT)
-   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A32X32_FLOAT)
-   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L32X32_FLOAT)
-   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_I32X32_FLOAT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16A16_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16A16_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R32G32_SFIXED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R64_PASSTHRU)
-   SF( Y,  Y,  x,  Y,  Y,  Y,  Y,  x, 60, BRW_SURFACEFORMAT_B8G8R8A8_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB)
+   SF( Y, 50,  x,  x,  Y,  Y,  Y,  Y,  x, R32G32B32A32_FLOAT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, R32G32B32A32_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, R32G32B32A32_UINT)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32B32A32_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32B32A32_SNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R64G64_FLOAT)
+   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, R32G32B32X32_FLOAT)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32B32A32_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32B32A32_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R32G32B32A32_SFIXED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R64G64_PASSTHRU)
+   SF( Y, 50,  x,  x,  x,  x,  Y,  Y,  x, R32G32B32_FLOAT)
+   SF( Y,  x,  x,  x,  x,  x,  Y,  Y,  x, R32G32B32_SINT)
+   SF( Y,  x,  x,  x,  x,  x,  Y,  Y,  x, R32G32B32_UINT)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32B32_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32B32_SNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32B32_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32B32_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R32G32B32_SFIXED)
+   SF( Y,  Y,  x,  x,  Y, 45,  Y,  x, 60, R16G16B16A16_UNORM)
+   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, R16G16B16A16_SNORM)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R16G16B16A16_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R16G16B16A16_UINT)
+   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, R16G16B16A16_FLOAT)
+   SF( Y, 50,  x,  x,  Y,  Y,  Y,  Y,  x, R32G32_FLOAT)
+   SF( Y, 70,  x,  x,  Y,  Y,  Y,  Y,  x, R32G32_FLOAT_LD)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, R32G32_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, R32G32_UINT)
+   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, R32_FLOAT_X8X24_TYPELESS)
+   SF( Y,  x,  x,  x,  x,  x,  x,  x,  x, X32_TYPELESS_G8X24_UINT)
+   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, L32A32_FLOAT)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32_SNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R64_FLOAT)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, R16G16B16X16_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, R16G16B16X16_FLOAT)
+   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, A32X32_FLOAT)
+   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, L32X32_FLOAT)
+   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, I32X32_FLOAT)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16G16B16A16_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16G16B16A16_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R32G32_SFIXED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R64_PASSTHRU)
+   SF( Y,  Y,  x,  Y,  Y,  Y,  Y,  x, 60, B8G8R8A8_UNORM)
+   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x, B8G8R8A8_UNORM_SRGB)
 /* smpl filt shad CK  RT  AB  VB  SO  color */
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x, 60, BRW_SURFACEFORMAT_R10G10B10A2_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x, 60, BRW_SURFACEFORMAT_R10G10B10A2_UNORM_SRGB)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R10G10B10A2_UINT)
-   SF( Y,  Y,  x,  x,  x,  Y,  Y,  x,  x, BRW_SURFACEFORMAT_R10G10B10_SNORM_A2_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x, 60, BRW_SURFACEFORMAT_R8G8B8A8_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x, 60, BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8A8_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8A8_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8A8_UINT)
-   SF( Y,  Y,  x,  x,  Y, 45,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16_UNORM)
-   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16_UINT)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16_FLOAT)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x, 60, BRW_SURFACEFORMAT_B10G10R10A2_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x, 60, BRW_SURFACEFORMAT_B10G10R10A2_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, BRW_SURFACEFORMAT_R11G11B10_FLOAT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32_UINT)
-   SF( Y, 50,  Y,  x,  Y,  Y,  Y,  Y,  x, BRW_SURFACEFORMAT_R32_FLOAT)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R24_UNORM_X8_TYPELESS)
-   SF( Y,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_X24_TYPELESS_G8_UINT)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L16A16_UNORM)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_I24X8_UNORM)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L24X8_UNORM)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A24X8_UNORM)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_I32_FLOAT)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L32_FLOAT)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A32_FLOAT)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x, 60, BRW_SURFACEFORMAT_B8G8R8X8_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R8G8B8X8_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R8G8B8X8_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R9G9B9E5_SHAREDEXP)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_B10G10R10X2_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L16A16_FLOAT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32_SNORM)
+   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x, 60, R10G10B10A2_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x, 60, R10G10B10A2_UNORM_SRGB)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R10G10B10A2_UINT)
+   SF( Y,  Y,  x,  x,  x,  Y,  Y,  x,  x, R10G10B10_SNORM_A2_UNORM)
+   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x, 60, R8G8B8A8_UNORM)
+   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x, 60, R8G8B8A8_UNORM_SRGB)
+   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, R8G8B8A8_SNORM)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R8G8B8A8_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R8G8B8A8_UINT)
+   SF( Y,  Y,  x,  x,  Y, 45,  Y,  x,  x, R16G16_UNORM)
+   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, R16G16_SNORM)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R16G16_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R16G16_UINT)
+   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, R16G16_FLOAT)
+   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x, 60, B10G10R10A2_UNORM)
+   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x, 60, B10G10R10A2_UNORM_SRGB)
+   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, R11G11B10_FLOAT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, R32_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, R32_UINT)
+   SF( Y, 50,  Y,  x,  Y,  Y,  Y,  Y,  x, R32_FLOAT)
+   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, R24_UNORM_X8_TYPELESS)
+   SF( Y,  x,  x,  x,  x,  x,  x,  x,  x, X24_TYPELESS_G8_UINT)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, L16A16_UNORM)
+   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, I24X8_UNORM)
+   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, L24X8_UNORM)
+   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, A24X8_UNORM)
+   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, I32_FLOAT)
+   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, L32_FLOAT)
+   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, A32_FLOAT)
+   SF( Y,  Y,  x,  Y,  x,  x,  x,  x, 60, B8G8R8X8_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, B8G8R8X8_UNORM_SRGB)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, R8G8B8X8_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, R8G8B8X8_UNORM_SRGB)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, R9G9B9E5_SHAREDEXP)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, B10G10R10X2_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, L16A16_FLOAT)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32_SNORM)
 /* smpl filt shad CK  RT  AB  VB  SO  color */
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R10G10B10X2_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8A8_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8A8_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32_USCALED)
-   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B5G6R5_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B5G6R5_UNORM_SRGB)
-   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B5G5R5A1_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B5G5R5A1_UNORM_SRGB)
-   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B4G4R4A4_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B4G4R4A4_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8_UNORM)
-   SF( Y,  Y,  x,  Y,  Y, 60,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8_UINT)
-   SF( Y,  Y,  Y,  x,  Y, 45,  Y,  x, 70, BRW_SURFACEFORMAT_R16_UNORM)
-   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, BRW_SURFACEFORMAT_R16_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16_UINT)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, BRW_SURFACEFORMAT_R16_FLOAT)
-   SF(50, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A8P8_UNORM_PALETTE0)
-   SF(50, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A8P8_UNORM_PALETTE1)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_I16_UNORM)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L16_UNORM)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A16_UNORM)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L8A8_UNORM)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_I16_FLOAT)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L16_FLOAT)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A16_FLOAT)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L8A8_UNORM_SRGB)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R5G5_SNORM_B6_UNORM)
-   SF( x,  x,  x,  x,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B5G5R5X1_UNORM)
-   SF( x,  x,  x,  x,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B5G5R5X1_UNORM_SRGB)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R10G10B10X2_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8G8B8A8_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8G8B8A8_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16G16_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16G16_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32_USCALED)
+   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x, B5G6R5_UNORM)
+   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x, B5G6R5_UNORM_SRGB)
+   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x, B5G5R5A1_UNORM)
+   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x, B5G5R5A1_UNORM_SRGB)
+   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x, B4G4R4A4_UNORM)
+   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x, B4G4R4A4_UNORM_SRGB)
+   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, R8G8_UNORM)
+   SF( Y,  Y,  x,  Y,  Y, 60,  Y,  x,  x, R8G8_SNORM)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R8G8_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R8G8_UINT)
+   SF( Y,  Y,  Y,  x,  Y, 45,  Y,  x, 70, R16_UNORM)
+   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, R16_SNORM)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R16_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R16_UINT)
+   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, R16_FLOAT)
+   SF(50, 50,  x,  x,  x,  x,  x,  x,  x, A8P8_UNORM_PALETTE0)
+   SF(50, 50,  x,  x,  x,  x,  x,  x,  x, A8P8_UNORM_PALETTE1)
+   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, I16_UNORM)
+   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, L16_UNORM)
+   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, A16_UNORM)
+   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, L8A8_UNORM)
+   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, I16_FLOAT)
+   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, L16_FLOAT)
+   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, A16_FLOAT)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, L8A8_UNORM_SRGB)
+   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, R5G5_SNORM_B6_UNORM)
+   SF( x,  x,  x,  x,  Y,  Y,  x,  x,  x, B5G5R5X1_UNORM)
+   SF( x,  x,  x,  x,  Y,  Y,  x,  x,  x, B5G5R5X1_UNORM_SRGB)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8G8_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8G8_USCALED)
 /* smpl filt shad CK  RT  AB  VB  SO  color */
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16_USCALED)
-   SF(50, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_P8A8_UNORM_PALETTE0)
-   SF(50, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_P8A8_UNORM_PALETTE1)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A1B5G5R5_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A4B4G4R4_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L8A8_UINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L8A8_SINT)
-   SF( Y,  Y,  x, 45,  Y,  Y,  Y,  x,  x, BRW_SURFACEFORMAT_R8_UNORM)
-   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, BRW_SURFACEFORMAT_R8_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8_UINT)
-   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_A8_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_I8_UNORM)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L8_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_P4A4_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A4P4_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8_USCALED)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_P8_UNORM_PALETTE0)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L8_UNORM_SRGB)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_P8_UNORM_PALETTE1)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_P4A4_UNORM_PALETTE1)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A4P4_UNORM_PALETTE1)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_Y8_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L8_UINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L8_SINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_I8_UINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_I8_SINT)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_DXT1_RGB_SRGB)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R1_UINT)
-   SF( Y,  Y,  x,  Y,  Y,  x,  x,  x, 60, BRW_SURFACEFORMAT_YCRCB_NORMAL)
-   SF( Y,  Y,  x,  Y,  Y,  x,  x,  x, 60, BRW_SURFACEFORMAT_YCRCB_SWAPUVY)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_P2_UNORM_PALETTE0)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_P2_UNORM_PALETTE1)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC1_UNORM)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC2_UNORM)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC3_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC4_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC5_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC1_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC2_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC3_UNORM_SRGB)
-   SF( Y,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_MONO8)
-   SF( Y,  Y,  x,  x,  Y,  x,  x,  x, 60, BRW_SURFACEFORMAT_YCRCB_SWAPUV)
-   SF( Y,  Y,  x,  x,  Y,  x,  x,  x, 60, BRW_SURFACEFORMAT_YCRCB_SWAPY)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_DXT1_RGB)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16_USCALED)
+   SF(50, 50,  x,  x,  x,  x,  x,  x,  x, P8A8_UNORM_PALETTE0)
+   SF(50, 50,  x,  x,  x,  x,  x,  x,  x, P8A8_UNORM_PALETTE1)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, A1B5G5R5_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, A4B4G4R4_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, L8A8_UINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, L8A8_SINT)
+   SF( Y,  Y,  x, 45,  Y,  Y,  Y,  x,  x, R8_UNORM)
+   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, R8_SNORM)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R8_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R8_UINT)
+   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x, A8_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, I8_UNORM)
+   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, L8_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, P4A4_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, A4P4_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8_USCALED)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, P8_UNORM_PALETTE0)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, L8_UNORM_SRGB)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, P8_UNORM_PALETTE1)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, P4A4_UNORM_PALETTE1)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, A4P4_UNORM_PALETTE1)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, Y8_SNORM)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, L8_UINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, L8_SINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, I8_UINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, I8_SINT)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, DXT1_RGB_SRGB)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, R1_UINT)
+   SF( Y,  Y,  x,  Y,  Y,  x,  x,  x, 60, YCRCB_NORMAL)
+   SF( Y,  Y,  x,  Y,  Y,  x,  x,  x, 60, YCRCB_SWAPUVY)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, P2_UNORM_PALETTE0)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, P2_UNORM_PALETTE1)
+   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BC1_UNORM)
+   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BC2_UNORM)
+   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BC3_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BC4_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BC5_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BC1_UNORM_SRGB)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BC2_UNORM_SRGB)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BC3_UNORM_SRGB)
+   SF( Y,  x,  x,  x,  x,  x,  x,  x,  x, MONO8)
+   SF( Y,  Y,  x,  x,  Y,  x,  x,  x, 60, YCRCB_SWAPUV)
+   SF( Y,  Y,  x,  x,  Y,  x,  x,  x, 60, YCRCB_SWAPY)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, DXT1_RGB)
 /* smpl filt shad CK  RT  AB  VB  SO  color */
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_FXT1)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R64G64B64A64_FLOAT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R64G64B64_FLOAT)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC4_SNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC5_SNORM)
-   SF(50, 50,  x,  x,  x,  x, 60,  x,  x, BRW_SURFACEFORMAT_R16G16B16_FLOAT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16_USCALED)
-   SF(70, 70,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC6H_SF16)
-   SF(70, 70,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC7_UNORM)
-   SF(70, 70,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC7_UNORM_SRGB)
-   SF(70, 70,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC6H_UF16)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_PLANAR_420_8)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R8G8B8_UNORM_SRGB)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_ETC1_RGB8)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_ETC2_RGB8)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_EAC_R11)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_EAC_RG11)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_EAC_SIGNED_R11)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_EAC_SIGNED_RG11)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_ETC2_SRGB8)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R16G16B16_UINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R16G16B16_SINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R32_SFIXED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R10G10B10A2_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R10G10B10A2_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R10G10B10A2_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R10G10B10A2_SINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_B10G10R10A2_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_B10G10R10A2_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_B10G10R10A2_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_B10G10R10A2_UINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_B10G10R10A2_SINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R64G64B64A64_PASSTHRU)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R64G64B64_PASSTHRU)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_ETC2_RGB8_PTA)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_ETC2_SRGB8_PTA)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_ETC2_EAC_RGBA8)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_ETC2_EAC_SRGB8_A8)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R8G8B8_UINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R8G8B8_SINT)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, FXT1)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8G8B8_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8G8B8_SNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8G8B8_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8G8B8_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R64G64B64A64_FLOAT)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R64G64B64_FLOAT)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BC4_SNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BC5_SNORM)
+   SF(50, 50,  x,  x,  x,  x, 60,  x,  x, R16G16B16_FLOAT)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16G16B16_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16G16B16_SNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16G16B16_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16G16B16_USCALED)
+   SF(70, 70,  x,  x,  x,  x,  x,  x,  x, BC6H_SF16)
+   SF(70, 70,  x,  x,  x,  x,  x,  x,  x, BC7_UNORM)
+   SF(70, 70,  x,  x,  x,  x,  x,  x,  x, BC7_UNORM_SRGB)
+   SF(70, 70,  x,  x,  x,  x,  x,  x,  x, BC6H_UF16)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, PLANAR_420_8)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R8G8B8_UNORM_SRGB)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, ETC1_RGB8)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, ETC2_RGB8)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, EAC_R11)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, EAC_RG11)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, EAC_SIGNED_R11)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, EAC_SIGNED_RG11)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, ETC2_SRGB8)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R16G16B16_UINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R16G16B16_SINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R32_SFIXED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R10G10B10A2_SNORM)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R10G10B10A2_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R10G10B10A2_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R10G10B10A2_SINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, B10G10R10A2_SNORM)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, B10G10R10A2_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, B10G10R10A2_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, B10G10R10A2_UINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, B10G10R10A2_SINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R64G64B64A64_PASSTHRU)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R64G64B64_PASSTHRU)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, ETC2_RGB8_PTA)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, ETC2_SRGB8_PTA)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, ETC2_EAC_RGBA8)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, ETC2_EAC_SRGB8_A8)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R8G8B8_UINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R8G8B8_SINT)
 };
 #undef x
 #undef Y
 
+const char *
+brw_surface_format_name(unsigned format)
+{
+   return surface_formats[format].name;
+}
+
 uint32_t
 brw_format_for_mesa_format(mesa_format mesa_format)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 72b02a2cf0a..998d8c42770 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -40,9 +40,88 @@
 #define FILE_DEBUG_FLAG DEBUG_MIPTREE
 
 static unsigned int
+tr_mode_horizontal_texture_alignment(const struct brw_context *brw,
+                                     const struct intel_mipmap_tree *mt)
+{
+   const unsigned *align_yf, *align_ys;
+   const unsigned bpp = _mesa_get_format_bytes(mt->format) * 8;
+   unsigned ret_align, divisor;
+
+   /* Horizontal alignment tables for TRMODE_{YF,YS}. Value in below
+    * tables specifies the horizontal alignment requirement in elements
+    * for the surface. An element is defined as a pixel in uncompressed
+    * surface formats, and as a compression block in compressed surface
+    * formats. For MSFMT_DEPTH_STENCIL type multisampled surfaces, an
+    * element is a sample.
+    */
+   const unsigned align_1d_yf[] = {4096, 2048, 1024, 512, 256};
+   const unsigned align_1d_ys[] = {65536, 32768, 16384, 8192, 4096};
+   const unsigned align_2d_yf[] = {64, 64, 32, 32, 16};
+   const unsigned align_2d_ys[] = {256, 256, 128, 128, 64};
+   const unsigned align_3d_yf[] = {16, 8, 8, 8, 4};
+   const unsigned align_3d_ys[] = {64, 32, 32, 32, 16};
+   int i = 0;
+
+   /* Alignment computations below assume bpp >= 8 and a power of 2. */
+   assert (bpp >= 8 && bpp <= 128 && is_power_of_two(bpp));
+
+   switch(mt->target) {
+   case GL_TEXTURE_1D:
+   case GL_TEXTURE_1D_ARRAY:
+      align_yf = align_1d_yf;
+      align_ys = align_1d_ys;
+      break;
+   case GL_TEXTURE_2D:
+   case GL_TEXTURE_RECTANGLE:
+   case GL_TEXTURE_2D_ARRAY:
+   case GL_TEXTURE_CUBE_MAP:
+   case GL_TEXTURE_CUBE_MAP_ARRAY:
+   case GL_TEXTURE_2D_MULTISAMPLE:
+   case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
+      align_yf = align_2d_yf;
+      align_ys = align_2d_ys;
+      break;
+   case GL_TEXTURE_3D:
+      align_yf = align_3d_yf;
+      align_ys = align_3d_ys;
+      break;
+   default:
+      unreachable("not reached");
+   }
+
+   /* Compute array index. */
+   i = ffs(bpp/8) - 1;
+
+   ret_align = mt->tr_mode == INTEL_MIPTREE_TRMODE_YF ?
+               align_yf[i] : align_ys[i];
+
+   assert(is_power_of_two(mt->num_samples));
+
+   switch (mt->num_samples) {
+   case 2:
+   case 4:
+      divisor = 2;
+      break;
+   case 8:
+   case 16:
+      divisor = 4;
+      break;
+   default:
+      divisor = 1;
+      break;
+   }
+   return ret_align / divisor;
+}
+
+
+static unsigned int
 intel_horizontal_texture_alignment_unit(struct brw_context *brw,
-                                        struct intel_mipmap_tree *mt)
+                                        struct intel_mipmap_tree *mt,
+                                        uint32_t layout_flags)
 {
+   if (layout_flags & MIPTREE_LAYOUT_FORCE_HALIGN16)
+      return 16;
+
    /**
     * From the "Alignment Unit Size" section of various specs, namely:
     * - Gen3 Spec: "Memory Data Formats" Volume,         Section 1.20.1.4
@@ -88,18 +167,85 @@ intel_horizontal_texture_alignment_unit(struct brw_context *brw,
    if (mt->format == MESA_FORMAT_S_UINT8)
       return 8;
 
+   if (brw->gen >= 9 && mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE) {
+      uint32_t align = tr_mode_horizontal_texture_alignment(brw, mt);
+      /* XY_FAST_COPY_BLT doesn't support horizontal alignment < 32. */
+      return align < 32 ? 32 : align;
+   }
+
    if (brw->gen >= 7 && mt->format == MESA_FORMAT_Z_UNORM16)
       return 8;
 
-   if (brw->gen == 8 && mt->mcs_mt && mt->num_samples <= 1)
-      return 16;
-
    return 4;
 }
 
 static unsigned int
+tr_mode_vertical_texture_alignment(const struct brw_context *brw,
+                                   const struct intel_mipmap_tree *mt)
+{
+   const unsigned *align_yf, *align_ys;
+   const unsigned bpp = _mesa_get_format_bytes(mt->format) * 8;
+   unsigned ret_align, divisor;
+
+   /* Vertical alignment tables for TRMODE_YF and TRMODE_YS. */
+   const unsigned align_2d_yf[] = {64, 32, 32, 16, 16};
+   const unsigned align_2d_ys[] = {256, 128, 128, 64, 64};
+   const unsigned align_3d_yf[] = {16, 16, 16, 8, 8};
+   const unsigned align_3d_ys[] = {32, 32, 32, 16, 16};
+   int i = 0;
+
+   assert(brw->gen >= 9 &&
+          mt->target != GL_TEXTURE_1D &&
+          mt->target != GL_TEXTURE_1D_ARRAY);
+
+   /* Alignment computations below assume bpp >= 8 and a power of 2. */
+   assert (bpp >= 8 && bpp <= 128 && is_power_of_two(bpp)) ;
+
+   switch(mt->target) {
+   case GL_TEXTURE_2D:
+   case GL_TEXTURE_RECTANGLE:
+   case GL_TEXTURE_2D_ARRAY:
+   case GL_TEXTURE_CUBE_MAP:
+   case GL_TEXTURE_CUBE_MAP_ARRAY:
+   case GL_TEXTURE_2D_MULTISAMPLE:
+   case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
+      align_yf = align_2d_yf;
+      align_ys = align_2d_ys;
+      break;
+   case GL_TEXTURE_3D:
+      align_yf = align_3d_yf;
+      align_ys = align_3d_ys;
+      break;
+   default:
+      unreachable("not reached");
+   }
+
+   /* Compute array index. */
+   i = ffs(bpp / 8) - 1;
+
+   ret_align = mt->tr_mode == INTEL_MIPTREE_TRMODE_YF ?
+               align_yf[i] : align_ys[i];
+
+   assert(is_power_of_two(mt->num_samples));
+
+   switch (mt->num_samples) {
+   case 4:
+   case 8:
+      divisor = 2;
+      break;
+   case 16:
+      divisor = 4;
+      break;
+   default:
+      divisor = 1;
+      break;
+   }
+   return ret_align / divisor;
+}
+
+static unsigned int
 intel_vertical_texture_alignment_unit(struct brw_context *brw,
-                                      mesa_format format, bool multisampled)
+                                      const struct intel_mipmap_tree *mt)
 {
    /**
     * From the "Alignment Unit Size" section of various specs, namely:
@@ -124,23 +270,29 @@ intel_vertical_texture_alignment_unit(struct brw_context *brw,
     * Where "*" means either VALIGN_2 or VALIGN_4 depending on the setting of
     * the SURFACE_STATE "Surface Vertical Alignment" field.
     */
-   if (_mesa_is_format_compressed(format))
+   if (_mesa_is_format_compressed(mt->format))
       /* See comment above for the horizontal alignment */
       return brw->gen >= 9 ? 16 : 4;
 
-   if (format == MESA_FORMAT_S_UINT8)
+   if (mt->format == MESA_FORMAT_S_UINT8)
       return brw->gen >= 7 ? 8 : 4;
 
+   if (mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE) {
+      uint32_t align = tr_mode_vertical_texture_alignment(brw, mt);
+      /* XY_FAST_COPY_BLT doesn't support vertical alignment < 64 */
+      return align < 64 ? 64 : align;
+   }
+
    /* Broadwell only supports VALIGN of 4, 8, and 16.  The BSpec says 4
     * should always be used, except for stencil buffers, which should be 8.
     */
    if (brw->gen >= 8)
       return 4;
 
-   if (multisampled)
+   if (mt->num_samples > 1)
       return 4;
 
-   GLenum base_format = _mesa_get_format_base_format(format);
+   GLenum base_format = _mesa_get_format_base_format(mt->format);
 
    if (brw->gen >= 6 &&
        (base_format == GL_DEPTH_COMPONENT ||
@@ -161,7 +313,7 @@ intel_vertical_texture_alignment_unit(struct brw_context *brw,
        *
        *     VALIGN_4 is not supported for surface format R32G32B32_FLOAT.
        */
-      if (base_format == GL_YCBCR_MESA || format == MESA_FORMAT_RGB_FLOAT32)
+      if (base_format == GL_YCBCR_MESA || mt->format == MESA_FORMAT_RGB_FLOAT32)
          return 2;
 
       return 4;
@@ -348,9 +500,9 @@ align_cube(struct intel_mipmap_tree *mt)
       mt->total_height += 2;
 }
 
-static bool
-use_linear_1d_layout(struct brw_context *brw,
-                     struct intel_mipmap_tree *mt)
+bool
+gen9_use_linear_1d_layout(const struct brw_context *brw,
+                          const struct intel_mipmap_tree *mt)
 {
    /* On Gen9+ the mipmap levels of a 1D surface are all laid out in a
     * horizontal line. This isn't done for depth/stencil buffers however
@@ -375,7 +527,7 @@ brw_miptree_layout_texture_array(struct brw_context *brw,
 				 struct intel_mipmap_tree *mt)
 {
    unsigned height = mt->physical_height0;
-   bool layout_1d = use_linear_1d_layout(brw, mt);
+   bool layout_1d = gen9_use_linear_1d_layout(brw, mt);
    int physical_qpitch;
 
    if (layout_1d)
@@ -458,46 +610,111 @@ brw_miptree_layout_texture_3d(struct brw_context *brw,
    align_cube(mt);
 }
 
-void
-brw_miptree_layout(struct brw_context *brw, struct intel_mipmap_tree *mt)
+/**
+ * \brief Helper function for intel_miptree_create().
+ */
+static uint32_t
+brw_miptree_choose_tiling(struct brw_context *brw,
+                          enum intel_miptree_tiling_mode requested,
+                          const struct intel_mipmap_tree *mt)
 {
-   bool multisampled = mt->num_samples > 1;
-   bool gen6_hiz_or_stencil = false;
+   if (mt->format == MESA_FORMAT_S_UINT8) {
+      /* The stencil buffer is W tiled. However, we request from the kernel a
+       * non-tiled buffer because the GTT is incapable of W fencing.
+       */
+      return I915_TILING_NONE;
+   }
 
-   if (brw->gen == 6 && mt->array_layout == ALL_SLICES_AT_EACH_LOD) {
-      const GLenum base_format = _mesa_get_format_base_format(mt->format);
-      gen6_hiz_or_stencil = _mesa_is_depth_or_stencil_format(base_format);
+   /* Some usages may want only one type of tiling, like depth miptrees (Y
+    * tiled), or temporary BOs for uploading data once (linear).
+    */
+   switch (requested) {
+   case INTEL_MIPTREE_TILING_ANY:
+      break;
+   case INTEL_MIPTREE_TILING_Y:
+      return I915_TILING_Y;
+   case INTEL_MIPTREE_TILING_NONE:
+      return I915_TILING_NONE;
    }
 
-   if (gen6_hiz_or_stencil) {
-      /* On gen6, we use ALL_SLICES_AT_EACH_LOD for stencil/hiz because the
-       * hardware doesn't support multiple mip levels on stencil/hiz.
+   if (mt->num_samples > 1) {
+      /* From p82 of the Sandy Bridge PRM, dw3[1] of SURFACE_STATE ("Tiled
+       * Surface"):
        *
-       * PRM Vol 2, Part 1, 7.5.3 Hierarchical Depth Buffer:
-       * "The hierarchical depth buffer does not support the LOD field"
+       *   [DevSNB+]: For multi-sample render targets, this field must be
+       *   1. MSRTs can only be tiled.
        *
-       * PRM Vol 2, Part 1, 7.5.4.1 Separate Stencil Buffer:
-       * "The stencil depth buffer does not support the LOD field"
+       * Our usual reason for preferring X tiling (fast blits using the
+       * blitting engine) doesn't apply to MSAA, since we'll generally be
+       * downsampling or upsampling when blitting between the MSAA buffer
+       * and another buffer, and the blitting engine doesn't support that.
+       * So use Y tiling, since it makes better use of the cache.
        */
-      if (mt->format == MESA_FORMAT_S_UINT8) {
-         /* Stencil uses W tiling, so we force W tiling alignment for the
-          * ALL_SLICES_AT_EACH_LOD miptree layout.
-          */
-         mt->align_w = 64;
-         mt->align_h = 64;
-      } else {
-         /* Depth uses Y tiling, so we force need Y tiling alignment for the
-          * ALL_SLICES_AT_EACH_LOD miptree layout.
-          */
-         mt->align_w = 128 / mt->cpp;
-         mt->align_h = 32;
-      }
-   } else {
-      mt->align_w = intel_horizontal_texture_alignment_unit(brw, mt);
-      mt->align_h =
-         intel_vertical_texture_alignment_unit(brw, mt->format, multisampled);
+      return I915_TILING_Y;
+   }
+
+   GLenum base_format = _mesa_get_format_base_format(mt->format);
+   if (base_format == GL_DEPTH_COMPONENT ||
+       base_format == GL_DEPTH_STENCIL_EXT)
+      return I915_TILING_Y;
+
+   /* 1D textures (and 1D array textures) don't get any benefit from tiling,
+    * in fact it leads to a less efficient use of memory space and bandwidth
+    * due to tile alignment.
+    */
+   if (mt->logical_height0 == 1)
+      return I915_TILING_NONE;
+
+   int minimum_pitch = mt->total_width * mt->cpp;
+
+   /* If the width is much smaller than a tile, don't bother tiling. */
+   if (minimum_pitch < 64)
+      return I915_TILING_NONE;
+
+   if (ALIGN(minimum_pitch, 512) >= 32768 ||
+       mt->total_width >= 32768 || mt->total_height >= 32768) {
+      perf_debug("%dx%d miptree too large to blit, falling back to untiled",
+                 mt->total_width, mt->total_height);
+      return I915_TILING_NONE;
+   }
+
+   /* Pre-gen6 doesn't have BLORP to handle Y-tiling, so use X-tiling. */
+   if (brw->gen < 6)
+      return I915_TILING_X;
+
+   /* From the Sandybridge PRM, Volume 1, Part 2, page 32:
+    * "NOTE: 128BPE Format Color Buffer ( render target ) MUST be either TileX
+    *  or Linear."
+    * 128 bits per pixel translates to 16 bytes per pixel. This is necessary
+    * all the way back to 965, but is permitted on Gen7+.
+    */
+   if (brw->gen < 7 && mt->cpp >= 16)
+      return I915_TILING_X;
+
+   /* From the Ivy Bridge PRM, Vol4 Part1 2.12.2.1 (SURFACE_STATE for most
+    * messages), on p64, under the heading "Surface Vertical Alignment":
+    *
+    *     This field must be set to VALIGN_4 for all tiled Y Render Target
+    *     surfaces.
+    *
+    * So if the surface is renderable and uses a vertical alignment of 2,
+    * force it to be X tiled.  This is somewhat conservative (it's possible
+    * that the client won't ever render to this surface), but it's difficult
+    * to know that ahead of time.  And besides, since we use a vertical
+    * alignment of 4 as often as we can, this shouldn't happen very often.
+    */
+   if (brw->gen == 7 && mt->align_h == 2 &&
+       brw->format_supported_as_render_target[mt->format]) {
+      return I915_TILING_X;
    }
 
+   return I915_TILING_Y | I915_TILING_X;
+}
+
+static void
+intel_miptree_set_total_width_height(struct brw_context *brw,
+                                     struct intel_mipmap_tree *mt)
+{
    switch (mt->target) {
    case GL_TEXTURE_CUBE_MAP:
       if (brw->gen == 4) {
@@ -532,7 +749,7 @@ brw_miptree_layout(struct brw_context *brw, struct intel_mipmap_tree *mt)
          break;
       case INTEL_MSAA_LAYOUT_NONE:
       case INTEL_MSAA_LAYOUT_IMS:
-         if (use_linear_1d_layout(brw, mt))
+         if (gen9_use_linear_1d_layout(brw, mt))
             gen9_miptree_layout_1d(mt);
          else
             brw_miptree_layout_2d(mt);
@@ -540,8 +757,62 @@ brw_miptree_layout(struct brw_context *brw, struct intel_mipmap_tree *mt)
       }
       break;
    }
+
    DBG("%s: %dx%dx%d\n", __func__,
        mt->total_width, mt->total_height, mt->cpp);
+}
+
+void
+brw_miptree_layout(struct brw_context *brw,
+                   struct intel_mipmap_tree *mt,
+                   enum intel_miptree_tiling_mode requested,
+                   uint32_t layout_flags)
+{
+   bool gen6_hiz_or_stencil = false;
+
+   mt->tr_mode = INTEL_MIPTREE_TRMODE_NONE;
+
+   if (brw->gen == 6 && mt->array_layout == ALL_SLICES_AT_EACH_LOD) {
+      const GLenum base_format = _mesa_get_format_base_format(mt->format);
+      gen6_hiz_or_stencil = _mesa_is_depth_or_stencil_format(base_format);
+   }
+
+   if (gen6_hiz_or_stencil) {
+      /* On gen6, we use ALL_SLICES_AT_EACH_LOD for stencil/hiz because the
+       * hardware doesn't support multiple mip levels on stencil/hiz.
+       *
+       * PRM Vol 2, Part 1, 7.5.3 Hierarchical Depth Buffer:
+       * "The hierarchical depth buffer does not support the LOD field"
+       *
+       * PRM Vol 2, Part 1, 7.5.4.1 Separate Stencil Buffer:
+       * "The stencil depth buffer does not support the LOD field"
+       */
+      if (mt->format == MESA_FORMAT_S_UINT8) {
+         /* Stencil uses W tiling, so we force W tiling alignment for the
+          * ALL_SLICES_AT_EACH_LOD miptree layout.
+          */
+         mt->align_w = 64;
+         mt->align_h = 64;
+         assert((layout_flags & MIPTREE_LAYOUT_FORCE_HALIGN16) == 0);
+      } else {
+         /* Depth uses Y tiling, so we force need Y tiling alignment for the
+          * ALL_SLICES_AT_EACH_LOD miptree layout.
+          */
+         mt->align_w = 128 / mt->cpp;
+         mt->align_h = 32;
+      }
+   } else {
+      mt->align_w =
+         intel_horizontal_texture_alignment_unit(brw, mt, layout_flags);
+      mt->align_h = intel_vertical_texture_alignment_unit(brw, mt);
+   }
+
+   intel_miptree_set_total_width_height(brw, mt);
+
+   if (!mt->total_width || !mt->total_height) {
+      intel_miptree_release(&mt);
+      return;
+   }
 
    /* On Gen9+ the alignment values are expressed in multiples of the block
     * size
@@ -552,5 +823,8 @@ brw_miptree_layout(struct brw_context *brw, struct intel_mipmap_tree *mt)
       mt->align_w /= i;
       mt->align_h /= j;
    }
+
+   if ((layout_flags & MIPTREE_LAYOUT_FOR_BO) == 0)
+      mt->tiling = brw_miptree_choose_tiling(brw, requested, mt);
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_util.h b/src/mesa/drivers/dri/i965/brw_util.h
index b548d234538..04e4e944118 100644
--- a/src/mesa/drivers/dri/i965/brw_util.h
+++ b/src/mesa/drivers/dri/i965/brw_util.h
@@ -35,9 +35,47 @@
 
 #include "main/mtypes.h"
 #include "main/imports.h"
+#include "brw_context.h"
 
 extern GLuint brw_translate_blend_factor( GLenum factor );
 extern GLuint brw_translate_blend_equation( GLenum mode );
 extern GLenum brw_fix_xRGB_alpha(GLenum function);
 
+static inline uint32_t
+brw_get_line_width(struct brw_context *brw)
+{
+   /* From the OpenGL 4.4 spec:
+    *
+    * "The actual width of non-antialiased lines is determined by rounding
+    * the supplied width to the nearest integer, then clamping it to the
+    * implementation-dependent maximum non-antialiased line width."
+    */
+   float line_width =
+      CLAMP(!brw->ctx.Multisample._Enabled && !brw->ctx.Line.SmoothFlag
+            ? roundf(brw->ctx.Line.Width) : brw->ctx.Line.Width,
+            0.0, brw->ctx.Const.MaxLineWidth);
+   uint32_t line_width_u3_7 = U_FIXED(line_width, 7);
+
+   /* Line width of 0 is not allowed when MSAA enabled */
+   if (brw->ctx.Multisample._Enabled) {
+      if (line_width_u3_7 == 0)
+         line_width_u3_7 = 1;
+   } else if (brw->ctx.Line.SmoothFlag && line_width < 1.5) {
+      /* For 1 pixel line thickness or less, the general
+       * anti-aliasing algorithm gives up, and a garbage line is
+       * generated.  Setting a Line Width of 0.0 specifies the
+       * rasterization of the "thinnest" (one-pixel-wide),
+       * non-antialiased lines.
+       *
+       * Lines rendered with zero Line Width are rasterized using
+       * Grid Intersection Quantization rules as specified by
+       * bspec section 6.3.12.1 Zero-Width (Cosmetic) Line
+       * Rasterization.
+       */
+      line_width_u3_7 = 0;
+   }
+
+   return line_width_u3_7;
+}
+
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 2841d983ad5..a5c686ceaaf 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -35,6 +35,7 @@ extern "C" {
 #include "program/prog_print.h"
 #include "program/prog_parameter.h"
 }
+#include "main/context.h"
 
 #define MAX_INSTRUCTION (1 << 30)
 
@@ -1676,20 +1677,16 @@ vec4_visitor::emit_shader_time_end()
     */
    emit(ADD(diff, src_reg(diff), src_reg(-2u)));
 
-   emit_shader_time_write(st_base, src_reg(diff));
-   emit_shader_time_write(st_written, src_reg(1u));
+   emit_shader_time_write(0, src_reg(diff));
+   emit_shader_time_write(1, src_reg(1u));
    emit(BRW_OPCODE_ELSE);
-   emit_shader_time_write(st_reset, src_reg(1u));
+   emit_shader_time_write(2, src_reg(1u));
    emit(BRW_OPCODE_ENDIF);
 }
 
 void
-vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
-                                     src_reg value)
+vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
 {
-   int shader_time_index =
-      brw_get_shader_time_index(brw, shader_prog, prog, type);
-
    dst_reg dst =
       dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));
 
@@ -1698,7 +1695,8 @@ vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
    time.reg_offset++;
 
    offset.type = BRW_REGISTER_TYPE_UD;
-   emit(MOV(offset, src_reg(shader_time_index * SHADER_TIME_STRIDE)));
+   int index = shader_time_index * 3 + shader_time_subindex;
+   emit(MOV(offset, src_reg(index * SHADER_TIME_STRIDE)));
 
    time.type = BRW_REGISTER_TYPE_UD;
    emit(MOV(time, src_reg(value)));
@@ -1709,11 +1707,11 @@ vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 }
 
 bool
-vec4_visitor::run()
+vec4_visitor::run(gl_clip_plane *clip_planes)
 {
    sanity_param_count = prog->Parameters->NumParameters;
 
-   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+   if (shader_time_index >= 0)
       emit_shader_time_begin();
 
    assign_binding_table_offsets();
@@ -1731,7 +1729,7 @@ vec4_visitor::run()
    base_ir = NULL;
 
    if (key->userclip_active && !prog->UsesClipDistanceOut)
-      setup_uniform_clipplane_values();
+      setup_uniform_clipplane_values(clip_planes);
 
    emit_thread_end();
 
@@ -1768,7 +1766,7 @@ vec4_visitor::run()
          snprintf(filename, 64, "%s-%04d-%02d-%02d-" #pass,            \
                   stage_abbrev, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
                                                                        \
-         backend_visitor::dump_instructions(filename);                 \
+         backend_shader::dump_instructions(filename);                  \
       }                                                                \
                                                                        \
       progress = progress || this_progress;                            \
@@ -1781,7 +1779,7 @@ vec4_visitor::run()
       snprintf(filename, 64, "%s-%04d-00-start",
                stage_abbrev, shader_prog ? shader_prog->Name : 0);
 
-      backend_visitor::dump_instructions(filename);
+      backend_shader::dump_instructions(filename);
    }
 
    bool progress;
@@ -1868,8 +1866,6 @@ brw_vs_emit(struct brw_context *brw,
    bool start_busy = false;
    double start_time = 0;
    const unsigned *assembly = NULL;
-   bool use_nir =
-      brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions != NULL;
 
    if (unlikely(brw->perf_debug)) {
       start_busy = (brw->batch.last_bo &&
@@ -1881,22 +1877,33 @@ brw_vs_emit(struct brw_context *brw,
    if (prog)
       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX];
 
+   int st_index = -1;
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      st_index = brw_get_shader_time_index(brw, prog, &c->vp->program.Base,
+                                           ST_VS);
+
    if (unlikely(INTEL_DEBUG & DEBUG_VS))
       brw_dump_ir("vertex", prog, &shader->base, &c->vp->program.Base);
 
-   if (use_nir && !c->vp->program.Base.nir) {
-      /* Normally we generate NIR in LinkShader() or ProgramStringNotify(), but
-       * Mesa's fixed-function vertex program handling doesn't notify the driver
-       * at all.  Just do it here, at the last minute, even though it's lame.
-       */
-      assert(c->vp->program.Base.Id == 0 && prog == NULL);
-      c->vp->program.Base.nir =
-         brw_create_nir(brw, NULL, &c->vp->program.Base, MESA_SHADER_VERTEX);
-   }
+   if (brw->intelScreen->compiler->scalar_vs) {
+      if (!c->vp->program.Base.nir) {
+         /* Normally we generate NIR in LinkShader() or
+          * ProgramStringNotify(), but Mesa's fixed-function vertex program
+          * handling doesn't notify the driver at all.  Just do it here, at
+          * the last minute, even though it's lame.
+          */
+         assert(c->vp->program.Base.Id == 0 && prog == NULL);
+         c->vp->program.Base.nir =
+            brw_create_nir(brw, NULL, &c->vp->program.Base, MESA_SHADER_VERTEX);
+      }
 
-   if (brw->scalar_vs && (prog || use_nir)) {
-      fs_visitor v(brw, mem_ctx, &c->key, prog_data, prog, &c->vp->program, 8);
-      if (!v.run_vs()) {
+      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
+
+      fs_visitor v(brw->intelScreen->compiler, brw,
+                   mem_ctx, MESA_SHADER_VERTEX, &c->key,
+                   &prog_data->base.base, prog, &c->vp->program.Base,
+                   8, st_index);
+      if (!v.run_vs(brw_select_clip_planes(&brw->ctx))) {
          if (prog) {
             prog->LinkStatus = false;
             ralloc_strcat(&prog->InfoLog, v.fail_msg);
@@ -1908,7 +1915,8 @@ brw_vs_emit(struct brw_context *brw,
          return NULL;
       }
 
-      fs_generator g(brw, mem_ctx, (void *) &c->key, &prog_data->base.base,
+      fs_generator g(brw->intelScreen->compiler, brw,
+                     mem_ctx, (void *) &c->key, &prog_data->base.base,
                      &c->vp->program.Base, v.promoted_constants,
                      v.runtime_check_aads_emit, "VS");
       if (INTEL_DEBUG & DEBUG_VS) {
@@ -1926,13 +1934,16 @@ brw_vs_emit(struct brw_context *brw,
       g.generate_code(v.cfg, 8);
       assembly = g.get_assembly(final_assembly_size);
 
-      prog_data->base.simd8 = true;
       c->base.last_scratch = v.last_scratch;
    }
 
    if (!assembly) {
-      vec4_vs_visitor v(brw, c, prog_data, prog, mem_ctx);
-      if (!v.run()) {
+      prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
+
+      vec4_vs_visitor v(brw->intelScreen->compiler,
+                        c, prog_data, prog, mem_ctx, st_index,
+                        !_mesa_is_gles3(&brw->ctx));
+      if (!v.run(brw_select_clip_planes(&brw->ctx))) {
          if (prog) {
             prog->LinkStatus = false;
             ralloc_strcat(&prog->InfoLog, v.fail_msg);
@@ -1944,7 +1955,8 @@ brw_vs_emit(struct brw_context *brw,
          return NULL;
       }
 
-      vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base,
+      vec4_generator g(brw->intelScreen->compiler, brw,
+                       prog, &c->vp->program.Base, &prog_data->base,
                        mem_ctx, INTEL_DEBUG & DEBUG_VS, "vertex", "VS");
       assembly = g.generate_assembly(v.cfg, final_assembly_size);
    }
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 628c6313cc9..2ac16932189 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -73,10 +73,10 @@ class vec4_live_variables;
  * Translates either GLSL IR or Mesa IR (for ARB_vertex_program and
  * fixed-function) into VS IR.
  */
-class vec4_visitor : public backend_visitor
+class vec4_visitor : public backend_shader, public ir_visitor
 {
 public:
-   vec4_visitor(struct brw_context *brw,
+   vec4_visitor(const struct brw_compiler *compiler,
                 struct brw_vec4_compile *c,
                 struct gl_program *prog,
                 const struct brw_vue_prog_key *key,
@@ -85,9 +85,7 @@ public:
                 gl_shader_stage stage,
 		void *mem_ctx,
                 bool no_spills,
-                shader_time_shader_type st_base,
-                shader_time_shader_type st_written,
-                shader_time_shader_type st_reset);
+                int shader_time_index);
    ~vec4_visitor();
 
    dst_reg dst_null_f()
@@ -160,6 +158,7 @@ public:
    virtual void visit(ir_if *);
    virtual void visit(ir_emit_vertex *);
    virtual void visit(ir_end_primitive *);
+   virtual void visit(ir_barrier *);
    /*@}*/
 
    src_reg result;
@@ -178,10 +177,10 @@ public:
 
    struct hash_table *variable_ht;
 
-   bool run(void);
+   bool run(gl_clip_plane *clip_planes);
    void fail(const char *msg, ...);
 
-   void setup_uniform_clipplane_values();
+   void setup_uniform_clipplane_values(gl_clip_plane *clip_planes);
    void setup_uniform_values(ir_variable *ir);
    void setup_builtin_uniform_values(ir_variable *ir);
    int setup_uniforms(int payload_reg);
@@ -344,8 +343,7 @@ public:
 
    void emit_shader_time_begin();
    void emit_shader_time_end();
-   void emit_shader_time_write(enum shader_time_shader_type type,
-                               src_reg value);
+   void emit_shader_time_write(int shader_time_subindex, src_reg value);
 
    void emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
                             dst_reg dst, src_reg offset, src_reg src0,
@@ -412,9 +410,7 @@ private:
     */
    const bool no_spills;
 
-   const shader_time_shader_type st_base;
-   const shader_time_shader_type st_written;
-   const shader_time_shader_type st_reset;
+   int shader_time_index;
 };
 
 
@@ -426,7 +422,7 @@ private:
 class vec4_generator
 {
 public:
-   vec4_generator(struct brw_context *brw,
+   vec4_generator(const struct brw_compiler *compiler, void *log_data,
                   struct gl_shader_program *shader_prog,
                   struct gl_program *prog,
                   struct brw_vue_prog_data *prog_data,
@@ -508,7 +504,9 @@ private:
                                          struct brw_reg dst);
    void generate_unpack_flags(struct brw_reg dst);
 
-   struct brw_context *brw;
+   const struct brw_compiler *compiler;
+   void *log_data; /* Passed to compiler->*_log functions */
+
    const struct brw_device_info *devinfo;
 
    struct brw_codegen *p;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
index 9147c3cbb79..c9fe0cebf27 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
@@ -114,8 +114,16 @@ instructions_match(vec4_instruction *a, vec4_instruction *b)
 {
    return a->opcode == b->opcode &&
           a->saturate == b->saturate &&
+          a->predicate == b->predicate &&
+          a->predicate_inverse == b->predicate_inverse &&
           a->conditional_mod == b->conditional_mod &&
+          a->flag_subreg == b->flag_subreg &&
           a->dst.type == b->dst.type &&
+          a->offset == b->offset &&
+          a->mlen == b->mlen &&
+          a->base_mrf == b->base_mrf &&
+          a->header_size == b->header_size &&
+          a->shadow_compare == b->shadow_compare &&
           a->dst.writemask == b->dst.writemask &&
           a->force_writemask_all == b->force_writemask_all &&
           a->regs_written == b->regs_written &&
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index ef77b8df051..d2de2f0be25 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -134,7 +134,8 @@ vec4_instruction::get_src(const struct brw_vue_prog_data *prog_data, int i)
    return brw_reg;
 }
 
-vec4_generator::vec4_generator(struct brw_context *brw,
+vec4_generator::vec4_generator(const struct brw_compiler *compiler,
+                               void *log_data,
                                struct gl_shader_program *shader_prog,
                                struct gl_program *prog,
                                struct brw_vue_prog_data *prog_data,
@@ -142,13 +143,13 @@ vec4_generator::vec4_generator(struct brw_context *brw,
                                bool debug_flag,
                                const char *stage_name,
                                const char *stage_abbrev)
-   : brw(brw), devinfo(brw->intelScreen->devinfo),
+   : compiler(compiler), log_data(log_data), devinfo(compiler->devinfo),
      shader_prog(shader_prog), prog(prog), prog_data(prog_data),
      mem_ctx(mem_ctx), stage_name(stage_name), stage_abbrev(stage_abbrev),
      debug_flag(debug_flag)
 {
    p = rzalloc(mem_ctx, struct brw_codegen);
-   brw_init_codegen(brw->intelScreen->devinfo, p, mem_ctx);
+   brw_init_codegen(devinfo, p, mem_ctx);
 }
 
 vec4_generator::~vec4_generator()
@@ -398,30 +399,25 @@ vec4_generator::generate_tex(vec4_instruction *inst,
       brw_mark_surface_used(&prog_data->base, sampler + base_binding_table_index);
    } else {
       /* Non-constant sampler index. */
-      /* Note: this clobbers `dst` as a temporary before emitting the send */
 
       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
-      struct brw_reg temp = vec1(retype(dst, BRW_REGISTER_TYPE_UD));
-
       struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
 
       brw_push_insn_state(p);
       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
       brw_set_default_access_mode(p, BRW_ALIGN_1);
 
-      /* Some care required: `sampler` and `temp` may alias:
-       *    addr = sampler & 0xff
-       *    temp = (sampler << 8) & 0xf00
-       *    addr = addr | temp
-       */
-      brw_ADD(p, addr, sampler_reg, brw_imm_ud(base_binding_table_index));
-      brw_SHL(p, temp, sampler_reg, brw_imm_ud(8u));
-      brw_AND(p, temp, temp, brw_imm_ud(0x0f00));
-      brw_AND(p, addr, addr, brw_imm_ud(0x0ff));
-      brw_OR(p, addr, addr, temp);
+      /* addr = ((sampler * 0x101) + base_binding_table_index) & 0xfff */
+      brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
+      if (base_binding_table_index)
+         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
+      brw_AND(p, addr, addr, brw_imm_ud(0xfff));
 
       brw_pop_insn_state(p);
 
+      if (inst->base_mrf != -1)
+         gen6_resolve_implied_move(p, &src, inst->base_mrf);
+
       /* dst = send(offset, a0.0 | <descriptor>) */
       brw_inst *insn = brw_send_indirect_message(
          p, BRW_SFID_SAMPLER, dst, src, addr);
@@ -1631,16 +1627,11 @@ vec4_generator::generate_code(const cfg_t *cfg)
       ralloc_free(annotation.ann);
    }
 
-   static GLuint msg_id = 0;
-   _mesa_gl_debug(&brw->ctx, &msg_id,
-                  MESA_DEBUG_SOURCE_SHADER_COMPILER,
-                  MESA_DEBUG_TYPE_OTHER,
-                  MESA_DEBUG_SEVERITY_NOTIFICATION,
-                  "%s vec4 shader: %d inst, %d loops, "
-                  "compacted %d to %d bytes.\n",
-                  stage_abbrev,
-                  before_size / 16, loop_count,
-                  before_size, after_size);
+   compiler->shader_debug_log(log_data,
+                              "%s vec4 shader: %d inst, %d loops, "
+                              "compacted %d to %d bytes.\n",
+                              stage_abbrev, before_size / 16, loop_count,
+                              before_size, after_size);
 }
 
 const unsigned *
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 363e30e34e4..69bcf5afc51 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -34,15 +34,15 @@ const unsigned MAX_GS_INPUT_VERTICES = 6;
 
 namespace brw {
 
-vec4_gs_visitor::vec4_gs_visitor(struct brw_context *brw,
+vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler,
                                  struct brw_gs_compile *c,
                                  struct gl_shader_program *prog,
                                  void *mem_ctx,
-                                 bool no_spills)
-   : vec4_visitor(brw, &c->base, &c->gp->program.Base, &c->key.base,
+                                 bool no_spills,
+                                 int shader_time_index)
+   : vec4_visitor(compiler, &c->base, &c->gp->program.Base, &c->key.base,
                   &c->prog_data.base, prog, MESA_SHADER_GEOMETRY, mem_ctx,
-                  no_spills,
-                  ST_GS, ST_GS_WRITTEN, ST_GS_RESET),
+                  no_spills, shader_time_index),
      c(c)
 {
 }
@@ -106,7 +106,7 @@ vec4_gs_visitor::setup_payload()
     * to be interleaved, so one register contains two attribute slots.
     */
    int attributes_per_reg =
-      c->prog_data.dispatch_mode == GEN7_GS_DISPATCH_MODE_DUAL_OBJECT ? 1 : 2;
+      c->prog_data.base.dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
 
    /* If a geometry shader tries to read from an input that wasn't written by
     * the vertex shader, that produces undefined results, but it shouldn't
@@ -629,7 +629,8 @@ generate_assembly(struct brw_context *brw,
                   const cfg_t *cfg,
                   unsigned *final_assembly_size)
 {
-   vec4_generator g(brw, shader_prog, prog, prog_data, mem_ctx,
+   vec4_generator g(brw->intelScreen->compiler, brw,
+                    shader_prog, prog, prog_data, mem_ctx,
                     INTEL_DEBUG & DEBUG_GS, "geometry", "GS");
    return g.generate_assembly(cfg, final_assembly_size);
 }
@@ -648,6 +649,10 @@ brw_gs_emit(struct brw_context *brw,
       brw_dump_ir("geometry", prog, &shader->base, NULL);
    }
 
+   int st_index = -1;
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      st_index = brw_get_shader_time_index(brw, prog, NULL, ST_GS);
+
    if (brw->gen >= 7) {
       /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
        * so without spilling. If the GS invocations count > 1, then we can't use
@@ -655,10 +660,11 @@ brw_gs_emit(struct brw_context *brw,
        */
       if (c->prog_data.invocations <= 1 &&
           likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
-         c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_DUAL_OBJECT;
+         c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
 
-         vec4_gs_visitor v(brw, c, prog, mem_ctx, true /* no_spills */);
-         if (v.run()) {
+         vec4_gs_visitor v(brw->intelScreen->compiler,
+                           c, prog, mem_ctx, true /* no_spills */, st_index);
+         if (v.run(NULL /* clip planes */)) {
             return generate_assembly(brw, prog, &c->gp->program.Base,
                                      &c->prog_data.base, mem_ctx, v.cfg,
                                      final_assembly_size);
@@ -690,19 +696,23 @@ brw_gs_emit(struct brw_context *brw,
     * SINGLE mode.
     */
    if (c->prog_data.invocations <= 1 || brw->gen < 7)
-      c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_SINGLE;
+      c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X1_SINGLE;
    else
-      c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_DUAL_INSTANCE;
+      c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_INSTANCE;
 
    vec4_gs_visitor *gs = NULL;
    const unsigned *ret = NULL;
 
    if (brw->gen >= 7)
-      gs = new vec4_gs_visitor(brw, c, prog, mem_ctx, false /* no_spills */);
+      gs = new vec4_gs_visitor(brw->intelScreen->compiler,
+                               c, prog, mem_ctx, false /* no_spills */,
+                               st_index);
    else
-      gs = new gen6_gs_visitor(brw, c, prog, mem_ctx, false /* no_spills */);
+      gs = new gen6_gs_visitor(brw->intelScreen->compiler,
+                               c, prog, mem_ctx, false /* no_spills */,
+                               st_index);
 
-   if (!gs->run()) {
+   if (!gs->run(NULL /* clip planes */)) {
       prog->LinkStatus = false;
       ralloc_strcat(&prog->InfoLog, gs->fail_msg);
    } else {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
index bcb5a2bcfc1..e693c56b58f 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
@@ -68,11 +68,12 @@ namespace brw {
 class vec4_gs_visitor : public vec4_visitor
 {
 public:
-   vec4_gs_visitor(struct brw_context *brw,
+   vec4_gs_visitor(const struct brw_compiler *compiler,
                    struct brw_gs_compile *c,
                    struct gl_shader_program *prog,
                    void *mem_ctx,
-                   bool no_spills);
+                   bool no_spills,
+                   int shader_time_index);
 
 protected:
    virtual dst_reg *make_reg_for_system_value(ir_variable *ir);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
index 5368a75bc0f..555c42e2f24 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
@@ -191,7 +191,6 @@ vec4_visitor::setup_payload_interference(struct ra_graph *g,
 bool
 vec4_visitor::reg_allocate()
 {
-   struct brw_compiler *compiler = brw->intelScreen->compiler;
    unsigned int hw_reg_mapping[alloc.count];
    int payload_reg_count = this->first_non_payload_grf;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index e51c140c0f2..236fa51f92c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -684,9 +684,12 @@ vec4_visitor::setup_uniform_values(ir_variable *ir)
     * order we'd walk the type, so walk the list of storage and find anything
     * with our name, or the prefix of a component that starts with our name.
     */
-   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
+   for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 
+      if (storage->builtin)
+         continue;
+
       if (strncmp(ir->name, storage->name, namelen) != 0 ||
           (storage->name[namelen] != 0 &&
            storage->name[namelen] != '.' &&
@@ -718,10 +721,8 @@ vec4_visitor::setup_uniform_values(ir_variable *ir)
 }
 
 void
-vec4_visitor::setup_uniform_clipplane_values()
+vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
 {
-   gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
-
    for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
       assert(this->uniforms < uniform_array_size);
       this->uniform_vector_size[this->uniforms] = 4;
@@ -2461,11 +2462,27 @@ vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler
       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
                                     dst_reg(this, glsl_type::uvec4_type));
    inst->base_mrf = 2;
-   inst->mlen = 1;
    inst->src[1] = sampler;
 
+   int param_base;
+
+   if (devinfo->gen >= 9) {
+      /* Gen9+ needs a message header in order to use SIMD4x2 mode */
+      vec4_instruction *header_inst = new(mem_ctx)
+         vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
+                          dst_reg(MRF, inst->base_mrf));
+
+      emit(header_inst);
+
+      inst->mlen = 2;
+      inst->header_size = 1;
+      param_base = inst->base_mrf + 1;
+   } else {
+      inst->mlen = 1;
+      param_base = inst->base_mrf;
+   }
+
    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
-   int param_base = inst->base_mrf;
    int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
    int zero_mask = 0xf & ~coord_mask;
 
@@ -2949,6 +2966,12 @@ vec4_visitor::visit(ir_end_primitive *)
 }
 
 void
+vec4_visitor::visit(ir_barrier *)
+{
+   unreachable("not reached");
+}
+
+void
 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
                                   dst_reg dst, src_reg offset,
                                   src_reg src0, src_reg src1)
@@ -3655,7 +3678,7 @@ vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
    *reg = neg_result;
 }
 
-vec4_visitor::vec4_visitor(struct brw_context *brw,
+vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
                            struct brw_vec4_compile *c,
                            struct gl_program *prog,
                            const struct brw_vue_prog_key *key,
@@ -3664,10 +3687,9 @@ vec4_visitor::vec4_visitor(struct brw_context *brw,
                            gl_shader_stage stage,
 			   void *mem_ctx,
                            bool no_spills,
-                           shader_time_shader_type st_base,
-                           shader_time_shader_type st_written,
-                           shader_time_shader_type st_reset)
-   : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
+                           int shader_time_index)
+   : backend_shader(compiler, NULL, mem_ctx,
+                    shader_prog, prog, &prog_data->base, stage),
      c(c),
      key(key),
      prog_data(prog_data),
@@ -3676,11 +3698,8 @@ vec4_visitor::vec4_visitor(struct brw_context *brw,
      first_non_payload_grf(0),
      need_all_constants_in_pull_buffer(false),
      no_spills(no_spills),
-     st_base(st_base),
-     st_written(st_written),
-     st_reset(st_reset)
+     shader_time_index(shader_time_index)
 {
-   this->mem_ctx = mem_ctx;
    this->failed = false;
 
    this->base_ir = NULL;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
index 92d108598a2..dcbd2405078 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
@@ -381,8 +381,7 @@ vec4_vs_visitor::emit_program_code()
          break;
 
       default:
-         _mesa_problem(ctx, "Unsupported opcode %s in vertex program\n",
-                       _mesa_opcode_string(vpi->Opcode));
+         assert(!"Unsupported opcode in vertex program");
       }
 
       /* Copy the temporary back into the actual destination register. */
@@ -574,15 +573,13 @@ vec4_vs_visitor::get_vp_src_reg(const prog_src_register &src)
          break;
 
       default:
-         _mesa_problem(ctx, "bad uniform src register file: %s\n",
-                       _mesa_register_file_name((gl_register_file)src.File));
+         assert(!"Bad uniform in src register file");
          return src_reg(this, glsl_type::vec4_type);
       }
       break;
 
    default:
-      _mesa_problem(ctx, "bad src register file: %s\n",
-                    _mesa_register_file_name((gl_register_file)src.File));
+      assert(!"Bad src register file");
       return src_reg(this, glsl_type::vec4_type);
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
index 4baf73ebde1..f93062b46d0 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
@@ -23,7 +23,6 @@
 
 
 #include "brw_vs.h"
-#include "main/context.h"
 
 
 namespace brw {
@@ -78,7 +77,7 @@ vec4_vs_visitor::emit_prolog()
             /* ES 3.0 has different rules for converting signed normalized
              * fixed-point numbers than desktop GL.
              */
-            if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
+            if ((wa_flags & BRW_ATTRIB_WA_SIGN) && !use_legacy_snorm_formula) {
                /* According to equation 2.2 of the ES 3.0 specification,
                 * signed normalization conversion is done by:
                 *
@@ -212,18 +211,21 @@ vec4_vs_visitor::emit_thread_end()
 }
 
 
-vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
+vec4_vs_visitor::vec4_vs_visitor(const struct brw_compiler *compiler,
                                  struct brw_vs_compile *vs_compile,
                                  struct brw_vs_prog_data *vs_prog_data,
                                  struct gl_shader_program *prog,
-                                 void *mem_ctx)
-   : vec4_visitor(brw, &vs_compile->base, &vs_compile->vp->program.Base,
+                                 void *mem_ctx,
+                                 int shader_time_index,
+                                 bool use_legacy_snorm_formula)
+   : vec4_visitor(compiler, &vs_compile->base, &vs_compile->vp->program.Base,
                   &vs_compile->key.base, &vs_prog_data->base, prog,
                   MESA_SHADER_VERTEX,
                   mem_ctx, false /* no_spills */,
-                  ST_VS, ST_VS_WRITTEN, ST_VS_RESET),
+                  shader_time_index),
      vs_compile(vs_compile),
-     vs_prog_data(vs_prog_data)
+     vs_prog_data(vs_prog_data),
+     use_legacy_snorm_formula(use_legacy_snorm_formula)
 {
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index d03567e33b8..6e9848fb1e9 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -40,108 +40,6 @@
 
 #include "util/ralloc.h"
 
-static inline void assign_vue_slot(struct brw_vue_map *vue_map,
-                                   int varying)
-{
-   /* Make sure this varying hasn't been assigned a slot already */
-   assert (vue_map->varying_to_slot[varying] == -1);
-
-   vue_map->varying_to_slot[varying] = vue_map->num_slots;
-   vue_map->slot_to_varying[vue_map->num_slots++] = varying;
-}
-
-/**
- * Compute the VUE map for vertex shader program.
- */
-void
-brw_compute_vue_map(const struct brw_device_info *devinfo,
-                    struct brw_vue_map *vue_map,
-                    GLbitfield64 slots_valid)
-{
-   vue_map->slots_valid = slots_valid;
-   int i;
-
-   /* gl_Layer and gl_ViewportIndex don't get their own varying slots -- they
-    * are stored in the first VUE slot (VARYING_SLOT_PSIZ).
-    */
-   slots_valid &= ~(VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
-
-   /* Make sure that the values we store in vue_map->varying_to_slot and
-    * vue_map->slot_to_varying won't overflow the signed chars that are used
-    * to store them.  Note that since vue_map->slot_to_varying sometimes holds
-    * values equal to BRW_VARYING_SLOT_COUNT, we need to ensure that
-    * BRW_VARYING_SLOT_COUNT is <= 127, not 128.
-    */
-   STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 127);
-
-   vue_map->num_slots = 0;
-   for (i = 0; i < BRW_VARYING_SLOT_COUNT; ++i) {
-      vue_map->varying_to_slot[i] = -1;
-      vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_COUNT;
-   }
-
-   /* VUE header: format depends on chip generation and whether clipping is
-    * enabled.
-    */
-   if (devinfo->gen < 6) {
-      /* There are 8 dwords in VUE header pre-Ironlake:
-       * dword 0-3 is indices, point width, clip flags.
-       * dword 4-7 is ndc position
-       * dword 8-11 is the first vertex data.
-       *
-       * On Ironlake the VUE header is nominally 20 dwords, but the hardware
-       * will accept the same header layout as Gen4 [and should be a bit faster]
-       */
-      assign_vue_slot(vue_map, VARYING_SLOT_PSIZ);
-      assign_vue_slot(vue_map, BRW_VARYING_SLOT_NDC);
-      assign_vue_slot(vue_map, VARYING_SLOT_POS);
-   } else {
-      /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
-       * dword 0-3 of the header is indices, point width, clip flags.
-       * dword 4-7 is the 4D space position
-       * dword 8-15 of the vertex header is the user clip distance if
-       * enabled.
-       * dword 8-11 or 16-19 is the first vertex element data we fill.
-       */
-      assign_vue_slot(vue_map, VARYING_SLOT_PSIZ);
-      assign_vue_slot(vue_map, VARYING_SLOT_POS);
-      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0))
-         assign_vue_slot(vue_map, VARYING_SLOT_CLIP_DIST0);
-      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1))
-         assign_vue_slot(vue_map, VARYING_SLOT_CLIP_DIST1);
-
-      /* front and back colors need to be consecutive so that we can use
-       * ATTRIBUTE_SWIZZLE_INPUTATTR_FACING to swizzle them when doing
-       * two-sided color.
-       */
-      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_COL0))
-         assign_vue_slot(vue_map, VARYING_SLOT_COL0);
-      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_BFC0))
-         assign_vue_slot(vue_map, VARYING_SLOT_BFC0);
-      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_COL1))
-         assign_vue_slot(vue_map, VARYING_SLOT_COL1);
-      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_BFC1))
-         assign_vue_slot(vue_map, VARYING_SLOT_BFC1);
-   }
-
-   /* The hardware doesn't care about the rest of the vertex outputs, so just
-    * assign them contiguously.  Don't reassign outputs that already have a
-    * slot.
-    *
-    * We generally don't need to assign a slot for VARYING_SLOT_CLIP_VERTEX,
-    * since it's encoded as the clip distances by emit_clip_distances().
-    * However, it may be output by transform feedback, and we'd rather not
-    * recompute state when TF changes, so we just always include it.
-    */
-   for (int i = 0; i < VARYING_SLOT_MAX; ++i) {
-      if ((slots_valid & BITFIELD64_BIT(i)) &&
-          vue_map->varying_to_slot[i] == -1) {
-         assign_vue_slot(vue_map, i);
-      }
-   }
-}
-
-
 /**
  * Decide which set of clip planes should be used when clipping via
  * gl_Position or gl_ClipVertex.
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 6157ae6ffa9..61f9b006a58 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -90,11 +90,13 @@ namespace brw {
 class vec4_vs_visitor : public vec4_visitor
 {
 public:
-   vec4_vs_visitor(struct brw_context *brw,
+   vec4_vs_visitor(const struct brw_compiler *compiler,
                    struct brw_vs_compile *vs_compile,
                    struct brw_vs_prog_data *vs_prog_data,
                    struct gl_shader_program *prog,
-                   void *mem_ctx);
+                   void *mem_ctx,
+                   int shader_time_index,
+                   bool use_legacy_snorm_formula);
 
 protected:
    virtual dst_reg *make_reg_for_system_value(ir_variable *ir);
@@ -115,6 +117,8 @@ private:
    struct brw_vs_prog_data * const vs_prog_data;
    src_reg *vp_temp_regs;
    src_reg vp_addr_reg;
+
+   bool use_legacy_snorm_formula;
 };
 
 } /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index f82a62b4851..b2f91bd412b 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -121,7 +121,7 @@ brw_upload_vs_pull_constants(struct brw_context *brw)
    /* BRW_NEW_VS_PROG_DATA */
    const struct brw_stage_prog_data *prog_data = &brw->vs.prog_data->base.base;
 
-   dword_pitch = brw->vs.prog_data->base.simd8;
+   dword_pitch = brw->vs.prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
 
    /* _NEW_PROGRAM_CONSTANTS */
    brw_upload_pull_constants(brw, BRW_NEW_VS_CONSTBUF, &vp->program.Base,
@@ -151,7 +151,7 @@ brw_upload_vs_ubo_surfaces(struct brw_context *brw)
       return;
 
    /* BRW_NEW_VS_PROG_DATA */
-   dword_pitch = brw->vs.prog_data->base.simd8;
+   dword_pitch = brw->vs.prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
    brw_upload_ubo_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_VERTEX],
                            &brw->vs.base, &brw->vs.prog_data->base.base,
                            dword_pitch);
diff --git a/src/mesa/drivers/dri/i965/brw_vue_map.c b/src/mesa/drivers/dri/i965/brw_vue_map.c
new file mode 100644
index 00000000000..76875789ba8
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vue_map.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file brw_vue_map.c
+ *
+ * This file computes the "VUE map" for a (non-fragment) shader stage, which
+ * describes the layout of its output varyings.  The VUE map is used to match
+ * outputs from one stage with the inputs of the next.
+ *
+ * Largely, varyings can be placed however we like - producers/consumers simply
+ * have to agree on the layout.  However, there is also a "VUE Header" that
+ * prescribes a fixed-layout for items that interact with fixed function
+ * hardware, such as the clipper and rasterizer.
+ *
+ * Authors:
+ *   Paul Berry <[email protected]>
+ *   Chris Forbes <[email protected]>
+ *   Eric Anholt <[email protected]>
+ */
+
+
+#include "main/compiler.h"
+#include "brw_context.h"
+
+static inline void
+assign_vue_slot(struct brw_vue_map *vue_map, int varying)
+{
+   /* Make sure this varying hasn't been assigned a slot already */
+   assert (vue_map->varying_to_slot[varying] == -1);
+
+   vue_map->varying_to_slot[varying] = vue_map->num_slots;
+   vue_map->slot_to_varying[vue_map->num_slots++] = varying;
+}
+
+/**
+ * Compute the VUE map for a shader stage.
+ */
+void
+brw_compute_vue_map(const struct brw_device_info *devinfo,
+                    struct brw_vue_map *vue_map,
+                    GLbitfield64 slots_valid)
+{
+   vue_map->slots_valid = slots_valid;
+   int i;
+
+   /* gl_Layer and gl_ViewportIndex don't get their own varying slots -- they
+    * are stored in the first VUE slot (VARYING_SLOT_PSIZ).
+    */
+   slots_valid &= ~(VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
+
+   /* Make sure that the values we store in vue_map->varying_to_slot and
+    * vue_map->slot_to_varying won't overflow the signed chars that are used
+    * to store them.  Note that since vue_map->slot_to_varying sometimes holds
+    * values equal to BRW_VARYING_SLOT_COUNT, we need to ensure that
+    * BRW_VARYING_SLOT_COUNT is <= 127, not 128.
+    */
+   STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 127);
+
+   vue_map->num_slots = 0;
+   for (i = 0; i < BRW_VARYING_SLOT_COUNT; ++i) {
+      vue_map->varying_to_slot[i] = -1;
+      vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_COUNT;
+   }
+
+   /* VUE header: format depends on chip generation and whether clipping is
+    * enabled.
+    *
+    * See the Sandybridge PRM, Volume 2 Part 1, section 1.5.1 (page 30),
+    * "Vertex URB Entry (VUE) Formats" which describes the VUE header layout.
+    */
+   if (devinfo->gen < 6) {
+      /* There are 8 dwords in VUE header pre-Ironlake:
+       * dword 0-3 is indices, point width, clip flags.
+       * dword 4-7 is ndc position
+       * dword 8-11 is the first vertex data.
+       *
+       * On Ironlake the VUE header is nominally 20 dwords, but the hardware
+       * will accept the same header layout as Gen4 [and should be a bit faster]
+       */
+      assign_vue_slot(vue_map, VARYING_SLOT_PSIZ);
+      assign_vue_slot(vue_map, BRW_VARYING_SLOT_NDC);
+      assign_vue_slot(vue_map, VARYING_SLOT_POS);
+   } else {
+      /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
+       * dword 0-3 of the header is indices, point width, clip flags.
+       * dword 4-7 is the 4D space position
+       * dword 8-15 of the vertex header is the user clip distance if
+       * enabled.
+       * dword 8-11 or 16-19 is the first vertex element data we fill.
+       */
+      assign_vue_slot(vue_map, VARYING_SLOT_PSIZ);
+      assign_vue_slot(vue_map, VARYING_SLOT_POS);
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0))
+         assign_vue_slot(vue_map, VARYING_SLOT_CLIP_DIST0);
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1))
+         assign_vue_slot(vue_map, VARYING_SLOT_CLIP_DIST1);
+
+      /* front and back colors need to be consecutive so that we can use
+       * ATTRIBUTE_SWIZZLE_INPUTATTR_FACING to swizzle them when doing
+       * two-sided color.
+       */
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_COL0))
+         assign_vue_slot(vue_map, VARYING_SLOT_COL0);
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_BFC0))
+         assign_vue_slot(vue_map, VARYING_SLOT_BFC0);
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_COL1))
+         assign_vue_slot(vue_map, VARYING_SLOT_COL1);
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_BFC1))
+         assign_vue_slot(vue_map, VARYING_SLOT_BFC1);
+   }
+
+   /* The hardware doesn't care about the rest of the vertex outputs, so just
+    * assign them contiguously.  Don't reassign outputs that already have a
+    * slot.
+    *
+    * We generally don't need to assign a slot for VARYING_SLOT_CLIP_VERTEX,
+    * since it's encoded as the clip distances by emit_clip_distances().
+    * However, it may be output by transform feedback, and we'd rather not
+    * recompute state when TF changes, so we just always include it.
+    */
+   for (int i = 0; i < VARYING_SLOT_MAX; ++i) {
+      if ((slots_valid & BITFIELD64_BIT(i)) &&
+          vue_map->varying_to_slot[i] == -1) {
+         assign_vue_slot(vue_map, i);
+      }
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 5496225a6c7..4619ce1080d 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -36,6 +36,7 @@
 #include "main/formats.h"
 #include "main/fbobject.h"
 #include "main/samplerobj.h"
+#include "main/framebuffer.h"
 #include "program/prog_parameter.h"
 #include "program/program.h"
 #include "intel_mipmap_tree.h"
@@ -462,7 +463,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
    GLuint lookup = 0;
    GLuint line_aa;
    bool program_uses_dfdy = fp->program.UsesDFdy;
-   bool multisample_fbo = ctx->DrawBuffer->Visual.samples > 1;
+   const bool multisample_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
 
    memset(key, 0, sizeof(*key));
 
@@ -561,7 +562,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
     * drawable height in order to invert the Y axis.
     */
    if (fp->program.Base.InputsRead & VARYING_BIT_POS) {
-      key->drawable_height = ctx->DrawBuffer->Height;
+      key->drawable_height = _mesa_geometric_height(ctx->DrawBuffer);
    }
 
    if ((fp->program.Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
@@ -580,7 +581,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
    key->persample_shading =
       _mesa_get_min_invocations_per_fragment(ctx, &fp->program, true) > 1;
    if (key->persample_shading)
-      key->persample_2x = ctx->DrawBuffer->Visual.samples == 2;
+      key->persample_2x = _mesa_geometric_samples(ctx->DrawBuffer) == 2;
 
    key->compute_pos_offset =
       _mesa_get_min_invocations_per_fragment(ctx, &fp->program, false) > 1 &&
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 160dd2f6c62..72aad96bb6a 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -35,6 +35,7 @@
 #include "main/mtypes.h"
 #include "main/samplerobj.h"
 #include "program/prog_parameter.h"
+#include "main/framebuffer.h"
 
 #include "intel_mipmap_tree.h"
 #include "intel_batchbuffer.h"
@@ -738,6 +739,9 @@ brw_update_renderbuffer_surfaces(struct brw_context *brw,
                                  uint32_t *surf_offset)
 {
    GLuint i;
+   const unsigned int w = _mesa_geometric_width(fb);
+   const unsigned int h = _mesa_geometric_height(fb);
+   const unsigned int s = _mesa_geometric_samples(fb);
 
    /* Update surfaces for drawing buffers */
    if (fb->_NumColorDrawBuffers >= 1) {
@@ -748,17 +752,15 @@ brw_update_renderbuffer_surfaces(struct brw_context *brw,
             surf_offset[surf_index] = 
                brw->vtbl.update_renderbuffer_surface(
                   brw, fb->_ColorDrawBuffers[i],
-                  fb->MaxNumLayers > 0, i, surf_index);
+                  _mesa_geometric_layers(fb) > 0, i, surf_index);
 	 } else {
-            brw->vtbl.emit_null_surface_state(
-               brw, fb->Width, fb->Height, fb->Visual.samples,
+            brw->vtbl.emit_null_surface_state(brw, w, h, s,
                &surf_offset[surf_index]);
 	 }
       }
    } else {
       const uint32_t surf_index = render_target_start;
-      brw->vtbl.emit_null_surface_state(
-         brw, fb->Width, fb->Height, fb->Visual.samples,
+      brw->vtbl.emit_null_surface_state(brw, w, h, s,
          &surf_offset[surf_index]);
    }
 }
diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c
index aaf90df2b9c..9a29366f0e0 100644
--- a/src/mesa/drivers/dri/i965/gen6_clip_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c
@@ -31,6 +31,7 @@
 #include "brw_util.h"
 #include "intel_batchbuffer.h"
 #include "main/fbobject.h"
+#include "main/framebuffer.h"
 
 static void
 upload_clip_state(struct brw_context *brw)
@@ -145,11 +146,14 @@ upload_clip_state(struct brw_context *brw)
     * the viewport, so we can ignore this restriction.
     */
    if (brw->gen < 8) {
+      const float fb_width = (float)_mesa_geometric_width(fb);
+      const float fb_height = (float)_mesa_geometric_height(fb);
+
       for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
          if (ctx->ViewportArray[i].X != 0 ||
              ctx->ViewportArray[i].Y != 0 ||
-             ctx->ViewportArray[i].Width != (float) fb->Width ||
-             ctx->ViewportArray[i].Height != (float) fb->Height) {
+             ctx->ViewportArray[i].Width != fb_width ||
+             ctx->ViewportArray[i].Height != fb_height) {
             dw2 &= ~GEN6_CLIP_GB_TEST;
             break;
          }
@@ -179,7 +183,7 @@ upload_clip_state(struct brw_context *brw)
 	     dw2);
    OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT |
              U_FIXED(255.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT |
-             (fb->MaxNumLayers > 0 ? 0 : GEN6_CLIP_FORCE_ZERO_RTAINDEX) |
+             (_mesa_geometric_layers(fb) > 0 ? 0 : GEN6_CLIP_FORCE_ZERO_RTAINDEX) |
              ((ctx->Const.MaxViewports - 1) & GEN6_CLIP_MAX_VP_INDEX_MASK));
    ADVANCE_BATCH();
 }
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
index 28f23c9e4f7..27254ebb727 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
@@ -35,12 +35,13 @@ namespace brw {
 class gen6_gs_visitor : public vec4_gs_visitor
 {
 public:
-   gen6_gs_visitor(struct brw_context *brw,
+   gen6_gs_visitor(const struct brw_compiler *comp,
                    struct brw_gs_compile *c,
                    struct gl_shader_program *prog,
                    void *mem_ctx,
-                   bool no_spills) :
-      vec4_gs_visitor(brw, c, prog, mem_ctx, no_spills) {}
+                   bool no_spills,
+                   int shader_time_index) :
+      vec4_gs_visitor(comp, c, prog, mem_ctx, no_spills, shader_time_index) {}
 
 protected:
    virtual void assign_binding_table_offsets();
diff --git a/src/mesa/drivers/dri/i965/gen6_multisample_state.c b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
index ec46479ff75..36734f598fe 100644
--- a/src/mesa/drivers/dri/i965/gen6_multisample_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
@@ -26,6 +26,7 @@
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_multisample_state.h"
+#include "main/framebuffer.h"
 
 void
 gen6_get_sample_position(struct gl_context *ctx,
@@ -34,7 +35,7 @@ gen6_get_sample_position(struct gl_context *ctx,
 {
    uint8_t bits;
 
-   switch (fb->Visual.samples) {
+   switch (_mesa_geometric_samples(fb)) {
    case 1:
       result[0] = result[1] = 0.5f;
       return;
diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
index 6431ed56d81..ba5c944fb3d 100644
--- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
+++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
@@ -246,7 +246,7 @@ gen6_queryobj_get_results(struct gl_context *ctx,
        * and correctly emitted the number of pixel shader invocations, but,
        * whomever forgot to undo the multiply by 4.
        */
-      if (brw->gen >= 8 || brw->is_haswell)
+      if (brw->gen == 8 || brw->is_haswell)
          query->Base.Result /= 4;
       break;
 
diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
index 0111f152ef6..17b4a7fba96 100644
--- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
@@ -39,6 +39,8 @@ gen6_upload_scissor_state(struct brw_context *brw)
    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
    struct gen6_scissor_rect *scissor;
    uint32_t scissor_state_offset;
+   const unsigned int fb_width= _mesa_geometric_width(ctx->DrawBuffer);
+   const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
 
    scissor = brw_state_batch(brw, AUB_TRACE_SCISSOR_STATE,
 			     sizeof(*scissor) * ctx->Const.MaxViewports, 32,
@@ -56,7 +58,11 @@ gen6_upload_scissor_state(struct brw_context *brw)
    for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
       int bbox[4];
 
-      _mesa_scissor_bounding_box(ctx, ctx->DrawBuffer, i, bbox);
+      bbox[0] = 0;
+      bbox[1] = fb_width;
+      bbox[2] = 0;
+      bbox[3] = fb_height;
+      _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
 
       if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
          /* If the scissor was out of bounds and got clamped to 0 width/height
@@ -80,8 +86,8 @@ gen6_upload_scissor_state(struct brw_context *brw)
          /* memory: Y=0=top */
          scissor[i].xmin = bbox[0];
          scissor[i].xmax = bbox[1] - 1;
-         scissor[i].ymin = ctx->DrawBuffer->Height - bbox[3];
-         scissor[i].ymax = ctx->DrawBuffer->Height - bbox[2] - 1;
+         scissor[i].ymin = fb_height - bbox[3];
+         scissor[i].ymax = fb_height - bbox[2] - 1;
       }
    }
    BEGIN_BATCH(2);
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index e445ce25600..b00517ed81e 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -31,6 +31,7 @@
 #include "brw_util.h"
 #include "main/macros.h"
 #include "main/fbobject.h"
+#include "main/framebuffer.h"
 #include "intel_batchbuffer.h"
 
 /**
@@ -273,7 +274,7 @@ upload_sf_state(struct brw_context *brw)
    int i;
    /* _NEW_BUFFER */
    bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
-   bool multisampled_fbo = ctx->DrawBuffer->Visual.samples > 1;
+   const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
 
    const int urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET;
    float point_size;
@@ -361,31 +362,7 @@ upload_sf_state(struct brw_context *brw)
 
    /* _NEW_LINE */
    {
-      /* OpenGL dictates that line width should be rounded to the nearest
-       * integer
-       */
-      float line_width =
-         roundf(CLAMP(ctx->Line.Width, 0.0, ctx->Const.MaxLineWidth));
-      uint32_t line_width_u3_7 = U_FIXED(line_width, 7);
-
-      /* Line width of 0 is not allowed when MSAA enabled */
-      if (ctx->Multisample._Enabled) {
-         if (line_width_u3_7 == 0)
-             line_width_u3_7 = 1;
-      } else if (ctx->Line.SmoothFlag && ctx->Line.Width < 1.5) {
-         /* For 1 pixel line thickness or less, the general
-          * anti-aliasing algorithm gives up, and a garbage line is
-          * generated.  Setting a Line Width of 0.0 specifies the
-          * rasterization of the "thinnest" (one-pixel-wide),
-          * non-antialiased lines.
-          *
-          * Lines rendered with zero Line Width are rasterized using
-          * Grid Intersection Quantization rules as specified by
-          * bspec section 6.3.12.1 Zero-Width (Cosmetic) Line
-          * Rasterization.
-          */
-         line_width_u3_7 = 0;
-      }
+      uint32_t line_width_u3_7 = brw_get_line_width(brw);
       dw3 |= line_width_u3_7 << GEN6_SF_LINE_WIDTH_SHIFT;
    }
    if (ctx->Line.SmoothFlag) {
diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
index 2fb0182c56e..7c8d8849f4e 100644
--- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
@@ -30,6 +30,7 @@
 #include "brw_defines.h"
 #include "intel_batchbuffer.h"
 #include "main/fbobject.h"
+#include "main/framebuffer.h"
 #include "main/viewport.h"
 
 /* The clip VP defines the guardband region where expensive clipping is skipped
@@ -93,10 +94,10 @@ gen6_upload_sf_vp(struct brw_context *brw)
    /* _NEW_BUFFERS */
    if (render_to_fbo) {
       y_scale = 1.0;
-      y_bias = 0;
+      y_bias = 0.0;
    } else {
       y_scale = -1.0;
-      y_bias = ctx->DrawBuffer->Height;
+      y_bias = (float)_mesa_geometric_height(ctx->DrawBuffer);
    }
 
    for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index 7081eb73428..d1748ba7457 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -33,6 +33,7 @@
 #include "program/program.h"
 #include "program/prog_parameter.h"
 #include "program/prog_statevars.h"
+#include "main/framebuffer.h"
 #include "intel_batchbuffer.h"
 
 static void
@@ -284,7 +285,7 @@ upload_wm_state(struct brw_context *brw)
    const struct brw_wm_prog_data *prog_data = brw->wm.prog_data;
 
    /* _NEW_BUFFERS */
-   const bool multisampled_fbo = ctx->DrawBuffer->Visual.samples > 1;
+   const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
 
    /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16
     * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c b/src/mesa/drivers/dri/i965/gen7_gs_state.c
index e1c4f8b5d14..8d6d3fe1d34 100644
--- a/src/mesa/drivers/dri/i965/gen7_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c
@@ -112,7 +112,7 @@ upload_gs_state(struct brw_context *brw)
           GEN7_GS_CONTROL_DATA_HEADER_SIZE_SHIFT) |
          ((brw->gs.prog_data->invocations - 1) <<
           GEN7_GS_INSTANCE_CONTROL_SHIFT) |
-         brw->gs.prog_data->dispatch_mode |
+         SET_FIELD(prog_data->dispatch_mode, GEN7_GS_DISPATCH_MODE) |
          GEN6_GS_STATISTICS_ENABLE |
          (brw->gs.prog_data->include_primitive_id ?
           GEN7_GS_INCLUDE_PRIMITIVE_ID : 0) |
diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c
index 58e33370c57..4fa46a8eb97 100644
--- a/src/mesa/drivers/dri/i965/gen7_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sf_state.c
@@ -27,6 +27,7 @@
 #include "brw_util.h"
 #include "main/macros.h"
 #include "main/fbobject.h"
+#include "main/framebuffer.h"
 #include "intel_batchbuffer.h"
 
 static void
@@ -109,7 +110,7 @@ upload_sf_state(struct brw_context *brw)
    float point_size;
    /* _NEW_BUFFERS */
    bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
-   bool multisampled_fbo = ctx->DrawBuffer->Visual.samples > 1;
+   const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
 
    dw1 = GEN6_SF_STATISTICS_ENABLE;
 
@@ -192,30 +193,7 @@ upload_sf_state(struct brw_context *brw)
 
    /* _NEW_LINE */
    {
-      /* OpenGL dictates that line width should be rounded to the nearest
-       * integer
-       */
-      float line_width =
-         roundf(CLAMP(ctx->Line.Width, 0.0, ctx->Const.MaxLineWidth));
-      uint32_t line_width_u3_7 = U_FIXED(line_width, 7);
-      /* Line width of 0 is not allowed when MSAA enabled */
-      if (ctx->Multisample._Enabled) {
-         if (line_width_u3_7 == 0)
-             line_width_u3_7 = 1;
-      } else if (ctx->Line.SmoothFlag && ctx->Line.Width < 1.5) {
-         /* For 1 pixel line thickness or less, the general
-          * anti-aliasing algorithm gives up, and a garbage line is
-          * generated.  Setting a Line Width of 0.0 specifies the
-          * rasterization of the "thinnest" (one-pixel-wide),
-          * non-antialiased lines.
-          *
-          * Lines rendered with zero Line Width are rasterized using
-          * Grid Intersection Quantization rules as specified by
-          * bspec section 6.3.12.1 Zero-Width (Cosmetic) Line
-          * Rasterization.
-          */
-         line_width_u3_7 = 0;
-      }
+      uint32_t line_width_u3_7 = brw_get_line_width(brw);
       dw2 |= line_width_u3_7 << GEN6_SF_LINE_WIDTH_SHIFT;
    }
    if (ctx->Line.SmoothFlag) {
diff --git a/src/mesa/drivers/dri/i965/gen7_viewport_state.c b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
index eb596845b72..b655205ec35 100644
--- a/src/mesa/drivers/dri/i965/gen7_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
@@ -26,6 +26,7 @@
 #include "brw_defines.h"
 #include "intel_batchbuffer.h"
 #include "main/fbobject.h"
+#include "main/framebuffer.h"
 #include "main/viewport.h"
 
 static void
@@ -45,10 +46,10 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw)
    /* _NEW_BUFFERS */
    if (render_to_fbo) {
       y_scale = 1.0;
-      y_bias = 0;
+      y_bias = 0.0;
    } else {
       y_scale = -1.0;
-      y_bias = ctx->DrawBuffer->Height;
+      y_bias = (float)_mesa_geometric_height(ctx->DrawBuffer);
    }
 
    for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
index 278b3ec6d21..4b17d06fa83 100644
--- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
@@ -43,18 +43,52 @@ gen7_upload_constant_state(struct brw_context *brw,
    int dwords = brw->gen >= 8 ? 11 : 7;
    BEGIN_BATCH(dwords);
    OUT_BATCH(opcode << 16 | (dwords - 2));
-   OUT_BATCH(active ? stage_state->push_const_size : 0);
-   OUT_BATCH(0);
+
+   /* Workaround for SKL+ (we use option #2 until we have a need for more
+    * constant buffers). This comes from the documentation for 3DSTATE_CONSTANT_*
+    *
+    * The driver must ensure The following case does not occur without a flush
+    * to the 3D engine: 3DSTATE_CONSTANT_* with buffer 3 read length equal to
+    * zero committed followed by a 3DSTATE_CONSTANT_* with buffer 0 read length
+    * not equal to zero committed. Possible ways to avoid this condition
+    * include:
+    *     1. always force buffer 3 to have a non zero read length
+    *     2. always force buffer 0 to a zero read length
+    */
+   if (brw->gen >= 9 && active) {
+      OUT_BATCH(0);
+      OUT_BATCH(stage_state->push_const_size);
+   } else {
+      OUT_BATCH(active ? stage_state->push_const_size : 0);
+      OUT_BATCH(0);
+   }
    /* Pointer to the constant buffer.  Covered by the set of state flags
     * from gen6_prepare_wm_contants
     */
-   OUT_BATCH(active ? (stage_state->push_const_offset | mocs) : 0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   if (brw->gen >= 8) {
+   if (brw->gen >= 9 && active) {
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      /* XXX: When using buffers other than 0, you need to specify the
+       * graphics virtual address regardless of INSPM/debug bits
+       */
+      OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_RENDER, 0,
+                  stage_state->push_const_offset);
       OUT_BATCH(0);
       OUT_BATCH(0);
+   } else if (brw->gen>= 8) {
+      OUT_BATCH(active ? (stage_state->push_const_offset | mocs) : 0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+   } else {
+      OUT_BATCH(active ? (stage_state->push_const_offset | mocs) : 0);
+      OUT_BATCH(0);
       OUT_BATCH(0);
       OUT_BATCH(0);
    }
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index b9182758852..ea11ae845e3 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -30,6 +30,7 @@
 #include "program/program.h"
 #include "program/prog_parameter.h"
 #include "program/prog_statevars.h"
+#include "main/framebuffer.h"
 #include "intel_batchbuffer.h"
 
 static void
@@ -45,7 +46,7 @@ upload_wm_state(struct brw_context *brw)
    uint32_t dw1, dw2;
 
    /* _NEW_BUFFERS */
-   bool multisampled_fbo = ctx->DrawBuffer->Visual.samples > 1;
+   const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
 
    dw1 = dw2 = 0;
    dw1 |= GEN7_WM_STATISTICS_ENABLE;
@@ -76,6 +77,10 @@ upload_wm_state(struct brw_context *brw)
       dw1 |= GEN7_WM_KILL_ENABLE;
    }
 
+   if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx)) {
+      dw1 |= GEN7_WM_DISPATCH_ENABLE;
+   }
+
    /* _NEW_BUFFERS | _NEW_COLOR */
    if (brw_color_buffer_write_enabled(brw) || writes_depth ||
        dw1 & GEN7_WM_KILL_ENABLE) {
diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c
index b502650f991..12ac97a5d14 100644
--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -417,6 +417,16 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
    uint32_t surface_width  = ALIGN(mt->logical_width0,  level == 0 ? 8 : 1);
    uint32_t surface_height = ALIGN(mt->logical_height0, level == 0 ? 4 : 1);
 
+   /* From the documentation for 3DSTATE_WM_HZ_OP: "3DSTATE_MULTISAMPLE packet
+    * must be used prior to this packet to change the Number of Multisamples.
+    * This packet must not be used to change Number of Multisamples in a
+    * rendering sequence."
+    */
+   if (brw->num_samples != mt->num_samples) {
+      gen8_emit_3dstate_multisample(brw, mt->num_samples);
+      brw->NewGLState |= _NEW_MULTISAMPLE;
+   }
+
    /* The basic algorithm is:
     * - If needed, emit 3DSTATE_{DEPTH,HIER_DEPTH,STENCIL}_BUFFER and
     *   3DSTATE_CLEAR_PARAMS packets to set up the relevant buffers.
diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c b/src/mesa/drivers/dri/i965/gen8_gs_state.c
index 46b97131e20..26a02d3b045 100644
--- a/src/mesa/drivers/dri/i965/gen8_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c
@@ -48,8 +48,7 @@ gen8_upload_gs_state(struct brw_context *brw)
       OUT_BATCH(_3DSTATE_GS << 16 | (10 - 2));
       OUT_BATCH(stage_state->prog_offset);
       OUT_BATCH(0);
-      OUT_BATCH(GEN6_GS_VECTOR_MASK_ENABLE |
-                brw->geometry_program->VerticesIn |
+      OUT_BATCH(brw->geometry_program->VerticesIn |
                 ((ALIGN(stage_state->sampler_count, 4)/4) <<
                  GEN6_GS_SAMPLER_COUNT_SHIFT) |
                 ((prog_data->base.binding_table.size_bytes / 4) <<
@@ -59,10 +58,6 @@ gen8_upload_gs_state(struct brw_context *brw)
          OUT_RELOC64(stage_state->scratch_bo,
                      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                      ffs(brw->gs.prog_data->base.base.total_scratch) - 11);
-         WARN_ONCE(true,
-                   "May need to implement a temporary workaround: GS Number of "
-                   "URB Entries must be less than or equal to the GS Maximum "
-                   "Number of Threads.\n");
       } else {
          OUT_BATCH(0);
          OUT_BATCH(0);
@@ -81,7 +76,8 @@ gen8_upload_gs_state(struct brw_context *brw)
 
       uint32_t dw7 = (brw->gs.prog_data->control_data_header_size_hwords <<
                       GEN7_GS_CONTROL_DATA_HEADER_SIZE_SHIFT) |
-                      brw->gs.prog_data->dispatch_mode |
+                     SET_FIELD(prog_data->dispatch_mode,
+                               GEN7_GS_DISPATCH_MODE) |
                      ((brw->gs.prog_data->invocations - 1) <<
                       GEN7_GS_INSTANCE_CONTROL_SHIFT) |
                       GEN6_GS_STATISTICS_ENABLE |
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index 85ad3b6c551..a88f109c691 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -58,6 +58,9 @@ gen8_upload_ps_extra(struct brw_context *brw,
    if (prog_data->uses_omask)
       dw1 |= GEN8_PSX_OMASK_TO_RENDER_TARGET;
 
+   if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx))
+      dw1 |= GEN8_PSX_SHADER_HAS_UAV;
+
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_PS_EXTRA << 16 | (2 - 2));
    OUT_BATCH(dw1);
@@ -72,7 +75,7 @@ upload_ps_extra(struct brw_context *brw)
       brw_fragment_program_const(brw->fragment_program);
    /* BRW_NEW_FS_PROG_DATA */
    const struct brw_wm_prog_data *prog_data = brw->wm.prog_data;
-   /* BRW_NEW_NUM_SAMPLES | _NEW_MULTISAMPLE */
+   /* BRW_NEW_NUM_SAMPLES */
    const bool multisampled_fbo = brw->num_samples > 1;
 
    gen8_upload_ps_extra(brw, &fp->program, prog_data, multisampled_fbo);
@@ -80,7 +83,7 @@ upload_ps_extra(struct brw_context *brw)
 
 const struct brw_tracked_state gen8_ps_extra = {
    .dirty = {
-      .mesa  = _NEW_MULTISAMPLE,
+      .mesa  = 0,
       .brw   = BRW_NEW_CONTEXT |
                BRW_NEW_FRAGMENT_PROGRAM |
                BRW_NEW_FS_PROG_DATA |
diff --git a/src/mesa/drivers/dri/i965/gen8_sf_state.c b/src/mesa/drivers/dri/i965/gen8_sf_state.c
index 52a21b6a8e8..c2b585d0001 100644
--- a/src/mesa/drivers/dri/i965/gen8_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_sf_state.c
@@ -154,14 +154,7 @@ upload_sf(struct brw_context *brw)
        dw1 |= GEN6_SF_VIEWPORT_TRANSFORM_ENABLE;
 
    /* _NEW_LINE */
-   /* OpenGL dictates that line width should be rounded to the nearest
-    * integer
-    */
-   float line_width =
-      roundf(CLAMP(ctx->Line.Width, 0.0, ctx->Const.MaxLineWidth));
-   uint32_t line_width_u3_7 = U_FIXED(line_width, 7);
-   if (line_width_u3_7 == 0)
-      line_width_u3_7 = 1;
+   uint32_t line_width_u3_7 = brw_get_line_width(brw);
    if (brw->gen >= 9 || brw->is_cherryview) {
       dw1 |= line_width_u3_7 << GEN9_SF_LINE_WIDTH_SHIFT;
    } else {
diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c
index d0c2d80b17b..b2d1a579815 100644
--- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
@@ -57,6 +57,19 @@ swizzle_to_scs(unsigned swizzle)
 }
 
 static uint32_t
+surface_tiling_resource_mode(uint32_t tr_mode)
+{
+   switch (tr_mode) {
+   case INTEL_MIPTREE_TRMODE_YF:
+      return GEN9_SURFACE_TRMODE_TILEYF;
+   case INTEL_MIPTREE_TRMODE_YS:
+      return GEN9_SURFACE_TRMODE_TILEYS;
+   default:
+      return GEN9_SURFACE_TRMODE_NONE;
+   }
+}
+
+static uint32_t
 surface_tiling_mode(uint32_t tiling)
 {
    switch (tiling) {
@@ -70,8 +83,18 @@ surface_tiling_mode(uint32_t tiling)
 }
 
 static unsigned
-vertical_alignment(const struct intel_mipmap_tree *mt)
+vertical_alignment(const struct brw_context *brw,
+                   const struct intel_mipmap_tree *mt,
+                   uint32_t surf_type)
 {
+   /* On Gen9+ vertical alignment is ignored for 1D surfaces and when
+    * tr_mode is not TRMODE_NONE.
+    */
+   if (brw->gen > 8 &&
+       (mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE ||
+        surf_type == BRW_SURFACE_1D))
+      return 0;
+
    switch (mt->align_h) {
    case 4:
       return GEN8_SURFACE_VALIGN_4;
@@ -85,8 +108,18 @@ vertical_alignment(const struct intel_mipmap_tree *mt)
 }
 
 static unsigned
-horizontal_alignment(const struct intel_mipmap_tree *mt)
+horizontal_alignment(const struct brw_context *brw,
+                     const struct intel_mipmap_tree *mt,
+                     uint32_t surf_type)
 {
+   /* On Gen9+ horizontal alignment is ignored when tr_mode is not
+    * TRMODE_NONE.
+    */
+   if (brw->gen > 8 &&
+       (mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE ||
+        gen9_use_linear_1d_layout(brw, mt)))
+      return 0;
+
    switch (mt->align_w) {
    case 4:
       return GEN8_SURFACE_HALIGN_4;
@@ -100,11 +133,11 @@ horizontal_alignment(const struct intel_mipmap_tree *mt)
 }
 
 static uint32_t *
-allocate_surface_state(struct brw_context *brw, uint32_t *out_offset)
+allocate_surface_state(struct brw_context *brw, uint32_t *out_offset, int index)
 {
    int dwords = brw->gen >= 9 ? 16 : 13;
-   uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE,
-                                    dwords * 4, 64, out_offset);
+   uint32_t *surf = __brw_state_batch(brw, AUB_TRACE_SURFACE_STATE,
+                                      dwords * 4, 64, index, out_offset);
    memset(surf, 0, dwords * 4);
    return surf;
 }
@@ -120,7 +153,7 @@ gen8_emit_buffer_surface_state(struct brw_context *brw,
                                bool rw)
 {
    const unsigned mocs = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
-   uint32_t *surf = allocate_surface_state(brw, out_offset);
+   uint32_t *surf = allocate_surface_state(brw, out_offset, -1);
 
    surf[0] = BRW_SURFACE_BUFFER << BRW_SURFACE_TYPE_SHIFT |
              surface_format << BRW_SURFACE_FORMAT_SHIFT |
@@ -164,7 +197,9 @@ gen8_emit_texture_surface_state(struct brw_context *brw,
    struct intel_mipmap_tree *aux_mt = NULL;
    uint32_t aux_mode = 0;
    uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
+   int surf_index = surf_offset - &brw->wm.base.surf_offset[0];
    unsigned tiling_mode, pitch;
+   const unsigned tr_mode = surface_tiling_resource_mode(mt->tr_mode);
 
    if (mt->format == MESA_FORMAT_S_UINT8) {
       tiling_mode = GEN8_SURFACE_TILING_W;
@@ -177,18 +212,29 @@ gen8_emit_texture_surface_state(struct brw_context *brw,
    if (mt->mcs_mt) {
       aux_mt = mt->mcs_mt;
       aux_mode = GEN8_SURFACE_AUX_MODE_MCS;
+
+      /*
+       * From the BDW PRM, Volume 2d, page 260 (RENDER_SURFACE_STATE):
+       * "When MCS is enabled for non-MSRT, HALIGN_16 must be used"
+       *
+       * From the hardware spec for GEN9:
+       * "When Auxiliary Surface Mode is set to AUX_CCS_D or AUX_CCS_E, HALIGN
+       *  16 must be used."
+       */
+      assert(brw->gen < 9 || mt->align_w == 16);
+      assert(brw->gen < 8 || mt->num_samples > 1 || mt->align_w == 16);
    }
 
-   uint32_t *surf = allocate_surface_state(brw, surf_offset);
+   const uint32_t surf_type = translate_tex_target(target);
+   uint32_t *surf = allocate_surface_state(brw, surf_offset, surf_index);
 
-   surf[0] = translate_tex_target(target) << BRW_SURFACE_TYPE_SHIFT |
+   surf[0] = SET_FIELD(surf_type, BRW_SURFACE_TYPE) |
              format << BRW_SURFACE_FORMAT_SHIFT |
-             vertical_alignment(mt) |
-             horizontal_alignment(mt) |
+             vertical_alignment(brw, mt, surf_type) |
+             horizontal_alignment(brw, mt, surf_type) |
              tiling_mode;
 
-   if (target == GL_TEXTURE_CUBE_MAP ||
-       target == GL_TEXTURE_CUBE_MAP_ARRAY) {
+   if (surf_type == BRW_SURFACE_CUBE) {
       surf[0] |= BRW_SURFACE_CUBEFACE_ENABLES;
    }
 
@@ -209,6 +255,12 @@ gen8_emit_texture_surface_state(struct brw_context *brw,
    surf[5] = SET_FIELD(min_level - mt->first_level, GEN7_SURFACE_MIN_LOD) |
              (max_level - min_level - 1); /* mip count */
 
+   if (brw->gen >= 9) {
+      surf[5] |= SET_FIELD(tr_mode, GEN9_SURFACE_TRMODE);
+      /* Disable Mip Tail by setting a large value. */
+      surf[5] |= SET_FIELD(15, GEN9_SURFACE_MIP_TAIL_START_LOD);
+   }
+
    if (aux_mt) {
       surf[6] = SET_FIELD(mt->qpitch / 4, GEN8_SURFACE_AUX_QPITCH) |
                 SET_FIELD((aux_mt->pitch / 128) - 1, GEN8_SURFACE_AUX_PITCH) |
@@ -310,7 +362,7 @@ gen8_emit_null_surface_state(struct brw_context *brw,
                              unsigned samples,
                              uint32_t *out_offset)
 {
-   uint32_t *surf = allocate_surface_state(brw, out_offset);
+   uint32_t *surf = allocate_surface_state(brw, out_offset, -1);
 
    surf[0] = BRW_SURFACE_NULL << BRW_SURFACE_TYPE_SHIFT |
              BRW_SURFACEFORMAT_B8G8R8A8_UNORM << BRW_SURFACE_FORMAT_SHIFT |
@@ -339,6 +391,7 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
    unsigned height = mt->logical_height0;
    unsigned pitch = mt->pitch;
    uint32_t tiling = mt->tiling;
+   unsigned tr_mode = surface_tiling_resource_mode(mt->tr_mode);
    uint32_t format = 0;
    uint32_t surf_type;
    uint32_t offset;
@@ -390,15 +443,26 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
    if (mt->mcs_mt) {
       aux_mt = mt->mcs_mt;
       aux_mode = GEN8_SURFACE_AUX_MODE_MCS;
+
+      /*
+       * From the BDW PRM, Volume 2d, page 260 (RENDER_SURFACE_STATE):
+       * "When MCS is enabled for non-MSRT, HALIGN_16 must be used"
+       *
+       * From the hardware spec for GEN9:
+       * "When Auxiliary Surface Mode is set to AUX_CCS_D or AUX_CCS_E, HALIGN
+       *  16 must be used."
+       */
+      assert(brw->gen < 9 || mt->align_w == 16);
+      assert(brw->gen < 8 || mt->num_samples > 1 || mt->align_w == 16);
    }
 
-   uint32_t *surf = allocate_surface_state(brw, &offset);
+   uint32_t *surf = allocate_surface_state(brw, &offset, surf_index);
 
    surf[0] = (surf_type << BRW_SURFACE_TYPE_SHIFT) |
              (is_array ? GEN7_SURFACE_IS_ARRAY : 0) |
              (format << BRW_SURFACE_FORMAT_SHIFT) |
-             vertical_alignment(mt) |
-             horizontal_alignment(mt) |
+             vertical_alignment(brw, mt, surf_type) |
+             horizontal_alignment(brw, mt, surf_type) |
              surface_tiling_mode(tiling);
 
    surf[1] = SET_FIELD(mocs, GEN8_SURFACE_MOCS) | mt->qpitch >> 2;
@@ -417,6 +481,12 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
 
    surf[5] = irb->mt_level - irb->mt->first_level;
 
+   if (brw->gen >= 9) {
+      surf[5] |= SET_FIELD(tr_mode, GEN9_SURFACE_TRMODE);
+      /* Disable Mip Tail by setting a large value. */
+      surf[5] |= SET_FIELD(15, GEN9_SURFACE_MIP_TAIL_START_LOD);
+   }
+
    if (aux_mt) {
       surf[6] = SET_FIELD(mt->qpitch / 4, GEN8_SURFACE_AUX_QPITCH) |
                 SET_FIELD((aux_mt->pitch / 128) - 1, GEN8_SURFACE_AUX_PITCH) |
diff --git a/src/mesa/drivers/dri/i965/gen8_viewport_state.c b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
index 322e4663b99..2d8eeb1f10f 100644
--- a/src/mesa/drivers/dri/i965/gen8_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
@@ -26,6 +26,7 @@
 #include "brw_defines.h"
 #include "intel_batchbuffer.h"
 #include "main/fbobject.h"
+#include "main/framebuffer.h"
 #include "main/viewport.h"
 
 static void
@@ -33,6 +34,7 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    float y_scale, y_bias;
+   const float fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
 
    float *vp = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
@@ -47,7 +49,7 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw)
       y_bias = 0;
    } else {
       y_scale = -1.0;
-      y_bias = ctx->DrawBuffer->Height;
+      y_bias = fb_height;
    }
 
    for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
@@ -116,8 +118,8 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw)
       } else {
          vp[12] = ctx->ViewportArray[i].X;
          vp[13] = viewport_Xmax - 1;
-         vp[14] = ctx->DrawBuffer->Height - viewport_Ymax;
-         vp[15] = ctx->DrawBuffer->Height - ctx->ViewportArray[i].Y - 1;
+         vp[14] = fb_height - viewport_Ymax;
+         vp[15] = fb_height - ctx->ViewportArray[i].Y - 1;
       }
 
       vp += 16;
diff --git a/src/mesa/drivers/dri/i965/gen8_vs_state.c b/src/mesa/drivers/dri/i965/gen8_vs_state.c
index f92af55e37f..28f5adddf14 100644
--- a/src/mesa/drivers/dri/i965/gen8_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_vs_state.c
@@ -39,6 +39,9 @@ upload_vs_state(struct brw_context *brw)
    /* BRW_NEW_VS_PROG_DATA */
    const struct brw_vue_prog_data *prog_data = &brw->vs.prog_data->base;
 
+   assert(prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ||
+          prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT);
+
    if (prog_data->base.use_alt_mode)
       floating_point_mode = GEN6_VS_FLOATING_POINT_MODE_ALT;
 
@@ -66,7 +69,8 @@ upload_vs_state(struct brw_context *brw)
              (prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |
              (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT));
 
-   uint32_t simd8_enable = prog_data->simd8 ? GEN8_VS_SIMD8_ENABLE : 0;
+   uint32_t simd8_enable = prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ?
+      GEN8_VS_SIMD8_ENABLE : 0;
    OUT_BATCH(((brw->max_vs_threads - 1) << HSW_VS_MAX_THREADS_SHIFT) |
              GEN6_VS_STATISTICS_ENABLE |
              simd8_enable |
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index e522e4e9c1d..ed659ed625e 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -743,27 +743,54 @@ intel_batchbuffer_emit_mi_flush(struct brw_context *brw)
    brw_render_cache_set_clear(brw);
 }
 
-void
-brw_load_register_mem(struct brw_context *brw,
-                      uint32_t reg,
-                      drm_intel_bo *bo,
-                      uint32_t read_domains, uint32_t write_domain,
-                      uint32_t offset)
+static void
+load_sized_register_mem(struct brw_context *brw,
+                        uint32_t reg,
+                        drm_intel_bo *bo,
+                        uint32_t read_domains, uint32_t write_domain,
+                        uint32_t offset,
+                        int size)
 {
+   int i;
+
    /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
    assert(brw->gen >= 7);
 
    if (brw->gen >= 8) {
-      BEGIN_BATCH(4);
-      OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
-      OUT_BATCH(reg);
-      OUT_RELOC64(bo, read_domains, write_domain, offset);
+      BEGIN_BATCH(4 * size);
+      for (i = 0; i < size; i++) {
+         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
+         OUT_BATCH(reg + i * 4);
+         OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
+      }
       ADVANCE_BATCH();
    } else {
-      BEGIN_BATCH(3);
-      OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
-      OUT_BATCH(reg);
-      OUT_RELOC(bo, read_domains, write_domain, offset);
+      BEGIN_BATCH(3 * size);
+      for (i = 0; i < size; i++) {
+         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
+         OUT_BATCH(reg + i * 4);
+         OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
+      }
       ADVANCE_BATCH();
    }
 }
+
+void
+brw_load_register_mem(struct brw_context *brw,
+                      uint32_t reg,
+                      drm_intel_bo *bo,
+                      uint32_t read_domains, uint32_t write_domain,
+                      uint32_t offset)
+{
+   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
+}
+
+void
+brw_load_register_mem64(struct brw_context *brw,
+                        uint32_t reg,
+                        drm_intel_bo *bo,
+                        uint32_t read_domains, uint32_t write_domain,
+                        uint32_t offset)
+{
+   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
+}
diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index 7680a402975..d3ab769356c 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -77,13 +77,10 @@ br13_for_cpp(int cpp)
    switch (cpp) {
    case 4:
       return BR13_8888;
-      break;
    case 2:
       return BR13_565;
-      break;
    case 1:
       return BR13_8;
-      break;
    default:
       unreachable("not reached");
    }
@@ -130,6 +127,40 @@ set_blitter_tiling(struct brw_context *brw,
       ADVANCE_BATCH();                                                  \
    } while (0)
 
+static int
+blt_pitch(struct intel_mipmap_tree *mt)
+{
+   int pitch = mt->pitch;
+   if (mt->tiling)
+      pitch /= 4;
+   return pitch;
+}
+
+bool
+intel_miptree_blit_compatible_formats(mesa_format src, mesa_format dst)
+{
+   /* The BLT doesn't handle sRGB conversion */
+   assert(src == _mesa_get_srgb_format_linear(src));
+   assert(dst == _mesa_get_srgb_format_linear(dst));
+
+   /* No swizzle or format conversions possible, except... */
+   if (src == dst)
+      return true;
+
+   /* ...we can either discard the alpha channel when going from A->X,
+    * or we can fill the alpha channel with 0xff when going from X->A
+    */
+   if (src == MESA_FORMAT_B8G8R8A8_UNORM || src == MESA_FORMAT_B8G8R8X8_UNORM)
+      return (dst == MESA_FORMAT_B8G8R8A8_UNORM ||
+              dst == MESA_FORMAT_B8G8R8X8_UNORM);
+
+   if (src == MESA_FORMAT_R8G8B8A8_UNORM || src == MESA_FORMAT_R8G8B8X8_UNORM)
+      return (dst == MESA_FORMAT_R8G8B8A8_UNORM ||
+              dst == MESA_FORMAT_R8G8B8X8_UNORM);
+
+   return false;
+}
+
 /**
  * Implements a rectangular block transfer (blit) of pixels between two
  * miptrees.
@@ -172,11 +203,7 @@ intel_miptree_blit(struct brw_context *brw,
     * the X channel don't matter), and XRGB8888 to ARGB8888 by setting the A
     * channel to 1.0 at the end.
     */
-   if (src_format != dst_format &&
-      ((src_format != MESA_FORMAT_B8G8R8A8_UNORM &&
-        src_format != MESA_FORMAT_B8G8R8X8_UNORM) ||
-       (dst_format != MESA_FORMAT_B8G8R8A8_UNORM &&
-        dst_format != MESA_FORMAT_B8G8R8X8_UNORM))) {
+   if (!intel_miptree_blit_compatible_formats(src_format, dst_format)) {
       perf_debug("%s: Can't use hardware blitter from %s to %s, "
                  "falling back.\n", __func__,
                  _mesa_get_format_name(src_format),
@@ -197,14 +224,14 @@ intel_miptree_blit(struct brw_context *brw,
     *
     * Furthermore, intelEmitCopyBlit (which is called below) uses a signed
     * 16-bit integer to represent buffer pitch, so it can only handle buffer
-    * pitches < 32k.
+    * pitches < 32k. However, the pitch is measured in bytes for linear buffers
+    * and dwords for tiled buffers.
     *
     * As a result of these two limitations, we can only use the blitter to do
-    * this copy when the miptree's pitch is less than 32k.
+    * this copy when the miptree's pitch is less than 32k linear or 128k tiled.
     */
-   if (src_mt->pitch >= 32768 ||
-       dst_mt->pitch >= 32768) {
-      perf_debug("Falling back due to >=32k pitch\n");
+   if (blt_pitch(src_mt) >= 32768 || blt_pitch(dst_mt) >= 32768) {
+      perf_debug("Falling back due to >= 32k/128k pitch\n");
       return false;
    }
 
@@ -261,8 +288,9 @@ intel_miptree_blit(struct brw_context *brw,
       return false;
    }
 
-   if (src_mt->format == MESA_FORMAT_B8G8R8X8_UNORM &&
-       dst_mt->format == MESA_FORMAT_B8G8R8A8_UNORM) {
+   /* XXX This could be done in a single pass using XY_FULL_MONO_PATTERN_BLT */
+   if (_mesa_get_format_bits(src_format, GL_ALPHA_BITS) == 0 &&
+       _mesa_get_format_bits(dst_format, GL_ALPHA_BITS) > 0) {
       intel_miptree_set_alpha_to_one(brw, dst_mt,
                                      dst_x, dst_y,
                                      width, height);
diff --git a/src/mesa/drivers/dri/i965/intel_blit.h b/src/mesa/drivers/dri/i965/intel_blit.h
index f563939fdd9..2287c379c4e 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.h
+++ b/src/mesa/drivers/dri/i965/intel_blit.h
@@ -46,6 +46,8 @@ intelEmitCopyBlit(struct brw_context *brw,
                               GLshort w, GLshort h,
 			      GLenum logicop );
 
+bool intel_miptree_blit_compatible_formats(mesa_format src, mesa_format dst);
+
 bool intel_miptree_blit(struct brw_context *brw,
                         struct intel_mipmap_tree *src_mt,
                         int src_level, int src_slice,
diff --git a/src/mesa/drivers/dri/i965/intel_debug.c b/src/mesa/drivers/dri/i965/intel_debug.c
index 33a0348486d..75cf7854eff 100644
--- a/src/mesa/drivers/dri/i965/intel_debug.c
+++ b/src/mesa/drivers/dri/i965/intel_debug.c
@@ -88,25 +88,22 @@ intel_debug_flag_for_shader_stage(gl_shader_stage stage)
 }
 
 void
-brw_process_intel_debug_variable(struct brw_context *brw)
+brw_process_intel_debug_variable(struct intel_screen *screen)
 {
    uint64_t intel_debug = driParseDebugString(getenv("INTEL_DEBUG"), debug_control);
    (void) p_atomic_cmpxchg(&INTEL_DEBUG, 0, intel_debug);
 
    if (INTEL_DEBUG & DEBUG_BUFMGR)
-      dri_bufmgr_set_debug(brw->bufmgr, true);
+      dri_bufmgr_set_debug(screen->bufmgr, true);
 
-   if ((INTEL_DEBUG & DEBUG_SHADER_TIME) && brw->gen < 7) {
+   if ((INTEL_DEBUG & DEBUG_SHADER_TIME) && screen->devinfo->gen < 7) {
       fprintf(stderr,
               "shader_time debugging requires gen7 (Ivybridge) or better.\n");
       INTEL_DEBUG &= ~DEBUG_SHADER_TIME;
    }
 
-   if (INTEL_DEBUG & DEBUG_PERF)
-      brw->perf_debug = true;
-
    if (INTEL_DEBUG & DEBUG_AUB)
-      drm_intel_bufmgr_gem_set_aub_dump(brw->bufmgr, true);
+      drm_intel_bufmgr_gem_set_aub_dump(screen->bufmgr, true);
 }
 
 /**
diff --git a/src/mesa/drivers/dri/i965/intel_debug.h b/src/mesa/drivers/dri/i965/intel_debug.h
index f754be20b1d..4689492e1fd 100644
--- a/src/mesa/drivers/dri/i965/intel_debug.h
+++ b/src/mesa/drivers/dri/i965/intel_debug.h
@@ -114,8 +114,8 @@ extern uint64_t INTEL_DEBUG;
 
 extern uint64_t intel_debug_flag_for_shader_stage(gl_shader_stage stage);
 
-struct brw_context;
+struct intel_screen;
 
-extern void brw_process_intel_debug_variable(struct brw_context *brw);
+extern void brw_process_intel_debug_variable(struct intel_screen *);
 
 extern bool brw_env_var_as_boolean(const char *var_name, bool default_value);
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index d6da34c7065..c99677c7197 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -323,9 +323,12 @@ intelInitExtensions(struct gl_context *ctx)
       }
    }
 
+   brw->predicate.supported = false;
+
    if (brw->gen >= 7) {
       ctx->Extensions.ARB_conservative_depth = true;
       ctx->Extensions.ARB_derivative_control = true;
+      ctx->Extensions.ARB_framebuffer_no_attachments = true;
       ctx->Extensions.ARB_gpu_shader5 = true;
       ctx->Extensions.ARB_shader_atomic_counters = true;
       ctx->Extensions.ARB_texture_compression_bptc = true;
@@ -337,6 +340,9 @@ intelInitExtensions(struct gl_context *ctx)
          ctx->Extensions.ARB_transform_feedback2 = true;
          ctx->Extensions.ARB_transform_feedback3 = true;
          ctx->Extensions.ARB_transform_feedback_instanced = true;
+
+         if (brw->intelScreen->cmd_parser_version >= 2)
+            brw->predicate.supported = true;
       }
 
       /* Only enable this in core profile because other parts of Mesa behave
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index aebed723f75..1b3a72f3ec2 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -390,7 +390,7 @@ intel_image_target_renderbuffer_storage(struct gl_context *ctx,
                                          image->height,
                                          1,
                                          image->pitch,
-                                         true /*disable_aux_buffers*/);
+                                         MIPTREE_LAYOUT_DISABLE_AUX);
    if (!irb->mt)
       return;
 
@@ -1027,10 +1027,9 @@ intel_renderbuffer_move_to_temp(struct brw_context *brw,
                                  intel_image->base.Base.Level,
                                  intel_image->base.Base.Level,
                                  width, height, depth,
-                                 true,
                                  irb->mt->num_samples,
                                  INTEL_MIPTREE_TILING_ANY,
-                                 false);
+                                 MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
 
    if (intel_miptree_wants_hiz_buffer(brw, new_mt)) {
       intel_miptree_alloc_hiz(brw, new_mt);
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 24a5c3dc666..6aa969a4930 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -158,15 +158,32 @@ intel_get_non_msrt_mcs_alignment(struct brw_context *brw,
    }
 }
 
+bool
+intel_tiling_supports_non_msrt_mcs(struct brw_context *brw, unsigned tiling)
+{
+   /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
+    * Target(s)", beneath the "Fast Color Clear" bullet (p326):
+    *
+    *     - Support is limited to tiled render targets.
+    *
+    * Gen9 changes the restriction to Y-tile only.
+    */
+   if (brw->gen >= 9)
+      return tiling == I915_TILING_Y;
+   else if (brw->gen >= 7)
+      return tiling != I915_TILING_NONE;
+   else
+      return false;
+}
 
 /**
  * For a single-sampled render target ("non-MSRT"), determine if an MCS buffer
- * can be used.
+ * can be used. This doesn't (and should not) inspect any of the properties of
+ * the miptree's BO.
  *
  * From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render Target(s)",
  * beneath the "Fast Color Clear" bullet (p326):
  *
- *     - Support is limited to tiled render targets.
  *     - Support is for non-mip-mapped and non-array surface types only.
  *
  * And then later, on p327:
@@ -175,8 +192,8 @@ intel_get_non_msrt_mcs_alignment(struct brw_context *brw,
  *       64bpp, and 128bpp.
  */
 bool
-intel_is_non_msrt_mcs_buffer_supported(struct brw_context *brw,
-                                       struct intel_mipmap_tree *mt)
+intel_miptree_is_fast_clear_capable(struct brw_context *brw,
+                                    struct intel_mipmap_tree *mt)
 {
    /* MCS support does not exist prior to Gen7 */
    if (brw->gen < 7)
@@ -193,15 +210,25 @@ intel_is_non_msrt_mcs_buffer_supported(struct brw_context *brw,
       return false;
    }
 
-   if (mt->tiling != I915_TILING_X &&
-       mt->tiling != I915_TILING_Y)
-      return false;
    if (mt->cpp != 4 && mt->cpp != 8 && mt->cpp != 16)
       return false;
-   if (mt->first_level != 0 || mt->last_level != 0)
+   if (mt->first_level != 0 || mt->last_level != 0) {
+      if (brw->gen >= 8) {
+         perf_debug("Multi-LOD fast clear - giving up (%dx%dx%d).\n",
+                    mt->logical_width0, mt->logical_height0, mt->last_level);
+      }
+
       return false;
-   if (mt->physical_depth0 != 1)
+   }
+   if (mt->physical_depth0 != 1) {
+      if (brw->gen >= 8) {
+         perf_debug("Layered fast clear - giving up. (%dx%d%d)\n",
+                    mt->logical_width0, mt->logical_height0,
+                    mt->physical_depth0);
+      }
+
       return false;
+   }
 
    /* There's no point in using an MCS buffer if the surface isn't in a
     * renderable format.
@@ -244,10 +271,9 @@ intel_miptree_create_layout(struct brw_context *brw,
                             GLuint width0,
                             GLuint height0,
                             GLuint depth0,
-                            bool for_bo,
                             GLuint num_samples,
-                            bool force_all_slices_at_each_lod,
-                            bool disable_aux_buffers)
+                            enum intel_miptree_tiling_mode requested,
+                            uint32_t layout_flags)
 {
    struct intel_mipmap_tree *mt = calloc(sizeof(*mt), 1);
    if (!mt)
@@ -286,7 +312,7 @@ intel_miptree_create_layout(struct brw_context *brw,
    mt->logical_height0 = height0;
    mt->logical_depth0 = depth0;
    mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_NO_MCS;
-   mt->disable_aux_buffers = disable_aux_buffers;
+   mt->disable_aux_buffers = (layout_flags & MIPTREE_LAYOUT_DISABLE_AUX) != 0;
    exec_list_make_empty(&mt->hiz_map);
 
    /* The cpp is bytes per (1, blockheight)-sized block for compressed
@@ -422,12 +448,15 @@ intel_miptree_create_layout(struct brw_context *brw,
    mt->physical_height0 = height0;
    mt->physical_depth0 = depth0;
 
-   if (!for_bo &&
+   if (!(layout_flags & MIPTREE_LAYOUT_FOR_BO) &&
        _mesa_get_format_base_format(format) == GL_DEPTH_STENCIL &&
        (brw->must_use_separate_stencil ||
 	(brw->has_separate_stencil &&
          intel_miptree_wants_hiz_buffer(brw, mt)))) {
-      const bool force_all_slices_at_each_lod = brw->gen == 6;
+      uint32_t stencil_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD;
+      if (brw->gen == 6)
+         stencil_flags |= MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD;
+
       mt->stencil_mt = intel_miptree_create(brw,
                                             mt->target,
                                             MESA_FORMAT_S_UINT8,
@@ -436,10 +465,10 @@ intel_miptree_create_layout(struct brw_context *brw,
                                             mt->logical_width0,
                                             mt->logical_height0,
                                             mt->logical_depth0,
-                                            true,
                                             num_samples,
                                             INTEL_MIPTREE_TILING_ANY,
-                                            force_all_slices_at_each_lod);
+                                            stencil_flags);
+
       if (!mt->stencil_mt) {
 	 intel_miptree_release(&mt);
 	 return NULL;
@@ -457,119 +486,36 @@ intel_miptree_create_layout(struct brw_context *brw,
       }
    }
 
-   if (force_all_slices_at_each_lod)
+   if (layout_flags & MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD)
       mt->array_layout = ALL_SLICES_AT_EACH_LOD;
 
-   brw_miptree_layout(brw, mt);
-
-   if (mt->disable_aux_buffers)
-      assert(mt->msaa_layout != INTEL_MSAA_LAYOUT_CMS);
-
-   return mt;
-}
-
-/**
- * \brief Helper function for intel_miptree_create().
- */
-static uint32_t
-intel_miptree_choose_tiling(struct brw_context *brw,
-                            mesa_format format,
-                            uint32_t width0,
-                            uint32_t num_samples,
-                            enum intel_miptree_tiling_mode requested,
-                            struct intel_mipmap_tree *mt)
-{
-   if (format == MESA_FORMAT_S_UINT8) {
-      /* The stencil buffer is W tiled. However, we request from the kernel a
-       * non-tiled buffer because the GTT is incapable of W fencing.
-       */
-      return I915_TILING_NONE;
-   }
-
-   /* Some usages may want only one type of tiling, like depth miptrees (Y
-    * tiled), or temporary BOs for uploading data once (linear).
-    */
-   switch (requested) {
-   case INTEL_MIPTREE_TILING_ANY:
-      break;
-   case INTEL_MIPTREE_TILING_Y:
-      return I915_TILING_Y;
-   case INTEL_MIPTREE_TILING_NONE:
-      return I915_TILING_NONE;
-   }
-
-   if (num_samples > 1) {
-      /* From p82 of the Sandy Bridge PRM, dw3[1] of SURFACE_STATE ("Tiled
-       * Surface"):
-       *
-       *   [DevSNB+]: For multi-sample render targets, this field must be
-       *   1. MSRTs can only be tiled.
-       *
-       * Our usual reason for preferring X tiling (fast blits using the
-       * blitting engine) doesn't apply to MSAA, since we'll generally be
-       * downsampling or upsampling when blitting between the MSAA buffer
-       * and another buffer, and the blitting engine doesn't support that.
-       * So use Y tiling, since it makes better use of the cache.
-       */
-      return I915_TILING_Y;
-   }
-
-   GLenum base_format = _mesa_get_format_base_format(format);
-   if (base_format == GL_DEPTH_COMPONENT ||
-       base_format == GL_DEPTH_STENCIL_EXT)
-      return I915_TILING_Y;
-
-   /* 1D textures (and 1D array textures) don't get any benefit from tiling,
-    * in fact it leads to a less efficient use of memory space and bandwidth
-    * due to tile alignment.
+   /*
+    * Obey HALIGN_16 constraints for Gen8 and Gen9 buffers which are
+    * multisampled or have an AUX buffer attached to it.
+    *
+    * GEN  |    MSRT        | AUX_CCS_* or AUX_MCS
+    *  -------------------------------------------
+    *  9   |  HALIGN_16     |    HALIGN_16
+    *  8   |  HALIGN_ANY    |    HALIGN_16
+    *  7   |      ?         |        ?
+    *  6   |      ?         |        ?
     */
-   if (mt->logical_height0 == 1)
-      return I915_TILING_NONE;
-
-   int minimum_pitch = mt->total_width * mt->cpp;
-
-   /* If the width is much smaller than a tile, don't bother tiling. */
-   if (minimum_pitch < 64)
-      return I915_TILING_NONE;
-
-   if (ALIGN(minimum_pitch, 512) >= 32768 ||
-       mt->total_width >= 32768 || mt->total_height >= 32768) {
-      perf_debug("%dx%d miptree too large to blit, falling back to untiled",
-                 mt->total_width, mt->total_height);
-      return I915_TILING_NONE;
+   if (intel_miptree_is_fast_clear_capable(brw, mt)) {
+      if (brw->gen >= 9 || (brw->gen == 8 && num_samples <= 1))
+         layout_flags |= MIPTREE_LAYOUT_FORCE_HALIGN16;
+   } else if (brw->gen >= 9 && num_samples > 1) {
+      layout_flags |= MIPTREE_LAYOUT_FORCE_HALIGN16;
+   } else {
+      /* For now, nothing else has this requirement */
+      assert((layout_flags & MIPTREE_LAYOUT_FORCE_HALIGN16) == 0);
    }
 
-   /* Pre-gen6 doesn't have BLORP to handle Y-tiling, so use X-tiling. */
-   if (brw->gen < 6)
-      return I915_TILING_X;
+   brw_miptree_layout(brw, mt, requested, layout_flags);
 
-   /* From the Sandybridge PRM, Volume 1, Part 2, page 32:
-    * "NOTE: 128BPE Format Color Buffer ( render target ) MUST be either TileX
-    *  or Linear."
-    * 128 bits per pixel translates to 16 bytes per pixel. This is necessary
-    * all the way back to 965, but is permitted on Gen7+.
-    */
-   if (brw->gen < 7 && mt->cpp >= 16)
-      return I915_TILING_X;
-
-   /* From the Ivy Bridge PRM, Vol4 Part1 2.12.2.1 (SURFACE_STATE for most
-    * messages), on p64, under the heading "Surface Vertical Alignment":
-    *
-    *     This field must be set to VALIGN_4 for all tiled Y Render Target
-    *     surfaces.
-    *
-    * So if the surface is renderable and uses a vertical alignment of 2,
-    * force it to be X tiled.  This is somewhat conservative (it's possible
-    * that the client won't ever render to this surface), but it's difficult
-    * to know that ahead of time.  And besides, since we use a vertical
-    * alignment of 4 as often as we can, this shouldn't happen very often.
-    */
-   if (brw->gen == 7 && mt->align_h == 2 &&
-       brw->format_supported_as_render_target[format]) {
-      return I915_TILING_X;
-   }
+   if (mt->disable_aux_buffers)
+      assert(mt->msaa_layout != INTEL_MSAA_LAYOUT_CMS);
 
-   return I915_TILING_Y | I915_TILING_X;
+   return mt;
 }
 
 
@@ -615,33 +561,33 @@ intel_lower_compressed_format(struct brw_context *brw, mesa_format format)
 
 struct intel_mipmap_tree *
 intel_miptree_create(struct brw_context *brw,
-		     GLenum target,
-		     mesa_format format,
-		     GLuint first_level,
-		     GLuint last_level,
-		     GLuint width0,
-		     GLuint height0,
-		     GLuint depth0,
-		     bool expect_accelerated_upload,
+                     GLenum target,
+                     mesa_format format,
+                     GLuint first_level,
+                     GLuint last_level,
+                     GLuint width0,
+                     GLuint height0,
+                     GLuint depth0,
                      GLuint num_samples,
                      enum intel_miptree_tiling_mode requested_tiling,
-                     bool force_all_slices_at_each_lod)
+                     uint32_t layout_flags)
 {
    struct intel_mipmap_tree *mt;
    mesa_format tex_format = format;
    mesa_format etc_format = MESA_FORMAT_NONE;
    GLuint total_width, total_height;
+   uint32_t alloc_flags = 0;
 
    format = intel_lower_compressed_format(brw, format);
 
    etc_format = (format != tex_format) ? tex_format : MESA_FORMAT_NONE;
 
+   assert((layout_flags & MIPTREE_LAYOUT_DISABLE_AUX) == 0);
+   assert((layout_flags & MIPTREE_LAYOUT_FOR_BO) == 0);
    mt = intel_miptree_create_layout(brw, target, format,
-				      first_level, last_level, width0,
-				      height0, depth0,
-                                    false, num_samples,
-                                    force_all_slices_at_each_lod,
-                                    false /*disable_aux_buffers*/);
+                                    first_level, last_level, width0,
+                                    height0, depth0, num_samples,
+                                    requested_tiling, layout_flags);
    /*
     * pitch == 0 || height == 0  indicates the null texture
     */
@@ -659,25 +605,21 @@ intel_miptree_create(struct brw_context *brw,
       total_height = ALIGN(total_height, 64);
    }
 
-   uint32_t tiling = intel_miptree_choose_tiling(brw, format, width0,
-                                                 num_samples, requested_tiling,
-                                                 mt);
    bool y_or_x = false;
 
-   if (tiling == (I915_TILING_Y | I915_TILING_X)) {
+   if (mt->tiling == (I915_TILING_Y | I915_TILING_X)) {
       y_or_x = true;
       mt->tiling = I915_TILING_Y;
-   } else {
-      mt->tiling = tiling;
    }
 
+   if (layout_flags & MIPTREE_LAYOUT_ACCELERATED_UPLOAD)
+      alloc_flags |= BO_ALLOC_FOR_RENDER;
+
    unsigned long pitch;
+   mt->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "miptree", total_width,
+                                     total_height, mt->cpp, &mt->tiling,
+                                     &pitch, alloc_flags);
    mt->etc_format = etc_format;
-   mt->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "miptree",
-                                     total_width, total_height, mt->cpp,
-                                     &mt->tiling, &pitch,
-                                     (expect_accelerated_upload ?
-                                      BO_ALLOC_FOR_RENDER : 0));
    mt->pitch = pitch;
 
    /* If the BO is too large to fit in the aperture, we need to use the
@@ -691,10 +633,8 @@ intel_miptree_create(struct brw_context *brw,
       mt->tiling = I915_TILING_X;
       drm_intel_bo_unreference(mt->bo);
       mt->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "miptree",
-                                        total_width, total_height, mt->cpp,
-                                        &mt->tiling, &pitch,
-                                        (expect_accelerated_upload ?
-                                         BO_ALLOC_FOR_RENDER : 0));
+                                  total_width, total_height, mt->cpp,
+                                  &mt->tiling, &pitch, alloc_flags);
       mt->pitch = pitch;
    }
 
@@ -707,6 +647,7 @@ intel_miptree_create(struct brw_context *brw,
 
 
    if (mt->msaa_layout == INTEL_MSAA_LAYOUT_CMS) {
+      assert(mt->num_samples > 1);
       if (!intel_miptree_alloc_mcs(brw, mt, num_samples)) {
          intel_miptree_release(&mt);
          return NULL;
@@ -718,8 +659,11 @@ intel_miptree_create(struct brw_context *brw,
     * Allocation of the MCS miptree will be deferred until the first fast
     * clear actually occurs.
     */
-   if (intel_is_non_msrt_mcs_buffer_supported(brw, mt))
+   if (intel_tiling_supports_non_msrt_mcs(brw, mt->tiling) &&
+       intel_miptree_is_fast_clear_capable(brw, mt)) {
       mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_RESOLVED;
+      assert(brw->gen < 8 || mt->align_w == 16 || num_samples <= 1);
+   }
 
    return mt;
 }
@@ -733,7 +677,7 @@ intel_miptree_create_for_bo(struct brw_context *brw,
                             uint32_t height,
                             uint32_t depth,
                             int pitch,
-                            bool disable_aux_buffers)
+                            uint32_t layout_flags)
 {
    struct intel_mipmap_tree *mt;
    uint32_t tiling, swizzle;
@@ -754,11 +698,18 @@ intel_miptree_create_for_bo(struct brw_context *brw,
 
    target = depth > 1 ? GL_TEXTURE_2D_ARRAY : GL_TEXTURE_2D;
 
+   /* 'requested' parameter of intel_miptree_create_layout() is relevant
+    * only for non bo miptree. Tiling for bo is already computed above.
+    * So, the tiling requested (INTEL_MIPTREE_TILING_ANY) below is
+    * just a place holder and will not make any change to the miptree
+    * tiling format.
+    */
+   layout_flags |= MIPTREE_LAYOUT_FOR_BO;
    mt = intel_miptree_create_layout(brw, target, format,
                                     0, 0,
-                                    width, height, depth,
-                                    true, 0, false,
-                                    disable_aux_buffers);
+                                    width, height, depth, 0,
+                                    INTEL_MIPTREE_TILING_ANY,
+                                    layout_flags);
    if (!mt)
       return NULL;
 
@@ -808,7 +759,7 @@ intel_update_winsys_renderbuffer_miptree(struct brw_context *intel,
                                                  height,
                                                  1,
                                                  pitch,
-                                                 false);
+                                                 0);
    if (!singlesample_mt)
       goto fail;
 
@@ -817,7 +768,8 @@ intel_update_winsys_renderbuffer_miptree(struct brw_context *intel,
     * Allocation of the MCS miptree will be deferred until the first fast
     * clear actually occurs.
     */
-   if (intel_is_non_msrt_mcs_buffer_supported(intel, singlesample_mt))
+   if (intel_tiling_supports_non_msrt_mcs(intel, singlesample_mt->tiling) &&
+       intel_miptree_is_fast_clear_capable(intel, singlesample_mt))
       singlesample_mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_RESOLVED;
 
    if (num_samples == 0) {
@@ -866,8 +818,9 @@ intel_miptree_create_for_renderbuffer(struct brw_context *brw,
    GLenum target = num_samples > 1 ? GL_TEXTURE_2D_MULTISAMPLE : GL_TEXTURE_2D;
 
    mt = intel_miptree_create(brw, target, format, 0, 0,
-			     width, height, depth, true, num_samples,
-                             INTEL_MIPTREE_TILING_ANY, false);
+                             width, height, depth, num_samples,
+                             INTEL_MIPTREE_TILING_ANY,
+                             MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
    if (!mt)
       goto fail;
 
@@ -1258,8 +1211,10 @@ intel_miptree_copy_slice(struct brw_context *brw,
    assert(src_mt->format == dst_mt->format);
 
    if (dst_mt->compressed) {
-      height = ALIGN(height, dst_mt->align_h) / dst_mt->align_h;
-      width = ALIGN(width, dst_mt->align_w);
+      unsigned int i, j;
+      _mesa_get_format_block_size(dst_mt->format, &i, &j);
+      height = ALIGN(height, j) / j;
+      width = ALIGN(width, i);
    }
 
    /* If it's a packed depth/stencil buffer with separate stencil, the blit
@@ -1378,10 +1333,9 @@ intel_miptree_alloc_mcs(struct brw_context *brw,
                                      mt->logical_width0,
                                      mt->logical_height0,
                                      mt->logical_depth0,
-                                     true,
                                      0 /* num_samples */,
                                      INTEL_MIPTREE_TILING_Y,
-                                     false);
+                                     MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
 
    /* From the Ivy Bridge PRM, Vol 2 Part 1 p326:
     *
@@ -1429,6 +1383,9 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
    unsigned mcs_height =
       ALIGN(mt->logical_height0, height_divisor) / height_divisor;
    assert(mt->logical_depth0 == 1);
+   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD;
+   if (brw->gen >= 8)
+      layout_flags |= MIPTREE_LAYOUT_FORCE_HALIGN16;
    mt->mcs_mt = intel_miptree_create(brw,
                                      mt->target,
                                      format,
@@ -1437,10 +1394,9 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
                                      mcs_width,
                                      mcs_height,
                                      mt->logical_depth0,
-                                     true,
                                      0 /* num_samples */,
                                      INTEL_MIPTREE_TILING_Y,
-                                     false);
+                                     layout_flags);
 
    return mt->mcs_mt;
 }
@@ -1682,7 +1638,10 @@ intel_hiz_miptree_buf_create(struct brw_context *brw,
                              struct intel_mipmap_tree *mt)
 {
    struct intel_miptree_aux_buffer *buf = calloc(sizeof(*buf), 1);
-   const bool force_all_slices_at_each_lod = brw->gen == 6;
+   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD;
+
+   if (brw->gen == 6)
+      layout_flags |= MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD;
 
    if (!buf)
       return NULL;
@@ -1695,10 +1654,9 @@ intel_hiz_miptree_buf_create(struct brw_context *brw,
                                   mt->logical_width0,
                                   mt->logical_height0,
                                   mt->logical_depth0,
-                                  true,
                                   mt->num_samples,
                                   INTEL_MIPTREE_TILING_ANY,
-                                  force_all_slices_at_each_lod);
+                                  layout_flags);
    if (!buf->mt) {
       free(buf);
       return NULL;
@@ -2128,9 +2086,8 @@ intel_miptree_map_blit(struct brw_context *brw,
    map->mt = intel_miptree_create(brw, GL_TEXTURE_2D, mt->format,
                                   0, 0,
                                   map->w, map->h, 1,
-                                  false, 0,
-                                  INTEL_MIPTREE_TILING_NONE,
-                                  false);
+                                  0, INTEL_MIPTREE_TILING_NONE, 0);
+
    if (!map->mt) {
       fprintf(stderr, "Failed to allocate blit temporary\n");
       goto fail;
@@ -2675,7 +2632,9 @@ intel_miptree_map(struct brw_context *brw,
    } else if (use_intel_mipree_map_blit(brw, mt, mode, level, slice)) {
       intel_miptree_map_blit(brw, mt, map, level, slice);
 #if defined(USE_SSE41)
-   } else if (!(mode & GL_MAP_WRITE_BIT) && !mt->compressed && cpu_has_sse4_1) {
+   } else if (!(mode & GL_MAP_WRITE_BIT) &&
+              !mt->compressed && cpu_has_sse4_1 &&
+              (mt->pitch % 16 == 0)) {
       intel_miptree_map_movntdqa(brw, mt, map, level, slice);
 #endif
    } else {
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
index 8b42e4adb79..bde6daa4e2d 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
@@ -330,6 +330,13 @@ struct intel_miptree_aux_buffer
    struct intel_mipmap_tree *mt; /**< hiz miptree used with Gen6 */
 };
 
+/* Tile resource modes */
+enum intel_miptree_tr_mode {
+   INTEL_MIPTREE_TRMODE_NONE,
+   INTEL_MIPTREE_TRMODE_YF,
+   INTEL_MIPTREE_TRMODE_YS
+};
+
 struct intel_mipmap_tree
 {
    /** Buffer object containing the pixel data. */
@@ -338,6 +345,7 @@ struct intel_mipmap_tree
    uint32_t pitch; /**< pitch in bytes. */
 
    uint32_t tiling; /**< One of the I915_TILING_* flags */
+   enum intel_miptree_tr_mode tr_mode;
 
    /* Effectively the key:
     */
@@ -514,19 +522,27 @@ enum intel_miptree_tiling_mode {
    INTEL_MIPTREE_TILING_NONE,
 };
 
-bool
-intel_is_non_msrt_mcs_buffer_supported(struct brw_context *brw,
-                                       struct intel_mipmap_tree *mt);
-
 void
 intel_get_non_msrt_mcs_alignment(struct brw_context *brw,
                                  struct intel_mipmap_tree *mt,
                                  unsigned *width_px, unsigned *height);
-
+bool
+intel_tiling_supports_non_msrt_mcs(struct brw_context *brw, unsigned tiling);
+bool
+intel_miptree_is_fast_clear_capable(struct brw_context *brw,
+                                    struct intel_mipmap_tree *mt);
 bool
 intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
                                  struct intel_mipmap_tree *mt);
 
+enum {
+   MIPTREE_LAYOUT_ACCELERATED_UPLOAD       = 1 << 0,
+   MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD   = 1 << 1,
+   MIPTREE_LAYOUT_FOR_BO                   = 1 << 2,
+   MIPTREE_LAYOUT_DISABLE_AUX              = 1 << 3,
+   MIPTREE_LAYOUT_FORCE_HALIGN16           = 1 << 4,
+};
+
 struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
                                                GLenum target,
 					       mesa_format format,
@@ -535,10 +551,9 @@ struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
                                                GLuint width0,
                                                GLuint height0,
                                                GLuint depth0,
-					       bool expect_accelerated_upload,
                                                GLuint num_samples,
                                                enum intel_miptree_tiling_mode,
-                                               bool force_all_slices_at_each_lod);
+                                               uint32_t flags);
 
 struct intel_mipmap_tree *
 intel_miptree_create_for_bo(struct brw_context *brw,
@@ -549,7 +564,7 @@ intel_miptree_create_for_bo(struct brw_context *brw,
                             uint32_t height,
                             uint32_t depth,
                             int pitch,
-                            bool disable_aux_buffers);
+                            uint32_t layout_flags);
 
 void
 intel_update_winsys_renderbuffer_miptree(struct brw_context *intel,
@@ -753,7 +768,11 @@ brw_miptree_get_vertical_slice_pitch(const struct brw_context *brw,
                                      const struct intel_mipmap_tree *mt,
                                      unsigned level);
 
-void brw_miptree_layout(struct brw_context *brw, struct intel_mipmap_tree *mt);
+void
+brw_miptree_layout(struct brw_context *brw,
+                   struct intel_mipmap_tree *mt,
+                   enum intel_miptree_tiling_mode requested,
+                   uint32_t layout_flags);
 
 void *intel_miptree_map_raw(struct brw_context *brw,
                             struct intel_mipmap_tree *mt);
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_draw.c b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
index 4ecefc8cf54..6c6bd8629ac 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_draw.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
@@ -28,6 +28,7 @@
 #include "main/glheader.h"
 #include "main/enums.h"
 #include "main/image.h"
+#include "main/glformats.h"
 #include "main/mtypes.h"
 #include "main/condrender.h"
 #include "main/fbobject.h"
@@ -76,8 +77,16 @@ do_blit_drawpixels(struct gl_context * ctx,
    struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);
 
-   if (!_mesa_format_matches_format_and_type(irb->mt->format, format, type,
-                                             false)) {
+   mesa_format src_format = _mesa_format_from_format_and_type(format, type);
+   if (_mesa_format_is_mesa_array_format(src_format))
+      src_format = _mesa_format_from_array_format(src_format);
+   mesa_format dst_format = irb->mt->format;
+
+   /* We can safely discard sRGB encode/decode for the DrawPixels interface */
+   src_format = _mesa_get_srgb_format_linear(src_format);
+   dst_format = _mesa_get_srgb_format_linear(dst_format);
+
+   if (!intel_miptree_blit_compatible_formats(src_format, dst_format)) {
       DBG("%s: bad format for blit\n", __func__);
       return false;
    }
@@ -112,7 +121,7 @@ do_blit_drawpixels(struct gl_context * ctx,
                                   src_offset,
                                   width, height, 1,
                                   src_stride,
-                                  false /*disable_aux_buffers*/);
+                                  0);
    if (!pbo_mt)
       return false;
 
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c
index d3ca38b6ecd..30380570d62 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
@@ -226,8 +226,30 @@ intelReadPixels(struct gl_context * ctx,
 
    if (_mesa_is_bufferobj(pack->BufferObj)) {
       if (_mesa_meta_pbo_GetTexSubImage(ctx, 2, NULL, x, y, 0, width, height, 1,
-                                        format, type, pixels, pack))
+                                        format, type, pixels, pack)) {
+         /* _mesa_meta_pbo_GetTexSubImage() implements PBO transfers by
+          * binding the user-provided BO as a fake framebuffer and rendering
+          * to it.  This breaks the invariant of the GL that nothing is able
+          * to render to a BO, causing nondeterministic corruption issues
+          * because the render cache is not coherent with a number of other
+          * caches that the BO could potentially be bound to afterwards.
+          *
+          * This could be solved in the same way that we guarantee texture
+          * coherency after a texture is attached to a framebuffer and
+          * rendered to, but that would involve checking *all* BOs bound to
+          * the pipeline for the case we need to emit a cache flush due to
+          * previous rendering to any of them -- Including vertex, index,
+          * uniform, atomic counter, shader image, transform feedback,
+          * indirect draw buffers, etc.
+          *
+          * That would increase the per-draw call overhead even though it's
+          * very unlikely that any of the BOs bound to the pipeline has been
+          * rendered to via a PBO at any point, so it seems better to just
+          * flush here unconditionally.
+          */
+         intel_batchbuffer_emit_mi_flush(brw);
          return;
+      }
 
       perf_debug("%s: fallback to CPU mapping in PBO case\n", __func__);
    }
diff --git a/src/mesa/drivers/dri/i965/intel_reg.h b/src/mesa/drivers/dri/i965/intel_reg.h
index 488fb5b98f8..bd14e189da3 100644
--- a/src/mesa/drivers/dri/i965/intel_reg.h
+++ b/src/mesa/drivers/dri/i965/intel_reg.h
@@ -48,6 +48,20 @@
 #define GEN7_MI_LOAD_REGISTER_MEM	(CMD_MI | (0x29 << 23))
 # define MI_LOAD_REGISTER_MEM_USE_GGTT		(1 << 22)
 
+/* Manipulate the predicate bit based on some register values. Only on Gen7+ */
+#define GEN7_MI_PREDICATE		(CMD_MI | (0xC << 23))
+# define MI_PREDICATE_LOADOP_KEEP		(0 << 6)
+# define MI_PREDICATE_LOADOP_LOAD		(2 << 6)
+# define MI_PREDICATE_LOADOP_LOADINV		(3 << 6)
+# define MI_PREDICATE_COMBINEOP_SET		(0 << 3)
+# define MI_PREDICATE_COMBINEOP_AND		(1 << 3)
+# define MI_PREDICATE_COMBINEOP_OR		(2 << 3)
+# define MI_PREDICATE_COMBINEOP_XOR		(3 << 3)
+# define MI_PREDICATE_COMPAREOP_TRUE		(0 << 0)
+# define MI_PREDICATE_COMPAREOP_FALSE		(1 << 0)
+# define MI_PREDICATE_COMPAREOP_SRCS_EQUAL	(2 << 0)
+# define MI_PREDICATE_COMPAREOP_DELTAS_EQUAL	(3 << 0)
+
 /** @{
  *
  * PIPE_CONTROL operation, a combination MI_FLUSH and register write with
@@ -69,6 +83,7 @@
 #define PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE	(1 << 10) /* GM45+ only */
 #define PIPE_CONTROL_ISP_DIS		(1 << 9)
 #define PIPE_CONTROL_INTERRUPT_ENABLE	(1 << 8)
+#define PIPE_CONTROL_FLUSH_ENABLE	(1 << 7) /* Gen7+ only */
 /* GT */
 #define PIPE_CONTROL_DATA_CACHE_INVALIDATE	(1 << 5)
 #define PIPE_CONTROL_VF_CACHE_INVALIDATE	(1 << 4)
@@ -147,3 +162,11 @@
 # define GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC (1 << 1)
 # define GEN8_HIZ_PMA_MASK_BITS \
    ((GEN8_HIZ_NP_PMA_FIX_ENABLE | GEN8_HIZ_NP_EARLY_Z_FAILS_DISABLE) << 16)
+
+/* Predicate registers */
+#define MI_PREDICATE_SRC0               0x2400
+#define MI_PREDICATE_SRC1               0x2408
+#define MI_PREDICATE_DATA               0x2410
+#define MI_PREDICATE_RESULT             0x2418
+#define MI_PREDICATE_RESULT_1           0x241C
+#define MI_PREDICATE_RESULT_2           0x2214
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index 4860a160ee9..de14696bd76 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -39,6 +39,7 @@
 #include "swrast/s_renderbuffer.h"
 #include "util/ralloc.h"
 #include "brw_shader.h"
+#include "glsl/nir/nir.h"
 
 #include "utils.h"
 #include "xmlpool.h"
@@ -1372,6 +1373,8 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
    if (!intelScreen->devinfo)
       return false;
 
+   brw_process_intel_debug_variable(intelScreen);
+
    intelScreen->hw_must_use_separate_stencil = intelScreen->devinfo->gen >= 7;
 
    intelScreen->hw_has_swizzling = intel_detect_swizzling(intelScreen);
@@ -1407,6 +1410,13 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
          (ret != -1 || errno != EINVAL);
    }
 
+   struct drm_i915_getparam getparam;
+   getparam.param = I915_PARAM_CMD_PARSER_VERSION;
+   getparam.value = &intelScreen->cmd_parser_version;
+   const int ret = drmIoctl(psp->fd, DRM_IOCTL_I915_GETPARAM, &getparam);
+   if (ret == -1)
+      intelScreen->cmd_parser_version = 0;
+
    psp->extensions = !intelScreen->has_context_reset_notification
       ? intelScreenExtensions : intelRobustScreenExtensions;
 
diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h
index e7a14903d6e..742b3d30eee 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.h
+++ b/src/mesa/drivers/dri/i965/intel_screen.h
@@ -72,7 +72,13 @@ struct intel_screen
    * Configuration cache with default values for all contexts
    */
    driOptionCache optionCache;
-};
+
+   /**
+    * Version of the command parser reported by the
+    * I915_PARAM_CMD_PARSER_VERSION parameter
+    */
+   int cmd_parser_version;
+ };
 
 extern void intelDestroyContext(__DRIcontext * driContextPriv);
 
diff --git a/src/mesa/drivers/dri/i965/intel_tex.c b/src/mesa/drivers/dri/i965/intel_tex.c
index 777a682ad21..b0181ad1d75 100644
--- a/src/mesa/drivers/dri/i965/intel_tex.c
+++ b/src/mesa/drivers/dri/i965/intel_tex.c
@@ -93,7 +93,7 @@ intel_alloc_texture_image_buffer(struct gl_context *ctx,
    } else {
       intel_image->mt = intel_miptree_create_for_teximage(brw, intel_texobj,
                                                           intel_image,
-                                                          false);
+                                                          0);
 
       /* Even if the object currently has a mipmap tree associated
        * with it, this one is a more likely candidate to represent the
@@ -144,10 +144,8 @@ intel_alloc_texture_storage(struct gl_context *ctx,
                                               first_image->TexFormat,
                                               0, levels - 1,
                                               width, height, depth,
-                                              false, /* expect_accelerated */
                                               num_samples,
-                                              INTEL_MIPTREE_TILING_ANY,
-                                              false);
+                                              INTEL_MIPTREE_TILING_ANY, 0);
 
       if (intel_texobj->mt == NULL) {
          return false;
@@ -341,7 +339,7 @@ intel_set_texture_storage_for_buffer_object(struct gl_context *ctx,
                                   buffer_offset,
                                   image->Width, image->Height, image->Depth,
                                   row_stride,
-                                  false /*disable_aux_buffers*/);
+                                  0);
    if (!intel_texobj->mt)
       return false;
 
diff --git a/src/mesa/drivers/dri/i965/intel_tex.h b/src/mesa/drivers/dri/i965/intel_tex.h
index f048e846d55..402a3891ecd 100644
--- a/src/mesa/drivers/dri/i965/intel_tex.h
+++ b/src/mesa/drivers/dri/i965/intel_tex.h
@@ -53,7 +53,7 @@ struct intel_mipmap_tree *
 intel_miptree_create_for_teximage(struct brw_context *brw,
 				  struct intel_texture_object *intelObj,
 				  struct intel_texture_image *intelImage,
-				  bool expect_accelerated_upload);
+                                  uint32_t layout_flags);
 
 GLuint intel_finalize_mipmap_tree(struct brw_context *brw, GLuint unit);
 
diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
index 7952ee5ad88..ebe84b664d4 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -36,7 +36,7 @@ struct intel_mipmap_tree *
 intel_miptree_create_for_teximage(struct brw_context *brw,
 				  struct intel_texture_object *intelObj,
 				  struct intel_texture_image *intelImage,
-				  bool expect_accelerated_upload)
+                                  uint32_t layout_flags)
 {
    GLuint lastLevel;
    int width, height, depth;
@@ -79,10 +79,9 @@ intel_miptree_create_for_teximage(struct brw_context *brw,
 			       width,
 			       height,
 			       depth,
-			       expect_accelerated_upload,
                                intelImage->base.Base.NumSamples,
                                INTEL_MIPTREE_TILING_ANY,
-                               false);
+                               layout_flags);
 }
 
 static void
@@ -155,7 +154,7 @@ intel_set_texture_image_bo(struct gl_context *ctx,
                            GLuint width, GLuint height,
                            GLuint pitch,
                            GLuint tile_x, GLuint tile_y,
-                           bool disable_aux_buffers)
+                           uint32_t layout_flags)
 {
    struct brw_context *brw = brw_context(ctx);
    struct intel_texture_image *intel_image = intel_texture_image(image);
@@ -171,7 +170,7 @@ intel_set_texture_image_bo(struct gl_context *ctx,
 
    intel_image->mt = intel_miptree_create_for_bo(brw, bo, image->TexFormat,
                                                  0, width, height, 1, pitch,
-                                                 disable_aux_buffers);
+                                                 layout_flags);
    if (intel_image->mt == NULL)
        return;
    intel_image->mt->target = target;
@@ -255,8 +254,7 @@ intelSetTexBuffer2(__DRIcontext *pDRICtx, GLint target,
                               rb->Base.Base.Width,
                               rb->Base.Base.Height,
                               rb->mt->pitch,
-                              0, 0,
-                              false /*disable_aux_buffers*/);
+                              0, 0, 0);
    _mesa_unlock_texture(&brw->ctx, texObj);
 }
 
@@ -349,7 +347,7 @@ intel_image_target_texture_2d(struct gl_context *ctx, GLenum target,
                               image->width,  image->height,
                               image->pitch,
                               image->tile_x, image->tile_y,
-                              true /*disable_aux_buffers*/);
+                              MIPTREE_LAYOUT_DISABLE_AUX);
 }
 
 /**
@@ -486,8 +484,15 @@ intel_get_tex_image(struct gl_context *ctx,
       if (_mesa_meta_pbo_GetTexSubImage(ctx, 3, texImage, 0, 0, 0,
                                         texImage->Width, texImage->Height,
                                         texImage->Depth, format, type,
-                                        pixels, &ctx->Pack))
+                                        pixels, &ctx->Pack)) {
+         /* Flush to guarantee coherency between the render cache and other
+          * caches the PBO could potentially be bound to after this point.
+          * See the related comment in intelReadPixels() for a more detailed
+          * explanation.
+          */
+         intel_batchbuffer_emit_mi_flush(brw);
          return;
+      }
 
       perf_debug("%s: fallback to CPU mapping in PBO case\n", __func__);
    }
diff --git a/src/mesa/drivers/dri/i965/intel_tex_validate.c b/src/mesa/drivers/dri/i965/intel_tex_validate.c
index 1d827683b99..4991c2997ef 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_validate.c
@@ -47,8 +47,10 @@ intel_update_max_level(struct intel_texture_object *intelObj,
 {
    struct gl_texture_object *tObj = &intelObj->base;
 
-   if (sampler->MinFilter == GL_NEAREST ||
-       sampler->MinFilter == GL_LINEAR) {
+   if (!tObj->_MipmapComplete ||
+       (tObj->_RenderToTexture &&
+        (sampler->MinFilter == GL_NEAREST ||
+         sampler->MinFilter == GL_LINEAR))) {
       intelObj->_MaxLevel = tObj->BaseLevel;
    } else {
       intelObj->_MaxLevel = tObj->_MaxLevel;
@@ -142,10 +144,9 @@ intel_finalize_mipmap_tree(struct brw_context *brw, GLuint unit)
                                           width,
                                           height,
                                           depth,
-					  true,
                                           0 /* num_samples */,
                                           INTEL_MIPTREE_TILING_ANY,
-                                          false);
+                                          MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
       if (!intelObj->mt)
          return false;
    }
diff --git a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
index 206a76e9242..8010fb4f610 100644
--- a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
@@ -26,11 +26,13 @@
 #include "brw_cfg.h"
 #include "program/program.h"
 
+using namespace brw;
+
 class cmod_propagation_test : public ::testing::Test {
    virtual void SetUp();
 
 public:
-   struct brw_context *brw;
+   struct brw_compiler *compiler;
    struct brw_device_info *devinfo;
    struct gl_context *ctx;
    struct brw_wm_prog_data *prog_data;
@@ -42,30 +44,31 @@ public:
 class cmod_propagation_fs_visitor : public fs_visitor
 {
 public:
-   cmod_propagation_fs_visitor(struct brw_context *brw,
+   cmod_propagation_fs_visitor(struct brw_compiler *compiler,
                                struct brw_wm_prog_data *prog_data,
                                struct gl_shader_program *shader_prog)
-      : fs_visitor(brw, NULL, NULL, prog_data, shader_prog, NULL, 8) {}
+      : fs_visitor(compiler, NULL, NULL, MESA_SHADER_FRAGMENT, NULL,
+                   &prog_data->base, shader_prog,
+                   (struct gl_program *) NULL, 8, -1) {}
 };
 
 
 void cmod_propagation_test::SetUp()
 {
-   brw = (struct brw_context *)calloc(1, sizeof(*brw));
-   devinfo = (struct brw_device_info *)calloc(1, sizeof(*brw));
-   brw->intelScreen = (struct intel_screen *)calloc(1, sizeof(*brw->intelScreen));
-   brw->intelScreen->devinfo = devinfo;
-   ctx = &brw->ctx;
+   ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+   compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+   devinfo = (struct brw_device_info *)calloc(1, sizeof(*devinfo));
+   compiler->devinfo = devinfo;
 
    fp = ralloc(NULL, struct brw_fragment_program);
    prog_data = ralloc(NULL, struct brw_wm_prog_data);
    shader_prog = ralloc(NULL, struct gl_shader_program);
 
-   v = new cmod_propagation_fs_visitor(brw, prog_data, shader_prog);
+   v = new cmod_propagation_fs_visitor(compiler, prog_data, shader_prog);
 
    _mesa_init_fragment_program(ctx, &fp->program, GL_FRAGMENT_SHADER, 0);
 
-   brw->gen = devinfo->gen = 4;
+   devinfo->gen = 4;
 }
 
 static fs_inst *
@@ -100,13 +103,13 @@ cmod_propagation(fs_visitor *v)
 
 TEST_F(cmod_propagation_test, basic)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg zero(0.0f);
-   v->emit(BRW_OPCODE_ADD, dest, src0, src1);
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, dest, zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   bld.ADD(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -132,13 +135,13 @@ TEST_F(cmod_propagation_test, basic)
 
 TEST_F(cmod_propagation_test, cmp_nonzero)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg nonzero(1.0f);
-   v->emit(BRW_OPCODE_ADD, dest, src0, src1);
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, dest, nonzero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   bld.ADD(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), dest, nonzero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -165,12 +168,12 @@ TEST_F(cmod_propagation_test, cmp_nonzero)
 
 TEST_F(cmod_propagation_test, non_cmod_instruction)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::uint_type);
    fs_reg src0 = v->vgrf(glsl_type::uint_type);
    fs_reg zero(0u);
-   v->emit(BRW_OPCODE_FBL, dest, src0);
-   v->emit(BRW_OPCODE_CMP, v->reg_null_ud, dest, zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   bld.FBL(dest, src0);
+   bld.CMP(bld.null_reg_ud(), dest, zero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -197,16 +200,15 @@ TEST_F(cmod_propagation_test, non_cmod_instruction)
 
 TEST_F(cmod_propagation_test, intervening_flag_write)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg src2 = v->vgrf(glsl_type::float_type);
    fs_reg zero(0.0f);
-   v->emit(BRW_OPCODE_ADD, dest, src0, src1);
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, src2, zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, dest, zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   bld.ADD(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), src2, zero, BRW_CONDITIONAL_GE);
+   bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -236,17 +238,16 @@ TEST_F(cmod_propagation_test, intervening_flag_write)
 
 TEST_F(cmod_propagation_test, intervening_flag_read)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest0 = v->vgrf(glsl_type::float_type);
    fs_reg dest1 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg src2 = v->vgrf(glsl_type::float_type);
    fs_reg zero(0.0f);
-   v->emit(BRW_OPCODE_ADD, dest0, src0, src1);
-   v->emit(BRW_OPCODE_SEL, dest1, src2, zero)
-      ->predicate = BRW_PREDICATE_NORMAL;
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, dest0, zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   bld.ADD(dest0, src0, src1);
+   set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero));
+   bld.CMP(bld.null_reg_f(), dest0, zero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -276,16 +277,16 @@ TEST_F(cmod_propagation_test, intervening_flag_read)
 
 TEST_F(cmod_propagation_test, intervening_dest_write)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::vec4_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg src2 = v->vgrf(glsl_type::vec2_type);
    fs_reg zero(0.0f);
-   v->emit(BRW_OPCODE_ADD, offset(dest, 2), src0, src1);
-   v->emit(SHADER_OPCODE_TEX, dest, src2)
+   bld.ADD(offset(dest, 2), src0, src1);
+   bld.emit(SHADER_OPCODE_TEX, dest, src2)
       ->regs_written = 4;
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, offset(dest, 2), zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   bld.CMP(bld.null_reg_f(), offset(dest, 2), zero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -316,18 +317,16 @@ TEST_F(cmod_propagation_test, intervening_dest_write)
 
 TEST_F(cmod_propagation_test, intervening_flag_read_same_value)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest0 = v->vgrf(glsl_type::float_type);
    fs_reg dest1 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg src2 = v->vgrf(glsl_type::float_type);
    fs_reg zero(0.0f);
-   v->emit(BRW_OPCODE_ADD, dest0, src0, src1)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
-   v->emit(BRW_OPCODE_SEL, dest1, src2, zero)
-      ->predicate = BRW_PREDICATE_NORMAL;
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, dest0, zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   set_condmod(BRW_CONDITIONAL_GE, bld.ADD(dest0, src0, src1));
+   set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero));
+   bld.CMP(bld.null_reg_f(), dest0, zero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -357,14 +356,14 @@ TEST_F(cmod_propagation_test, intervening_flag_read_same_value)
 
 TEST_F(cmod_propagation_test, negate)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg zero(0.0f);
-   v->emit(BRW_OPCODE_ADD, dest, src0, src1);
+   bld.ADD(dest, src0, src1);
    dest.negate = true;
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, dest, zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -390,13 +389,13 @@ TEST_F(cmod_propagation_test, negate)
 
 TEST_F(cmod_propagation_test, movnz)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_CMP, dest, src0, src1)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
-   v->emit(BRW_OPCODE_MOV, v->reg_null_f, dest)
-      ->conditional_mod = BRW_CONDITIONAL_NZ;
+   bld.CMP(dest, src0, src1, BRW_CONDITIONAL_GE);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               bld.MOV(bld.null_reg_f(), dest));
 
    /* = Before =
     *
@@ -422,14 +421,14 @@ TEST_F(cmod_propagation_test, movnz)
 
 TEST_F(cmod_propagation_test, different_types_cmod_with_zero)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::int_type);
    fs_reg src0 = v->vgrf(glsl_type::int_type);
    fs_reg src1 = v->vgrf(glsl_type::int_type);
    fs_reg zero(0.0f);
-   v->emit(BRW_OPCODE_ADD, dest, src0, src1);
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, retype(dest, BRW_REGISTER_TYPE_F),
-                                          zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   bld.ADD(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), retype(dest, BRW_REGISTER_TYPE_F), zero,
+           BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -456,15 +455,15 @@ TEST_F(cmod_propagation_test, different_types_cmod_with_zero)
 
 TEST_F(cmod_propagation_test, andnz_one)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::int_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg zero(0.0f);
    fs_reg one(1);
 
-   v->emit(BRW_OPCODE_CMP, retype(dest, BRW_REGISTER_TYPE_F), src0, zero)
-      ->conditional_mod = BRW_CONDITIONAL_L;
-   v->emit(BRW_OPCODE_AND, v->reg_null_d, dest, one)
-      ->conditional_mod = BRW_CONDITIONAL_NZ;
+   bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               bld.AND(bld.null_reg_d(), dest, one));
 
    /* = Before =
     * 0: cmp.l.f0(8)     dest:F  src0:F  0F
@@ -491,15 +490,15 @@ TEST_F(cmod_propagation_test, andnz_one)
 
 TEST_F(cmod_propagation_test, andnz_non_one)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::int_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg zero(0.0f);
    fs_reg nonone(38);
 
-   v->emit(BRW_OPCODE_CMP, retype(dest, BRW_REGISTER_TYPE_F), src0, zero)
-      ->conditional_mod = BRW_CONDITIONAL_L;
-   v->emit(BRW_OPCODE_AND, v->reg_null_d, dest, nonone)
-      ->conditional_mod = BRW_CONDITIONAL_NZ;
+   bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               bld.AND(bld.null_reg_d(), dest, nonone));
 
    /* = Before =
     * 0: cmp.l.f0(8)     dest:F  src0:F  0F
@@ -526,15 +525,15 @@ TEST_F(cmod_propagation_test, andnz_non_one)
 
 TEST_F(cmod_propagation_test, andz_one)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::int_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg zero(0.0f);
    fs_reg one(1);
 
-   v->emit(BRW_OPCODE_CMP, retype(dest, BRW_REGISTER_TYPE_F), src0, zero)
-      ->conditional_mod = BRW_CONDITIONAL_L;
-   v->emit(BRW_OPCODE_AND, v->reg_null_d, dest, one)
-      ->conditional_mod = BRW_CONDITIONAL_Z;
+   bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_Z,
+               bld.AND(bld.null_reg_d(), dest, one));
 
    /* = Before =
     * 0: cmp.l.f0(8)     dest:F  src0:F  0F
diff --git a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
index 4c91af3ea8d..3ef0cb319eb 100644
--- a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
@@ -26,11 +26,13 @@
 #include "brw_cfg.h"
 #include "program/program.h"
 
+using namespace brw;
+
 class saturate_propagation_test : public ::testing::Test {
    virtual void SetUp();
 
 public:
-   struct brw_context *brw;
+   struct brw_compiler *compiler;
    struct brw_device_info *devinfo;
    struct gl_context *ctx;
    struct brw_wm_prog_data *prog_data;
@@ -42,30 +44,31 @@ public:
 class saturate_propagation_fs_visitor : public fs_visitor
 {
 public:
-   saturate_propagation_fs_visitor(struct brw_context *brw,
+   saturate_propagation_fs_visitor(struct brw_compiler *compiler,
                                    struct brw_wm_prog_data *prog_data,
                                    struct gl_shader_program *shader_prog)
-      : fs_visitor(brw, NULL, NULL, prog_data, shader_prog, NULL, 8) {}
+      : fs_visitor(compiler, NULL, NULL, MESA_SHADER_FRAGMENT, NULL,
+                   &prog_data->base, shader_prog,
+                   (struct gl_program *) NULL, 8, -1) {}
 };
 
 
 void saturate_propagation_test::SetUp()
 {
-   brw = (struct brw_context *)calloc(1, sizeof(*brw));
-   devinfo = (struct brw_device_info *)calloc(1, sizeof(*brw));
-   brw->intelScreen = (struct intel_screen *)calloc(1, sizeof(*brw->intelScreen));
-   brw->intelScreen->devinfo = devinfo;
-   ctx = &brw->ctx;
+   ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+   compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+   devinfo = (struct brw_device_info *)calloc(1, sizeof(*devinfo));
+   compiler->devinfo = devinfo;
 
    fp = ralloc(NULL, struct brw_fragment_program);
    prog_data = ralloc(NULL, struct brw_wm_prog_data);
    shader_prog = ralloc(NULL, struct gl_shader_program);
 
-   v = new saturate_propagation_fs_visitor(brw, prog_data, shader_prog);
+   v = new saturate_propagation_fs_visitor(compiler, prog_data, shader_prog);
 
    _mesa_init_fragment_program(ctx, &fp->program, GL_FRAGMENT_SHADER, 0);
 
-   brw->gen = devinfo->gen = 4;
+   devinfo->gen = 4;
 }
 
 static fs_inst *
@@ -100,13 +103,13 @@ saturate_propagation(fs_visitor *v)
 
 TEST_F(saturate_propagation_test, basic)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::float_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_ADD, dst0, src0, src1);
-   v->emit(BRW_OPCODE_MOV, dst1, dst0)
-      ->saturate = true;
+   bld.ADD(dst0, src0, src1);
+   set_saturate(true, bld.MOV(dst1, dst0));
 
    /* = Before =
     *
@@ -135,15 +138,15 @@ TEST_F(saturate_propagation_test, basic)
 
 TEST_F(saturate_propagation_test, other_non_saturated_use)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::float_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg dst2 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_ADD, dst0, src0, src1);
-   v->emit(BRW_OPCODE_MOV, dst1, dst0)
-      ->saturate = true;
-   v->emit(BRW_OPCODE_ADD, dst2, dst0, src0);
+   bld.ADD(dst0, src0, src1);
+   set_saturate(true, bld.MOV(dst1, dst0));
+   bld.ADD(dst2, dst0, src0);
 
    /* = Before =
     *
@@ -173,14 +176,14 @@ TEST_F(saturate_propagation_test, other_non_saturated_use)
 
 TEST_F(saturate_propagation_test, predicated_instruction)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::float_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_ADD, dst0, src0, src1)
+   bld.ADD(dst0, src0, src1)
       ->predicate = BRW_PREDICATE_NORMAL;
-   v->emit(BRW_OPCODE_MOV, dst1, dst0)
-      ->saturate = true;
+   set_saturate(true, bld.MOV(dst1, dst0));
 
    /* = Before =
     *
@@ -208,14 +211,14 @@ TEST_F(saturate_propagation_test, predicated_instruction)
 
 TEST_F(saturate_propagation_test, neg_mov_sat)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::float_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_ADD, dst0, src0, src1);
+   bld.ADD(dst0, src0, src1);
    dst0.negate = true;
-   v->emit(BRW_OPCODE_MOV, dst1, dst0)
-      ->saturate = true;
+   set_saturate(true, bld.MOV(dst1, dst0));
 
    /* = Before =
     *
@@ -243,14 +246,14 @@ TEST_F(saturate_propagation_test, neg_mov_sat)
 
 TEST_F(saturate_propagation_test, abs_mov_sat)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::float_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_ADD, dst0, src0, src1);
+   bld.ADD(dst0, src0, src1);
    dst0.abs = true;
-   v->emit(BRW_OPCODE_MOV, dst1, dst0)
-      ->saturate = true;
+   set_saturate(true, bld.MOV(dst1, dst0));
 
    /* = Before =
     *
@@ -278,16 +281,15 @@ TEST_F(saturate_propagation_test, abs_mov_sat)
 
 TEST_F(saturate_propagation_test, producer_saturates)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::float_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg dst2 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_ADD, dst0, src0, src1)
-      ->saturate = true;
-   v->emit(BRW_OPCODE_MOV, dst1, dst0)
-      ->saturate = true;
-   v->emit(BRW_OPCODE_MOV, dst2, dst0);
+   set_saturate(true, bld.ADD(dst0, src0, src1));
+   set_saturate(true, bld.MOV(dst1, dst0));
+   bld.MOV(dst2, dst0);
 
    /* = Before =
     *
@@ -318,16 +320,15 @@ TEST_F(saturate_propagation_test, producer_saturates)
 
 TEST_F(saturate_propagation_test, intervening_saturating_copy)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::float_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg dst2 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_ADD, dst0, src0, src1);
-   v->emit(BRW_OPCODE_MOV, dst1, dst0)
-      ->saturate = true;
-   v->emit(BRW_OPCODE_MOV, dst2, dst0)
-      ->saturate = true;
+   bld.ADD(dst0, src0, src1);
+   set_saturate(true, bld.MOV(dst1, dst0));
+   set_saturate(true, bld.MOV(dst2, dst0));
 
    /* = Before =
     *
@@ -360,16 +361,16 @@ TEST_F(saturate_propagation_test, intervening_saturating_copy)
 
 TEST_F(saturate_propagation_test, intervening_dest_write)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::vec4_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg src2 = v->vgrf(glsl_type::vec2_type);
-   v->emit(BRW_OPCODE_ADD, offset(dst0, 2), src0, src1);
-   v->emit(SHADER_OPCODE_TEX, dst0, src2)
+   bld.ADD(offset(dst0, 2), src0, src1);
+   bld.emit(SHADER_OPCODE_TEX, dst0, src2)
       ->regs_written = 4;
-   v->emit(BRW_OPCODE_MOV, dst1, offset(dst0, 2))
-      ->saturate = true;
+   set_saturate(true, bld.MOV(dst1, offset(dst0, 2)));
 
    /* = Before =
     *
@@ -400,18 +401,17 @@ TEST_F(saturate_propagation_test, intervening_dest_write)
 
 TEST_F(saturate_propagation_test, mul_neg_mov_sat_mov_sat)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::float_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg dst2 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_MUL, dst0, src0, src1);
+   bld.MUL(dst0, src0, src1);
    dst0.negate = true;
-   v->emit(BRW_OPCODE_MOV, dst1, dst0)
-      ->saturate = true;
+   set_saturate(true, bld.MOV(dst1, dst0));
    dst0.negate = false;
-   v->emit(BRW_OPCODE_MOV, dst2, dst0)
-      ->saturate = true;
+   set_saturate(true, bld.MOV(dst2, dst0));
 
    /* = Before =
     *
diff --git a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
index 2ef52e9fd6b..84e43fa75cd 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
@@ -33,7 +33,7 @@ class copy_propagation_test : public ::testing::Test {
    virtual void SetUp();
 
 public:
-   struct brw_context *brw;
+   struct brw_compiler *compiler;
    struct brw_device_info *devinfo;
    struct gl_context *ctx;
    struct gl_shader_program *shader_prog;
@@ -44,12 +44,11 @@ public:
 class copy_propagation_vec4_visitor : public vec4_visitor
 {
 public:
-   copy_propagation_vec4_visitor(struct brw_context *brw,
+   copy_propagation_vec4_visitor(struct brw_compiler *compiler,
                                   struct gl_shader_program *shader_prog)
-      : vec4_visitor(brw, NULL, NULL, NULL, NULL, shader_prog,
+      : vec4_visitor(compiler, NULL, NULL, NULL, NULL, shader_prog,
                      MESA_SHADER_VERTEX, NULL,
-                     false /* no_spills */,
-                     ST_NONE, ST_NONE, ST_NONE)
+                     false /* no_spills */, -1)
    {
    }
 
@@ -93,21 +92,20 @@ protected:
 
 void copy_propagation_test::SetUp()
 {
-   brw = (struct brw_context *)calloc(1, sizeof(*brw));
-   devinfo = (struct brw_device_info *)calloc(1, sizeof(*brw));
-   brw->intelScreen = (struct intel_screen *)calloc(1, sizeof(*brw->intelScreen));
-   brw->intelScreen->devinfo = devinfo;
-   ctx = &brw->ctx;
+   ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+   compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+   devinfo = (struct brw_device_info *)calloc(1, sizeof(*devinfo));
+   compiler->devinfo = devinfo;
 
    vp = ralloc(NULL, struct brw_vertex_program);
 
    shader_prog = ralloc(NULL, struct gl_shader_program);
 
-   v = new copy_propagation_vec4_visitor(brw, shader_prog);
+   v = new copy_propagation_vec4_visitor(compiler, shader_prog);
 
    _mesa_init_vertex_program(ctx, &vp->program, GL_VERTEX_SHADER, 0);
 
-   brw->gen = devinfo->gen = 4;
+   devinfo->gen = 4;
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
index c8c67574e95..de2afd39cfe 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
@@ -35,7 +35,7 @@ class register_coalesce_test : public ::testing::Test {
    virtual void SetUp();
 
 public:
-   struct brw_context *brw;
+   struct brw_compiler *compiler;
    struct brw_device_info *devinfo;
    struct gl_context *ctx;
    struct gl_shader_program *shader_prog;
@@ -47,12 +47,11 @@ public:
 class register_coalesce_vec4_visitor : public vec4_visitor
 {
 public:
-   register_coalesce_vec4_visitor(struct brw_context *brw,
+   register_coalesce_vec4_visitor(struct brw_compiler *compiler,
                                   struct gl_shader_program *shader_prog)
-      : vec4_visitor(brw, NULL, NULL, NULL, NULL, shader_prog,
+      : vec4_visitor(compiler, NULL, NULL, NULL, NULL, shader_prog,
                      MESA_SHADER_VERTEX, NULL,
-                     false /* no_spills */,
-                     ST_NONE, ST_NONE, ST_NONE)
+                     false /* no_spills */, -1)
    {
    }
 
@@ -96,21 +95,20 @@ protected:
 
 void register_coalesce_test::SetUp()
 {
-   brw = (struct brw_context *)calloc(1, sizeof(*brw));
-   devinfo = (struct brw_device_info *)calloc(1, sizeof(*brw));
-   brw->intelScreen = (struct intel_screen *)calloc(1, sizeof(*brw->intelScreen));
-   brw->intelScreen->devinfo = devinfo;
-   ctx = &brw->ctx;
+   ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+   compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+   devinfo = (struct brw_device_info *)calloc(1, sizeof(*devinfo));
+   compiler->devinfo = devinfo;
 
    vp = ralloc(NULL, struct brw_vertex_program);
 
    shader_prog = ralloc(NULL, struct gl_shader_program);
 
-   v = new register_coalesce_vec4_visitor(brw, shader_prog);
+   v = new register_coalesce_vec4_visitor(compiler, shader_prog);
 
    _mesa_init_vertex_program(ctx, &vp->program, GL_VERTEX_SHADER, 0);
 
-   brw->gen = devinfo->gen = 4;
+   devinfo->gen = 4;
 }
 
 static void
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_fbo.c b/src/mesa/drivers/dri/nouveau/nouveau_fbo.c
index 6c479f5f0c6..c78d4baa124 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_fbo.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_fbo.c
@@ -242,7 +242,7 @@ static void
 nouveau_framebuffer_renderbuffer(struct gl_context *ctx, struct gl_framebuffer *fb,
 				 GLenum attachment, struct gl_renderbuffer *rb)
 {
-	_mesa_framebuffer_renderbuffer(ctx, fb, attachment, rb);
+	_mesa_FramebufferRenderbuffer_sw(ctx, fb, attachment, rb);
 
 	context_dirty(ctx, FRAMEBUFFER);
 }
diff --git a/src/mesa/drivers/dri/nouveau/nv10_state_tnl.c b/src/mesa/drivers/dri/nouveau/nv10_state_tnl.c
index c0c7b26bbf7..1398385b262 100644
--- a/src/mesa/drivers/dri/nouveau/nv10_state_tnl.c
+++ b/src/mesa/drivers/dri/nouveau/nv10_state_tnl.c
@@ -31,6 +31,8 @@
 #include "nv10_3d.xml.h"
 #include "nv10_driver.h"
 
+#include "util/simple_list.h"
+
 void
 nv10_emit_clip_plane(struct gl_context *ctx, int emit)
 {
diff --git a/src/mesa/drivers/dri/nouveau/nv20_state_tnl.c b/src/mesa/drivers/dri/nouveau/nv20_state_tnl.c
index f0acbed8560..41395516ea4 100644
--- a/src/mesa/drivers/dri/nouveau/nv20_state_tnl.c
+++ b/src/mesa/drivers/dri/nouveau/nv20_state_tnl.c
@@ -32,6 +32,8 @@
 #include "nv10_driver.h"
 #include "nv20_driver.h"
 
+#include "util/simple_list.h"
+
 #define LIGHT_MODEL_AMBIENT_R(side)			\
 	((side) ? NV20_3D_LIGHT_MODEL_BACK_AMBIENT_R :	\
 	 NV20_3D_LIGHT_MODEL_FRONT_AMBIENT_R)
diff --git a/src/mesa/drivers/dri/r200/r200_state.c b/src/mesa/drivers/dri/r200/r200_state.c
index b0a6bd573b6..6fe70b5c9d0 100644
--- a/src/mesa/drivers/dri/r200/r200_state.c
+++ b/src/mesa/drivers/dri/r200/r200_state.c
@@ -2215,9 +2215,9 @@ GLboolean r200ValidateState( struct gl_context *ctx )
    GLuint new_state = rmesa->radeon.NewGLState;
 
    if (new_state & _NEW_BUFFERS) {
-      _mesa_update_framebuffer(ctx);
+      _mesa_update_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer);
       /* this updates the DrawBuffer's Width/Height if it's a FBO */
-      _mesa_update_draw_buffer_bounds(ctx);
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
 
       R200_STATECHANGE(rmesa, ctx);
    }
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c
index 0ca526d2a02..2a8bd6c9edc 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common.c
@@ -220,9 +220,9 @@ void radeon_draw_buffer(struct gl_context *ctx, struct gl_framebuffer *fb)
 	 */
 	if (ctx->NewState & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
 		/* this updates the DrawBuffer->_NumColorDrawBuffers fields, etc */
-		_mesa_update_framebuffer(ctx);
+		_mesa_update_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer);
 		/* this updates the DrawBuffer's Width/Height if it's a FBO */
-		_mesa_update_draw_buffer_bounds(ctx);
+		_mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
 	}
 
 	if (fb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
diff --git a/src/mesa/drivers/dri/radeon/radeon_fbo.c b/src/mesa/drivers/dri/radeon/radeon_fbo.c
index 97022f95953..ef62d097bae 100644
--- a/src/mesa/drivers/dri/radeon/radeon_fbo.c
+++ b/src/mesa/drivers/dri/radeon/radeon_fbo.c
@@ -723,7 +723,7 @@ radeon_framebuffer_renderbuffer(struct gl_context * ctx,
 		"%s(%p, fb %p, rb %p) \n",
 		__func__, ctx, fb, rb);
 
-   _mesa_framebuffer_renderbuffer(ctx, fb, attachment, rb);
+   _mesa_FramebufferRenderbuffer_sw(ctx, fb, attachment, rb);
    radeon_draw_buffer(ctx, fb);
 }
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_state.c b/src/mesa/drivers/dri/radeon/radeon_state.c
index c45bb513dca..cba3d9c9689 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state.c
@@ -1994,9 +1994,9 @@ GLboolean radeonValidateState( struct gl_context *ctx )
    GLuint new_state = rmesa->radeon.NewGLState;
 
    if (new_state & _NEW_BUFFERS) {
-     _mesa_update_framebuffer(ctx);
+     _mesa_update_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer);
      /* this updates the DrawBuffer's Width/Height if it's a FBO */
-     _mesa_update_draw_buffer_bounds(ctx);
+     _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
      RADEON_STATECHANGE(rmesa, ctx);
    }
 
diff --git a/src/mesa/drivers/dri/swrast/swrast.c b/src/mesa/drivers/dri/swrast/swrast.c
index 2ddb474dde7..2d4bb702fc2 100644
--- a/src/mesa/drivers/dri/swrast/swrast.c
+++ b/src/mesa/drivers/dri/swrast/swrast.c
@@ -62,7 +62,9 @@
 #include "swrast/s_context.h"
 
 #include <sys/types.h>
-#include <sys/sysctl.h>
+#ifdef HAVE_SYS_SYSCTL_H
+# include <sys/sysctl.h>
+#endif
 
 const __DRIextension **__driDriverGetExtensions_swrast(void);
 
@@ -958,6 +960,7 @@ static const __DRIextension *swrast_driver_extensions[] = {
     &driCoreExtension.base,
     &driSWRastExtension.base,
     &driCopySubBufferExtension.base,
+    &dri2ConfigQueryExtension.base,
     &swrast_vtable.base,
     NULL
 };
diff --git a/src/mesa/drivers/haiku/swrast/SConscript b/src/mesa/drivers/haiku/swrast/SConscript
deleted file mode 100644
index 907325e3252..00000000000
--- a/src/mesa/drivers/haiku/swrast/SConscript
+++ /dev/null
@@ -1,33 +0,0 @@
-Import('*')
-
-env = env.Clone()
-
-env.Append(CPPPATH = [
-    '#/src',
-    '#/src/mapi',
-    '#/src/mesa',
-    '#/src/mesa/main',
-    '#/include/HaikuGL',
-    '/boot/system/develop/headers/private',
-    Dir('../../../mapi'), # src/mapi build path for python-generated GL API files/headers
-])
-
-env.Prepend(LIBS = [
-    mesautil,
-    glsl,
-    mesa,
-])
-
-env.Prepend(LIBS = [libgl])
-
-sources = [
-	'SoftwareRast.cpp'
-]
-
-# Disallow undefined symbols
-#env.Append(SHLINKFLAGS = ['-Wl,-z,defs'])
-
-libswrast = env.SharedLibrary(
-    target = 'swrast',
-    source = sources
-)
diff --git a/src/mesa/drivers/haiku/swrast/SoftwareRast.cpp b/src/mesa/drivers/haiku/swrast/SoftwareRast.cpp
deleted file mode 100644
index 813ad1ff27d..00000000000
--- a/src/mesa/drivers/haiku/swrast/SoftwareRast.cpp
+++ /dev/null
@@ -1,697 +0,0 @@
-/*
- * Copyright 2006-2012, Haiku, Inc. All rights reserved.
- * Distributed under the terms of the MIT License.
- *
- * Authors:
- *		Jérôme Duval, [email protected]
- *		Philippe Houdoin, [email protected]
- *		Artur Wyszynski, [email protected]
- *		Alexander von Gluck, [email protected]
- */
-
-
-#include <kernel/image.h>
-#include "SoftwareRast.h"
-
-#include <Autolock.h>
-#include <interface/DirectWindowPrivate.h>
-#include <GraphicsDefs.h>
-#include <Screen.h>
-#include <stdio.h>
-#include <string.h>
-
-extern "C" {
-#include "extensions.h"
-#include "drivers/common/driverfuncs.h"
-#include "drivers/common/meta.h"
-#include "main/api_exec.h"
-#include "main/colormac.h"
-#include "main/cpuinfo.h"
-#include "main/buffers.h"
-#include "main/formats.h"
-#include "main/framebuffer.h"
-#include "main/renderbuffer.h"
-#include "main/version.h"
-#include "main/vtxfmt.h"
-#include "swrast/swrast.h"
-#include "swrast/s_renderbuffer.h"
-#include "swrast_setup/swrast_setup.h"
-#include "tnl/tnl.h"
-#include "tnl/t_context.h"
-#include "tnl/t_pipeline.h"
-#include "vbo/vbo.h"
-
-
-#ifdef DEBUG
-#	define TRACE(x...) printf("MesaSoftwareRast: " x)
-#	define CALLED() printf("MesaSoftwareRast: %s\n", __PRETTY_FUNCTION__)
-#else
-#	define TRACE(x...)
-#	define CALLED()
-#endif
-
-#define ERROR(x...) printf("MesaSoftwareRast: " x)
-}
-
-
-extern const char* color_space_name(color_space space);
-
-
-extern "C" _EXPORT BGLRenderer*
-instantiate_gl_renderer(BGLView* view, ulong options,
-	BGLDispatcher* dispatcher)
-{
-	return new MesaSoftwareRast(view, options, dispatcher);
-}
-
-
-MesaSoftwareRast::MesaSoftwareRast(BGLView* view, ulong options,
-	BGLDispatcher* dispatcher)
-	: BGLRenderer(view, options, dispatcher),
-	fBitmap(NULL),
-	fDirectModeEnabled(false),
-	fInfo(NULL),
-	fInfoLocker("info locker"),
-	fVisual(NULL),
-	fFrameBuffer(NULL),
-	fFrontRenderBuffer(NULL),
-	fBackRenderBuffer(NULL),
-	fColorSpace(B_NO_COLOR_SPACE)
-{
-	CALLED();
-
-	fColorSpace = BScreen(GLView()->Window()).ColorSpace();
-
-	// We force single buffering for the time being
-	options &= ~BGL_DOUBLE;
-
-	const GLboolean rgbFlag = ((options & BGL_INDEX) == 0);
-	const GLboolean alphaFlag = ((options & BGL_ALPHA) == BGL_ALPHA);
-	const GLboolean dblFlag = ((options & BGL_DOUBLE) == BGL_DOUBLE);
-	const GLboolean stereoFlag = false;
-	const GLint depth = (options & BGL_DEPTH) ? 16 : 0;
-	const GLint stencil = (options & BGL_STENCIL) ? 8 : 0;
-	const GLint accum = (options & BGL_ACCUM) ? 16 : 0;
-	const GLint red = rgbFlag ? 8 : 0;
-	const GLint green = rgbFlag ? 8 : 0;
-	const GLint blue = rgbFlag ? 8 : 0;
-	const GLint alpha = alphaFlag ? 8 : 0;
-
-	fOptions = options; // | BGL_INDIRECT;
-	struct dd_function_table functions;
-
-	fVisual = _mesa_create_visual(dblFlag, stereoFlag, red, green,
-		blue, alpha, depth, stencil, accum, accum, accum,
-		alpha ? accum : 0, 1);
-
-	// Initialize device driver function table
-	_mesa_init_driver_functions(&functions);
-
-	functions.GetString = _GetString;
-	functions.UpdateState = _UpdateState;
-	functions.MapRenderbuffer = _RenderBufferMap;
-	functions.Flush = _Flush;
-
-	// create core context
-	// We inherit gl_context to this class
-	_mesa_initialize_context(this, API_OPENGL_COMPAT, fVisual, NULL,
-		&functions);
-
-	/* Initialize the software rasterizer and helper modules. */
-	_swrast_CreateContext(this);
-	_vbo_CreateContext(this);
-	_tnl_CreateContext(this);
-	_swsetup_CreateContext(this);
-	_swsetup_Wakeup(this);
-
-	// Use default TCL pipeline
-	TNL_CONTEXT(this)->Driver.RunPipeline = _tnl_run_pipeline;
-
-	_mesa_meta_init(this);
-	_mesa_enable_sw_extensions(this);
-
-	_mesa_compute_version(this);
-
-	_mesa_initialize_dispatch_tables(this);
-	_mesa_initialize_vbo_vtxfmt(this);
-
-	// create core framebuffer
-	fFrameBuffer = _mesa_create_framebuffer(fVisual);
-	if (fFrameBuffer == NULL) {
-		ERROR("%s: Unable to calloc GL FrameBuffer!\n", __func__);
-		_mesa_destroy_visual(fVisual);
-		return;
-	}
-
-	// Setup front render buffer
-	fFrontRenderBuffer = _NewRenderBuffer(true);
-	if (fFrontRenderBuffer == NULL) {
-		ERROR("%s: FrontRenderBuffer is requested but unallocated!\n",
-			__func__);
-		_mesa_destroy_visual(fVisual);
-		free(fFrameBuffer);
-		return;
-	}
-	_mesa_add_renderbuffer(fFrameBuffer, BUFFER_FRONT_LEFT,
-		&fFrontRenderBuffer->Base);
-
-	// Setup back render buffer (if requested)
-	if (fVisual->doubleBufferMode) {
-		fBackRenderBuffer = _NewRenderBuffer(false);
-		if (fBackRenderBuffer == NULL) {
-			ERROR("%s: BackRenderBuffer is requested but unallocated!\n",
-				__func__);
-			_mesa_destroy_visual(fVisual);
-			free(fFrameBuffer);
-			return;
-		}
-		_mesa_add_renderbuffer(fFrameBuffer, BUFFER_BACK_LEFT,
-			&fBackRenderBuffer->Base);
-	}
-
-	_swrast_add_soft_renderbuffers(fFrameBuffer, GL_FALSE,
-		fVisual->haveDepthBuffer, fVisual->haveStencilBuffer,
-		fVisual->haveAccumBuffer, alphaFlag, GL_FALSE);
-
-	BRect bounds = view->Bounds();
-	fWidth = (GLint)bounds.Width();
-	fHeight = (GLint)bounds.Height();
-
-	// some stupid applications (Quake2) don't even think about calling LockGL()
-	// before using glGetString and its glGet*() friends...
-	// so make sure there is at least a valid context.
-
-	if (!_mesa_get_current_context()) {
-		LockGL();
-		// not needed, we don't have a looper yet: UnlockLooper();
-	}
-}
-
-
-MesaSoftwareRast::~MesaSoftwareRast()
-{
-	CALLED();
-	_swsetup_DestroyContext(this);
-	_swrast_DestroyContext(this);
-	_tnl_DestroyContext(this);
-	_vbo_DestroyContext(this);
-	_mesa_destroy_visual(fVisual);
-	_mesa_destroy_framebuffer(fFrameBuffer);
-	_mesa_destroy_context(this);
-
-	free(fInfo);
-	free(fFrameBuffer);
-
-	delete fBitmap;
-}
-
-
-void
-MesaSoftwareRast::LockGL()
-{
-	CALLED();
-	BGLRenderer::LockGL();
-
-	_mesa_make_current(this, fFrameBuffer, fFrameBuffer);
-
-	color_space colorSpace = BScreen(GLView()->Window()).ColorSpace();
-
-	GLuint width = fWidth;
-	GLuint height = fHeight;
-
-	BAutolock lock(fInfoLocker);
-	if (fDirectModeEnabled && fInfo != NULL) {
-		width = fInfo->window_bounds.right
-			- fInfo->window_bounds.left + 1;
-		height = fInfo->window_bounds.bottom
-			- fInfo->window_bounds.top + 1;
-	}
-
-	if (fColorSpace != colorSpace) {
-		fColorSpace = colorSpace;
-		_SetupRenderBuffer(&fFrontRenderBuffer->Base, fColorSpace);
-		if (fVisual->doubleBufferMode)
-			_SetupRenderBuffer(&fBackRenderBuffer->Base, fColorSpace);
-	}
-
-	_CheckResize(width, height);
-}
-
-
-void
-MesaSoftwareRast::UnlockGL()
-{
-	CALLED();
-	_mesa_make_current(this, NULL, NULL);
-	BGLRenderer::UnlockGL();
-}
-
-
-void
-MesaSoftwareRast::SwapBuffers(bool VSync)
-{
-	CALLED();
-
-	if (!fBitmap)
-		return;
-
-	if (fVisual->doubleBufferMode)
-		_mesa_notifySwapBuffers(this);
-
-	if (!fDirectModeEnabled || fInfo == NULL) {
-		if (GLView()->LockLooperWithTimeout(1000) == B_OK) {
-			GLView()->DrawBitmap(fBitmap, B_ORIGIN);
-			GLView()->UnlockLooper();
-		}
-	} else {
-		// TODO: Here the BGLView needs to be drawlocked.
-		_CopyToDirect();
-	}
-
-	if (VSync) {
-		BScreen screen(GLView()->Window());
-		screen.WaitForRetrace();
-	}
-}
-
-
-void
-MesaSoftwareRast::Draw(BRect updateRect)
-{
-	CALLED();
-	if (fBitmap && (!fDirectModeEnabled || (fInfo == NULL)))
-		GLView()->DrawBitmap(fBitmap, updateRect, updateRect);
-}
-
-
-status_t
-MesaSoftwareRast::CopyPixelsOut(BPoint location, BBitmap* bitmap)
-{
-	CALLED();
-	color_space scs = fBitmap->ColorSpace();
-	color_space dcs = bitmap->ColorSpace();
-
-	if (scs != dcs && (scs != B_RGBA32 || dcs != B_RGB32)) {
-		fprintf(stderr, "CopyPixelsOut(): incompatible color space: %s != %s\n",
-			color_space_name(scs),
-			color_space_name(dcs));
-		return B_BAD_TYPE;
-	}
-
-	BRect sr = fBitmap->Bounds();
-	BRect dr = bitmap->Bounds();
-
-	sr = sr & dr.OffsetBySelf(location);
-	dr = sr.OffsetByCopy(-location.x, -location.y);
-
-	uint8* ps = (uint8*)fBitmap->Bits();
-	uint8* pd = (uint8*)bitmap->Bits();
-	uint32* s;
-	uint32* d;
-	uint32 y;
-	for (y = (uint32)sr.top; y <= (uint32)sr.bottom; y++) {
-		s = (uint32*)(ps + y * fBitmap->BytesPerRow());
-		s += (uint32)sr.left;
-
-		d = (uint32*)(pd + (y + (uint32)(dr.top - sr.top))
-			* bitmap->BytesPerRow());
-		d += (uint32)dr.left;
-
-		memcpy(d, s, dr.IntegerWidth() * 4);
-	}
-	return B_OK;
-}
-
-
-status_t
-MesaSoftwareRast::CopyPixelsIn(BBitmap* bitmap, BPoint location)
-{
-	CALLED();
-	color_space scs = bitmap->ColorSpace();
-	color_space dcs = fBitmap->ColorSpace();
-
-	if (scs != dcs && (dcs != B_RGBA32 || scs != B_RGB32)) {
-		fprintf(stderr, "CopyPixelsIn(): incompatible color space: %s != %s\n",
-			color_space_name(scs),
-			color_space_name(dcs));
-		return B_BAD_TYPE;
-	}
-
-	BRect sr = bitmap->Bounds();
-	BRect dr = fBitmap->Bounds();
-
-	sr = sr & dr.OffsetBySelf(location);
-	dr = sr.OffsetByCopy(-location.x, -location.y);
-
-	uint8* ps = (uint8*)bitmap->Bits();
-	uint8* pd = (uint8*)fBitmap->Bits();
-	uint32* s;
-	uint32* d;
-	uint32 y;
-	for (y = (uint32)sr.top; y <= (uint32)sr.bottom; y++) {
-		s = (uint32*)(ps + y * bitmap->BytesPerRow());
-		s += (uint32)sr.left;
-
-		d = (uint32*)(pd + (y + (uint32)(dr.top - sr.top))
-			* fBitmap->BytesPerRow());
-		d += (uint32)dr.left;
-
-		memcpy(d, s, dr.IntegerWidth() * 4);
-	}
-	return B_OK;
-}
-
-
-void
-MesaSoftwareRast::EnableDirectMode(bool enabled)
-{
-	fDirectModeEnabled = enabled;
-}
-
-
-void
-MesaSoftwareRast::DirectConnected(direct_buffer_info* info)
-{
-	// TODO: I'm not sure we need to do this: BGLView already
-	// keeps a local copy of the direct_buffer_info passed by
-	// BDirectWindow::DirectConnected().
-	BAutolock lock(fInfoLocker);
-	if (info) {
-		if (!fInfo) {
-			fInfo = (direct_buffer_info*)malloc(DIRECT_BUFFER_INFO_AREA_SIZE);
-			if (!fInfo)
-				return;
-		}
-		memcpy(fInfo, info, DIRECT_BUFFER_INFO_AREA_SIZE);
-	} else if (fInfo) {
-		free(fInfo);
-		fInfo = NULL;
-	}
-}
-
-
-void
-MesaSoftwareRast::FrameResized(float width, float height)
-{
-	BAutolock lock(fInfoLocker);
-	_CheckResize((GLuint)width, (GLuint)height);
-}
-
-
-void
-MesaSoftwareRast::_CheckResize(GLuint newWidth, GLuint newHeight)
-{
-	CALLED();
-
-	if (fBitmap && newWidth == fWidth
-		&& newHeight == fHeight) {
-		return;
-	}
-
-	_mesa_resize_framebuffer(this, fFrameBuffer, newWidth, newHeight);
-	fHeight = newHeight;
-	fWidth = newWidth;
-
-	_AllocateBitmap();
-}
-
-
-void
-MesaSoftwareRast::_AllocateBitmap()
-{
-	CALLED();
-
-	// allocate new size of back buffer bitmap
-	delete fBitmap;
-	fBitmap = NULL;
-
-	if (fWidth < 1 || fHeight < 1) {
-		TRACE("%s: Cannot allocate bitmap < 1x1!\n", __func__);
-		return;
-	}
-
-	BRect rect(0.0, 0.0, fWidth - 1, fHeight - 1);
-	fBitmap = new BBitmap(rect, fColorSpace);
-
-	#if 0
-	// Used for platform optimized drawing
-	for (uint i = 0; i < fHeight; i++) {
-		fRowAddr[fHeight - i - 1] = (GLvoid *)((GLubyte *)fBitmap->Bits()
-			+ i * fBitmap->BytesPerRow());
-	}
-	#endif
-
-	fFrameBuffer->Width = fWidth;
-	fFrameBuffer->Height = fHeight;
-	TRACE("%s: Bitmap Size: %" B_PRIu32 "\n", __func__, fBitmap->BitsLength());
-
-	fFrontRenderBuffer->Buffer = (GLubyte*)fBitmap->Bits();
-}
-
-
-// #pragma mark - static
-
-
-const GLubyte*
-MesaSoftwareRast::_GetString(gl_context* ctx, GLenum name)
-{
-	switch (name) {
-		case GL_VENDOR:
-			return (const GLubyte*) "Mesa Project";
-		case GL_RENDERER:
-			return (const GLubyte*) "Software Rasterizer";
-		default:
-			// Let core library handle all other cases
-			return NULL;
-	}
-}
-
-
-void
-MesaSoftwareRast::_UpdateState(gl_context* ctx, GLuint new_state)
-{
-	if (!ctx)
-		return;
-
-	CALLED();
-	_swrast_InvalidateState(ctx, new_state);
-	_swsetup_InvalidateState(ctx, new_state);
-	_vbo_InvalidateState(ctx, new_state);
-	_tnl_InvalidateState(ctx, new_state);
-}
-
-
-GLboolean
-MesaSoftwareRast::_RenderBufferStorage(gl_context* ctx,
-	struct gl_renderbuffer* render, GLenum internalFormat,
-	GLuint width, GLuint height)
-{
-	CALLED();
-
-	render->Width = width;
-	render->Height = height;
-
-	struct swrast_renderbuffer *swRenderBuffer = swrast_renderbuffer(render);
-
-	swRenderBuffer->RowStride = width * _mesa_get_format_bytes(render->Format);
-
-	return GL_TRUE;
-}
-
-
-GLboolean
-MesaSoftwareRast::_RenderBufferStorageMalloc(gl_context* ctx,
-	struct gl_renderbuffer* render, GLenum internalFormat,
-	GLuint width, GLuint height)
-{
-	CALLED();
-
-	render->Width = width;
-	render->Height = height;
-
-	struct swrast_renderbuffer *swRenderBuffer = swrast_renderbuffer(render);
-
-	if (swRenderBuffer != NULL) {
-		free(swRenderBuffer->Buffer);
-		swRenderBuffer->RowStride
-			= width * _mesa_get_format_bytes(render->Format);
-
-		uint32 size = swRenderBuffer->RowStride * height;
-		TRACE("%s: Allocate %" B_PRIu32 " bytes for RenderBuffer\n",
-			__func__, size);
-		swRenderBuffer->Buffer = (GLubyte*)malloc(size);
-		if (!swRenderBuffer->Buffer) {
-			ERROR("%s: Memory allocation failure!\n", __func__);
-			return GL_FALSE;
-		}
-	} else {
-		ERROR("%s: Couldn't obtain software renderbuffer!\n",
-			__func__);
-		return GL_FALSE;
-	}
-
-	return GL_TRUE;
-}
-
-
-void
-MesaSoftwareRast::_Flush(gl_context* ctx)
-{
-	CALLED();
-	MesaSoftwareRast* driverContext = static_cast<MesaSoftwareRast*>(ctx);
-
-	//MesaSoftwareRast* driverContext = (MesaSoftwareRast*)ctx->DriverCtx;
-	if ((driverContext->fOptions & BGL_DOUBLE) == 0) {
-		// TODO: SwapBuffers() can call _CopyToDirect(), which should
-		// be always called with with the BGLView drawlocked.
-		// This is not always the case if called from here.
-		driverContext->SwapBuffers();
-	}
-}
-
-
-struct swrast_renderbuffer*
-MesaSoftwareRast::_NewRenderBuffer(bool front)
-{
-	CALLED();
-	struct swrast_renderbuffer *swRenderBuffer
-		= (struct swrast_renderbuffer*)calloc(1, sizeof *swRenderBuffer);
-
-	if (!swRenderBuffer) {
-		ERROR("%s: Failed calloc RenderBuffer\n", __func__);
-		return NULL;
-	}
-
-	_mesa_init_renderbuffer(&swRenderBuffer->Base, 0);
-
-	swRenderBuffer->Base.ClassID = HAIKU_SWRAST_RENDERBUFFER_CLASS;
-	swRenderBuffer->Base.RefCount = 1;
-	swRenderBuffer->Base.Delete = _RenderBufferDelete;
-
-	if (!front)
-		swRenderBuffer->Base.AllocStorage = _RenderBufferStorageMalloc;
-	else
-		swRenderBuffer->Base.AllocStorage = _RenderBufferStorage;
-
-	if (_SetupRenderBuffer(&swRenderBuffer->Base, fColorSpace) != B_OK) {
-		free(swRenderBuffer);
-		return NULL;
-	}
-
-	return swRenderBuffer;
-}
-
-
-status_t
-MesaSoftwareRast::_SetupRenderBuffer(struct gl_renderbuffer* rb,
-	color_space colorSpace)
-{
-	CALLED();
-
-	rb->InternalFormat = GL_RGBA;
-
-	switch (colorSpace) {
-		case B_RGBA32:
-			rb->_BaseFormat = GL_RGBA;
-			rb->Format = MESA_FORMAT_B8G8R8A8_UNORM;
-			break;
-		case B_RGB32:
-			rb->_BaseFormat = GL_RGB;
-			rb->Format = MESA_FORMAT_B8G8R8X8_UNORM;
-			break;
-		case B_RGB24:
-			rb->_BaseFormat = GL_RGB;
-			rb->Format = MESA_FORMAT_BGR_UNORM8;
-			break;
-		case B_RGB16:
-			rb->_BaseFormat = GL_RGB;
-			rb->Format = MESA_FORMAT_B5G6R5_UNORM;
-			break;
-		case B_RGB15:
-			rb->_BaseFormat = GL_RGB;
-			rb->Format = MESA_FORMAT_B5G5R5A1_UNORM;
-			break;
-		default:
-			fprintf(stderr, "Unsupported screen color space %s\n",
-				color_space_name(fColorSpace));
-			debugger("Unsupported OpenGL color space");
-			return B_ERROR;
-	}
-	return B_OK;
-}
-
-
-/*!	Y inverted Map RenderBuffer function
-	We use a BBitmap for storage which has Y inverted.
-	If the Mesa provided Map function ever allows external
-	control of this we can omit this function.
-*/
-void
-MesaSoftwareRast::_RenderBufferMap(gl_context *ctx,
-	struct gl_renderbuffer *rb, GLuint x, GLuint y, GLuint w, GLuint h,
-	GLbitfield mode, GLubyte **mapOut, GLint *rowStrideOut)
-{
-	if (rb->ClassID == HAIKU_SWRAST_RENDERBUFFER_CLASS) {
-		struct swrast_renderbuffer *srb = swrast_renderbuffer(rb);
-		const GLuint bpp = _mesa_get_format_bytes(rb->Format);
-		GLint rowStride = rb->Width * bpp; // in Bytes
-
-		y = rb->Height - y - 1;
-
-		*rowStrideOut = -rowStride;
-		*mapOut = (GLubyte *) srb->Buffer + y * rowStride + x * bpp;
-	} else {
-		_swrast_map_soft_renderbuffer(ctx, rb, x, y, w, h, mode,
-			mapOut, rowStrideOut);
-	}
-}
-
-
-void
-MesaSoftwareRast::_RenderBufferDelete(struct gl_context *ctx,
-	struct gl_renderbuffer* rb)
-{
-	CALLED();
-	if (rb != NULL) {
-		struct swrast_renderbuffer *swRenderBuffer
-			= swrast_renderbuffer(rb);
-		if (swRenderBuffer != NULL)
-			free(swRenderBuffer->Buffer);
-	}
-	free(rb);
-}
-
-
-void
-MesaSoftwareRast::_CopyToDirect()
-{
-	BAutolock lock(fInfoLocker);
-
-	// check the bitmap size still matches the size
-	if (fInfo->window_bounds.bottom - fInfo->window_bounds.top
-		!= fBitmap->Bounds().IntegerHeight()
-		|| fInfo->window_bounds.right - fInfo->window_bounds.left
-			!= fBitmap->Bounds().IntegerWidth())
-		return;
-
-	uint8 bytesPerPixel = fInfo->bits_per_pixel / 8;
-	uint32 bytesPerRow = fBitmap->BytesPerRow();
-	for (uint32 i = 0; i < fInfo->clip_list_count; i++) {
-		clipping_rect *clip = &fInfo->clip_list[i];
-		int32 height = clip->bottom - clip->top + 1;
-		int32 bytesWidth
-			= (clip->right - clip->left + 1) * bytesPerPixel;
-		uint8* p = (uint8*)fInfo->bits + clip->top
-			* fInfo->bytes_per_row + clip->left * bytesPerPixel;
-		uint8* b = (uint8*)fBitmap->Bits()
-			+ (clip->top - fInfo->window_bounds.top) * bytesPerRow
-			+ (clip->left - fInfo->window_bounds.left)
-				* bytesPerPixel;
-
-		for (int y = 0; y < height; y++) {
-			memcpy(p, b, bytesWidth);
-			p += fInfo->bytes_per_row;
-			b += bytesPerRow;
-		}
-	}
-}
diff --git a/src/mesa/drivers/haiku/swrast/SoftwareRast.h b/src/mesa/drivers/haiku/swrast/SoftwareRast.h
deleted file mode 100644
index 8f0f0184863..00000000000
--- a/src/mesa/drivers/haiku/swrast/SoftwareRast.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright 2006-2012, Haiku, Inc. All rights reserved.
- * Distributed under the terms of the MIT License.
- *
- * Authors:
- *		Jérôme Duval, [email protected]
- * 		Philippe Houdoin, [email protected]
- *		Artur Wyszynski, [email protected]
- */
-#ifndef MESASOFTWARERENDERER_H
-#define MESASOFTWARERENDERER_H
-
-
-#define HAIKU_SWRAST_RENDERBUFFER_CLASS 0x737752 // swR
-
-
-#include "GLRenderer.h"
-
-extern "C" {
-#include "context.h"
-#include "main/version.h"
-#include "swrast/s_chan.h"
-#include "swrast/s_context.h"
-}
-
-
-class MesaSoftwareRast : public BGLRenderer, public gl_context {
-public:
-							MesaSoftwareRast(BGLView* view,
-								ulong bgl_options,
-								BGLDispatcher* dispatcher);
-	virtual					~MesaSoftwareRast();
-
-	virtual	void			LockGL();
-	virtual	void 			UnlockGL();
-
-	virtual	void 			SwapBuffers(bool VSync = false);
-	virtual	void			Draw(BRect updateRect);
-	virtual	status_t		CopyPixelsOut(BPoint source, BBitmap* dest);
-	virtual	status_t		CopyPixelsIn(BBitmap* source, BPoint dest);
-	virtual void			FrameResized(float width, float height);
-
-	virtual	void			EnableDirectMode(bool enabled);
-	virtual	void			DirectConnected(direct_buffer_info* info);
-
-private:
-	static	const GLubyte*	_GetString(gl_context* ctx, GLenum name);
-			void			_CheckResize(GLuint newWidth, GLuint newHeight);
-	static	void			_UpdateState(gl_context* ctx, GLuint newState);
-	static	void			_Flush(gl_context *ctx);
-
-	struct	swrast_renderbuffer* _NewRenderBuffer(bool front);
-			status_t		_SetupRenderBuffer(struct gl_renderbuffer* rb,
-								color_space colorSpace);
-
-/* Mesa callbacks */
-	static	void			_RenderBufferDelete(struct gl_context *ctx,
-								struct gl_renderbuffer* rb);
-	static	GLboolean		_RenderBufferStorage(gl_context* ctx,
-								struct gl_renderbuffer* render,
-								GLenum internalFormat,
-								GLuint width, GLuint height);
-	static	GLboolean		_RenderBufferStorageMalloc(gl_context* ctx,
-								struct gl_renderbuffer* render,
-								GLenum internalFormat,
-								GLuint width, GLuint height);
-	static	void			_RenderBufferMap(gl_context *ctx,
-								struct gl_renderbuffer *rb,
-								GLuint x, GLuint y, GLuint w, GLuint h,
-								GLbitfield mode, GLubyte **mapOut,
-								GLint *rowStrideOut);
-
-			void			_AllocateBitmap();
-			void			_CopyToDirect();
-
-			BBitmap*		fBitmap;
-			bool			fDirectModeEnabled;
-			direct_buffer_info* fInfo;
-			BLocker			fInfoLocker;
-			ulong			fOptions;
-
-			gl_config*		fVisual;
-
-			struct gl_framebuffer* fFrameBuffer;
-			struct swrast_renderbuffer* fFrontRenderBuffer;
-			struct swrast_renderbuffer* fBackRenderBuffer;
-
-			GLuint			fWidth;
-			GLuint			fHeight;
-			color_space		fColorSpace;
-
-			void*			fRowAddr[SWRAST_MAX_HEIGHT];
-};
-
-#endif	// MESASOFTWARERENDERER_H
diff --git a/src/mesa/drivers/haiku/swrast/SoftwareRast.rdef b/src/mesa/drivers/haiku/swrast/SoftwareRast.rdef
deleted file mode 100644
index cb60332100c..00000000000
--- a/src/mesa/drivers/haiku/swrast/SoftwareRast.rdef
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright 2012, Haiku, Inc. All rights reserved.
- * Distributed under the terms of the MIT License.
- */
-
-resource app_signature "application/x-vnd.Haiku-swrast";
-
-resource app_version {
-    major  = 9,
-    middle = 0,
-    minor  = 0,
-    variety = 0,
-    internal = 0,
-    short_info = "Software Rasterizer",
-    long_info = "Haiku Mesa Software GL Rasterizer"
-};
-
-resource vector_icon {
-	$"6E6369660A0200140294A9FF18020014028DFFFF97058C0500020006023B10B7"
-	$"37F036BA1A993D466848C719BEBE2000919292FFD5D5D5020016023900000000"
-	$"000000003EE0004AE00048E0005EF884C702000203392E8D383001BAD97F3C12"
-	$"8B4786BD48B8AD0D97BBFFFF7B4168DBE9FF4168DB97020002023A0C1238D099"
-	$"BE44203F4BD14B38844678240DF56A7D9FE1EA064CC704016B0500090A044024"
-	$"2438404C5C380A044028243C40505C3C0A042438243B5C3C5C380608BFBE4D59"
-	$"4D59515957575659585560406044603C5E3A5C3CCB4FBFBA5E3ECA9DC11F564B"
-	$"584A544C504C0606AF0F2F3D2F3D393D4034BF593542324130432F42364432C0"
-	$"3FBC5A2F48354A2F480608AE9A22303EB5BD3AB42542B755422E412F3C29322D"
-	$"32223C0204263726372538263F253E263F304430443143303C313D303C02043D"
-	$"423D423C433D4A3C493D4A495049504A4F49474A484947060DAEAAAE014E445A"
-	$"3456365E325E3D5D3F5A3A5542544E4D573A4E364439463342324A2242310A0A"
-	$"0002020102403CA00C88888C8CC1401673C40D6544F2950A01010002403CA000"
-	$"0000000000401673C40D65446CF80A08020304023EC16A0000000000003EC16A"
-	$"45DD1844C6550A030105123EC16A0000000000003EC16A45DD1844C655011784"
-	$"22040A040105023EC16A0000000000003EC16A45DD1844C6550A030108123EC1"
-	$"6A0000000000003EC16A45DD1844C65501178422040A0503080706023EC16A00"
-	$"00000000003EC16A45DD1844C6550A030206071A3EC16A0000000000003EC16A"
-	$"45DD1844C65510FF0215810004178222040A060106023EC16A0000000000003E"
-	$"C16A45DD1844C6550A070107023EC16A0000000000003EC16A45DD1844C655"
-};
diff --git a/src/mesa/drivers/osmesa/Makefile.am b/src/mesa/drivers/osmesa/Makefile.am
index 9a388d64cd5..46332e16bd1 100644
--- a/src/mesa/drivers/osmesa/Makefile.am
+++ b/src/mesa/drivers/osmesa/Makefile.am
@@ -39,7 +39,6 @@ nodist_EXTRA_lib@OSMESA_LIB@_la_SOURCES = dummy.cpp
 lib@OSMESA_LIB@_la_SOURCES = osmesa.c
 
 lib@OSMESA_LIB@_la_LDFLAGS = \
-	-module \
 	-no-undefined \
 	-version-number @OSMESA_VERSION@ \
 	$(GC_SECTIONS) \
diff --git a/src/mesa/drivers/x11/Makefile.am b/src/mesa/drivers/x11/Makefile.am
index c0596f8119e..ba79f6981b9 100644
--- a/src/mesa/drivers/x11/Makefile.am
+++ b/src/mesa/drivers/x11/Makefile.am
@@ -25,6 +25,11 @@
 
 EXTRA_DIST = SConscript
 
+if HAVE_SHARED_GLAPI
+SHARED_GLAPI_CFLAGS = -DGLX_SHARED_GLAPI
+SHARED_GLAPI_LIB = $(top_builddir)/src/mapi/shared-glapi/libglapi.la
+endif
+
 AM_CPPFLAGS = \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src/mapi \
@@ -34,11 +39,10 @@ AM_CPPFLAGS = \
 	-I$(top_srcdir)/src/gallium/auxiliary \
 	-I$(top_srcdir)/src/mesa/main \
 	$(X11_INCLUDES) \
+	$(SHARED_GLAPI_CFLAGS) \
 	$(DEFINES)
 
-if HAVE_X11_DRIVER
 lib_LTLIBRARIES = lib@[email protected]
-endif
 
 lib@GL_LIB@_la_SOURCES = \
 	glxapi.h \
@@ -66,6 +70,7 @@ GL_PATCH = 0
 lib@GL_LIB@_la_LIBADD = \
 	$(top_builddir)/src/mesa/libmesa.la \
 	$(top_builddir)/src/mapi/glapi/libglapi.la \
+	$(SHARED_GLAPI_LIB) \
 	$(GL_LIB_DEPS)
 
 lib@GL_LIB@_la_LDFLAGS = \
diff --git a/src/mesa/main/api_exec.h b/src/mesa/main/api_exec.h
index 12249fec228..655cb32d0a4 100644
--- a/src/mesa/main/api_exec.h
+++ b/src/mesa/main/api_exec.h
@@ -38,6 +38,9 @@ _mesa_initialize_exec_table(struct gl_context *ctx);
 extern void
 _mesa_initialize_dispatch_tables(struct gl_context *ctx);
 
+extern struct _glapi_table *
+_mesa_new_nop_table(unsigned numEntries);
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
diff --git a/src/mesa/main/api_loopback.c b/src/mesa/main/api_loopback.c
index 9932a837336..a7fd82c531f 100644
--- a/src/mesa/main/api_loopback.c
+++ b/src/mesa/main/api_loopback.c
@@ -1772,7 +1772,9 @@ _mesa_loopback_init_api_table(const struct gl_context *ctx,
       SET_VertexAttribI4sv(dest, _mesa_VertexAttribI4sv);
       SET_VertexAttribI4ubv(dest, _mesa_VertexAttribI4ubv);
       SET_VertexAttribI4usv(dest, _mesa_VertexAttribI4usv);
+   }
 
+   if (ctx->API == API_OPENGL_CORE) {
       /* GL 4.1 / GL_ARB_vertex_attrib_64bit */
       SET_VertexAttribL1d(dest, _mesa_VertexAttribL1d);
       SET_VertexAttribL2d(dest, _mesa_VertexAttribL2d);
diff --git a/src/mesa/main/attrib.c b/src/mesa/main/attrib.c
index b163c0aa699..53626e38be9 100644
--- a/src/mesa/main/attrib.c
+++ b/src/mesa/main/attrib.c
@@ -177,6 +177,10 @@ struct texture_state
 };
 
 
+/** An unused GL_*_BIT value */
+#define DUMMY_BIT 0x10000000
+
+
 /**
  * Allocate new attribute node of given type/kind.  Attach payload data.
  * Insert it into the linked list named by 'head'.
@@ -253,6 +257,15 @@ _mesa_PushAttrib(GLbitfield mask)
    /* groups specified by the mask. */
    head = NULL;
 
+   if (mask == 0) {
+      /* if mask is zero we still need to push something so that we
+       * don't get a GL_STACK_UNDERFLOW error in glPopAttrib().
+       */
+      GLuint dummy = 0;
+      if (!push_attrib(ctx, &head, DUMMY_BIT, sizeof(dummy), &dummy))
+         goto end;
+   }
+
    if (mask & GL_ACCUM_BUFFER_BIT) {
       if (!push_attrib(ctx, &head, GL_ACCUM_BUFFER_BIT,
                        sizeof(struct gl_accum_attrib),
@@ -928,6 +941,10 @@ _mesa_PopAttrib(void)
       }
 
       switch (attr->kind) {
+         case DUMMY_BIT:
+            /* do nothing */
+            break;
+
          case GL_ACCUM_BUFFER_BIT:
             {
                const struct gl_accum_attrib *accum;
@@ -1074,6 +1091,11 @@ _mesa_PopAttrib(void)
                _mesa_ClearDepth(depth->Clear);
                _mesa_set_enable(ctx, GL_DEPTH_TEST, depth->Test);
                _mesa_DepthMask(depth->Mask);
+               if (ctx->Extensions.EXT_depth_bounds_test) {
+                  _mesa_set_enable(ctx, GL_DEPTH_BOUNDS_TEST_EXT,
+                                   depth->BoundsTest);
+                  _mesa_DepthBoundsEXT(depth->BoundsMin, depth->BoundsMax);
+               }
             }
             break;
          case GL_ENABLE_BIT:
diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c
index 774fc888ec4..d869fa2aa09 100644
--- a/src/mesa/main/blend.c
+++ b/src/mesa/main/blend.c
@@ -769,7 +769,7 @@ _mesa_ClampColor(GLenum target, GLenum clamp)
       }
       FLUSH_VERTICES(ctx, _NEW_LIGHT);
       ctx->Light.ClampVertexColor = clamp;
-      _mesa_update_clamp_vertex_color(ctx);
+      _mesa_update_clamp_vertex_color(ctx, ctx->DrawBuffer);
       break;
    case GL_CLAMP_FRAGMENT_COLOR_ARB:
       if (ctx->API == API_OPENGL_CORE &&
@@ -778,7 +778,7 @@ _mesa_ClampColor(GLenum target, GLenum clamp)
       }
       FLUSH_VERTICES(ctx, _NEW_FRAG_CLAMP);
       ctx->Color.ClampFragmentColor = clamp;
-      _mesa_update_clamp_fragment_color(ctx);
+      _mesa_update_clamp_fragment_color(ctx, ctx->DrawBuffer);
       break;
    case GL_CLAMP_READ_COLOR_ARB:
       ctx->Color.ClampReadColor = clamp;
@@ -807,50 +807,55 @@ get_clamp_color(const struct gl_framebuffer *fb, GLenum clamp)
 }
 
 GLboolean
-_mesa_get_clamp_fragment_color(const struct gl_context *ctx)
+_mesa_get_clamp_fragment_color(const struct gl_context *ctx,
+                               const struct gl_framebuffer *drawFb)
 {
-   return get_clamp_color(ctx->DrawBuffer,
-                                ctx->Color.ClampFragmentColor);
+   return get_clamp_color(drawFb, ctx->Color.ClampFragmentColor);
 }
 
 GLboolean
-_mesa_get_clamp_vertex_color(const struct gl_context *ctx)
+_mesa_get_clamp_vertex_color(const struct gl_context *ctx,
+                             const struct gl_framebuffer *drawFb)
 {
-   return get_clamp_color(ctx->DrawBuffer, ctx->Light.ClampVertexColor);
+   return get_clamp_color(drawFb, ctx->Light.ClampVertexColor);
 }
 
 GLboolean
-_mesa_get_clamp_read_color(const struct gl_context *ctx)
+_mesa_get_clamp_read_color(const struct gl_context *ctx,
+                           const struct gl_framebuffer *readFb)
 {
-   return get_clamp_color(ctx->ReadBuffer, ctx->Color.ClampReadColor);
+   return get_clamp_color(readFb, ctx->Color.ClampReadColor);
 }
 
 /**
  * Update the ctx->Color._ClampFragmentColor field
  */
 void
-_mesa_update_clamp_fragment_color(struct gl_context *ctx)
+_mesa_update_clamp_fragment_color(struct gl_context *ctx,
+                                  const struct gl_framebuffer *drawFb)
 {
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-
    /* Don't clamp if:
     * - there is no colorbuffer
     * - all colorbuffers are unsigned normalized, so clamping has no effect
     * - there is an integer colorbuffer
     */
-   if (!fb || !fb->_HasSNormOrFloatColorBuffer || fb->_IntegerColor)
+   if (!drawFb || !drawFb->_HasSNormOrFloatColorBuffer ||
+       drawFb->_IntegerColor)
       ctx->Color._ClampFragmentColor = GL_FALSE;
    else
-      ctx->Color._ClampFragmentColor = _mesa_get_clamp_fragment_color(ctx);
+      ctx->Color._ClampFragmentColor =
+         _mesa_get_clamp_fragment_color(ctx, drawFb);
 }
 
 /**
  * Update the ctx->Color._ClampVertexColor field
  */
 void
-_mesa_update_clamp_vertex_color(struct gl_context *ctx)
+_mesa_update_clamp_vertex_color(struct gl_context *ctx,
+                                const struct gl_framebuffer *drawFb)
 {
-   ctx->Light._ClampVertexColor = _mesa_get_clamp_vertex_color(ctx);
+   ctx->Light._ClampVertexColor =
+         _mesa_get_clamp_vertex_color(ctx, drawFb);
 }
 
 /**
diff --git a/src/mesa/main/blend.h b/src/mesa/main/blend.h
index fe31a7440f0..8ab9e02fc13 100644
--- a/src/mesa/main/blend.h
+++ b/src/mesa/main/blend.h
@@ -37,6 +37,7 @@
 #include "formats.h"
 
 struct gl_context;
+struct gl_framebuffer;
 
 
 extern void GLAPIENTRY
@@ -101,19 +102,24 @@ extern void GLAPIENTRY
 _mesa_ClampColor(GLenum target, GLenum clamp);
 
 extern GLboolean
-_mesa_get_clamp_fragment_color(const struct gl_context *ctx);
+_mesa_get_clamp_fragment_color(const struct gl_context *ctx,
+                               const struct gl_framebuffer *drawFb);
 
 extern GLboolean
-_mesa_get_clamp_vertex_color(const struct gl_context *ctx);
+_mesa_get_clamp_vertex_color(const struct gl_context *ctx,
+                             const struct gl_framebuffer *drawFb);
 
 extern GLboolean
-_mesa_get_clamp_read_color(const struct gl_context *ctx);
+_mesa_get_clamp_read_color(const struct gl_context *ctx,
+                           const struct gl_framebuffer *readFb);
 
 extern void
-_mesa_update_clamp_fragment_color(struct gl_context *ctx);
+_mesa_update_clamp_fragment_color(struct gl_context *ctx,
+                                  const struct gl_framebuffer *drawFb);
 
 extern void
-_mesa_update_clamp_vertex_color(struct gl_context *ctx);
+_mesa_update_clamp_vertex_color(struct gl_context *ctx,
+                                const struct gl_framebuffer *drawFb);
 
 extern mesa_format
 _mesa_get_render_format(const struct gl_context *ctx, mesa_format format);
diff --git a/src/mesa/main/blit.c b/src/mesa/main/blit.c
index 0694466eb75..db8fee5a414 100644
--- a/src/mesa/main/blit.c
+++ b/src/mesa/main/blit.c
@@ -34,6 +34,7 @@
 #include "enums.h"
 #include "blit.h"
 #include "fbobject.h"
+#include "framebuffer.h"
 #include "glformats.h"
 #include "mtypes.h"
 #include "state.h"
@@ -148,38 +149,25 @@ is_valid_blit_filter(const struct gl_context *ctx, GLenum filter)
 }
 
 
-/**
- * Blit rectangular region, optionally from one framebuffer to another.
- *
- * Note, if the src buffer is multisampled and the dest is not, this is
- * when the samples must be resolved to a single color.
- */
-void GLAPIENTRY
-_mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
-                         GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
-                         GLbitfield mask, GLenum filter)
+void
+_mesa_blit_framebuffer(struct gl_context *ctx,
+                       struct gl_framebuffer *readFb,
+                       struct gl_framebuffer *drawFb,
+                       GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                       GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                       GLbitfield mask, GLenum filter, const char *func)
 {
    const GLbitfield legalMaskBits = (GL_COLOR_BUFFER_BIT |
                                      GL_DEPTH_BUFFER_BIT |
                                      GL_STENCIL_BUFFER_BIT);
-   const struct gl_framebuffer *readFb, *drawFb;
-   GET_CURRENT_CONTEXT(ctx);
 
    FLUSH_VERTICES(ctx, 0);
 
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx,
-                  "glBlitFramebuffer(%d, %d, %d, %d,  %d, %d, %d, %d, 0x%x, %s)\n",
-                  srcX0, srcY0, srcX1, srcY1,
-                  dstX0, dstY0, dstX1, dstY1,
-                  mask, _mesa_lookup_enum_by_nr(filter));
-
-   if (ctx->NewState) {
-      _mesa_update_state(ctx);
-   }
+   /* Update completeness status of readFb and drawFb. */
+   _mesa_update_framebuffer(ctx, readFb, drawFb);
 
-   readFb = ctx->ReadBuffer;
-   drawFb = ctx->DrawBuffer;
+   /* Make sure drawFb has an initialized bounding box. */
+   _mesa_update_draw_buffer_bounds(ctx, drawFb);
 
    if (!readFb || !drawFb) {
       /* This will normally never happen but someday we may want to
@@ -192,12 +180,12 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
    if (drawFb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT ||
        readFb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
       _mesa_error(ctx, GL_INVALID_FRAMEBUFFER_OPERATION_EXT,
-                  "glBlitFramebufferEXT(incomplete draw/read buffers)");
+                  "%s(incomplete draw/read buffers)", func);
       return;
    }
 
    if (!is_valid_blit_filter(ctx, filter)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glBlitFramebufferEXT(%s)",
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid filter %s)", func,
                   _mesa_lookup_enum_by_nr(filter));
       return;
    }
@@ -205,13 +193,13 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
    if ((filter == GL_SCALED_RESOLVE_FASTEST_EXT ||
         filter == GL_SCALED_RESOLVE_NICEST_EXT) &&
         (readFb->Visual.samples == 0 || drawFb->Visual.samples > 0)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glBlitFramebufferEXT(%s)",
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(%s: invalid samples)", func,
                   _mesa_lookup_enum_by_nr(filter));
       return;
    }
 
    if (mask & ~legalMaskBits) {
-      _mesa_error( ctx, GL_INVALID_VALUE, "glBlitFramebufferEXT(mask)");
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(invalid mask bits set)", func);
       return;
    }
 
@@ -219,13 +207,13 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
    if ((mask & (GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT))
         && filter != GL_NEAREST) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-             "glBlitFramebufferEXT(depth/stencil requires GL_NEAREST filter)");
+             "%s(depth/stencil requires GL_NEAREST filter)", func);
       return;
    }
 
    /* get color read/draw renderbuffers */
    if (mask & GL_COLOR_BUFFER_BIT) {
-      const GLuint numColorDrawBuffers = ctx->DrawBuffer->_NumColorDrawBuffers;
+      const GLuint numColorDrawBuffers = drawFb->_NumColorDrawBuffers;
       const struct gl_renderbuffer *colorReadRb = readFb->_ColorReadBuffer;
       const struct gl_renderbuffer *colorDrawRb = NULL;
       GLuint i;
@@ -241,7 +229,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
       }
       else {
          for (i = 0; i < numColorDrawBuffers; i++) {
-            colorDrawRb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+            colorDrawRb = drawFb->_ColorDrawBuffers[i];
             if (!colorDrawRb)
                continue;
 
@@ -257,15 +245,15 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
              */
             if (_mesa_is_gles3(ctx) && (colorDrawRb == colorReadRb)) {
                _mesa_error(ctx, GL_INVALID_OPERATION,
-                           "glBlitFramebuffer(source and destination color "
-                           "buffer cannot be the same)");
+                           "%s(source and destination color "
+                           "buffer cannot be the same)", func);
                return;
             }
 
             if (!compatible_color_datatypes(colorReadRb->Format,
                                             colorDrawRb->Format)) {
                _mesa_error(ctx, GL_INVALID_OPERATION,
-                           "glBlitFramebufferEXT(color buffer datatypes mismatch)");
+                           "%s(color buffer datatypes mismatch)", func);
                return;
             }
             /* extra checks for multisample copies... */
@@ -273,7 +261,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
                /* color formats must match */
                if (!compatible_resolve_formats(colorReadRb, colorDrawRb)) {
                   _mesa_error(ctx, GL_INVALID_OPERATION,
-                         "glBlitFramebufferEXT(bad src/dst multisample pixel formats)");
+                         "%s(bad src/dst multisample pixel formats)", func);
                   return;
                }
             }
@@ -286,7 +274,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
             GLenum type = _mesa_get_format_datatype(colorReadRb->Format);
             if (type == GL_INT || type == GL_UNSIGNED_INT) {
                _mesa_error(ctx, GL_INVALID_OPERATION,
-                           "glBlitFramebufferEXT(integer color type)");
+                           "%s(integer color type)", func);
                return;
             }
          }
@@ -306,15 +294,15 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
        *     ignored."
        */
       if ((readRb == NULL) || (drawRb == NULL)) {
-	 mask &= ~GL_STENCIL_BUFFER_BIT;
+         mask &= ~GL_STENCIL_BUFFER_BIT;
       }
       else {
          int read_z_bits, draw_z_bits;
 
          if (_mesa_is_gles3(ctx) && (drawRb == readRb)) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glBlitFramebuffer(source and destination stencil "
-                        "buffer cannot be the same)");
+                        "%s(source and destination stencil "
+                        "buffer cannot be the same)", func);
             return;
          }
 
@@ -324,7 +312,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
              * there is only one: GL_UNSIGNED_INT.
              */
             _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glBlitFramebuffer(stencil attachment format mismatch)");
+                        "%s(stencil attachment format mismatch)", func);
             return;
          }
 
@@ -340,8 +328,8 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
               _mesa_get_format_datatype(readRb->Format) !=
               _mesa_get_format_datatype(drawRb->Format))) {
 
-            _mesa_error(ctx, GL_INVALID_OPERATION, "glBlitFramebuffer"
-                        "(stencil attachment depth format mismatch)");
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "%s(stencil attachment depth format mismatch)", func);
             return;
          }
       }
@@ -360,15 +348,15 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
        *     ignored."
        */
       if ((readRb == NULL) || (drawRb == NULL)) {
-	 mask &= ~GL_DEPTH_BUFFER_BIT;
+         mask &= ~GL_DEPTH_BUFFER_BIT;
       }
       else {
          int read_s_bit, draw_s_bit;
 
          if (_mesa_is_gles3(ctx) && (drawRb == readRb)) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glBlitFramebuffer(source and destination depth "
-                        "buffer cannot be the same)");
+                        "%s(source and destination depth "
+                        "buffer cannot be the same)", func);
             return;
          }
 
@@ -377,7 +365,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
              (_mesa_get_format_datatype(readRb->Format) !=
               _mesa_get_format_datatype(drawRb->Format))) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glBlitFramebuffer(depth attachment format mismatch)");
+                        "%s(depth attachment format mismatch)", func);
             return;
          }
 
@@ -389,8 +377,8 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
           * we should ignore the stencil format check.
           */
          if (read_s_bit > 0 && draw_s_bit > 0 && read_s_bit != draw_s_bit) {
-            _mesa_error(ctx, GL_INVALID_OPERATION, "glBlitFramebuffer"
-                        "(depth attachment stencil bits mismatch)");
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "%s(depth attachment stencil bits mismatch)", func);
             return;
          }
       }
@@ -406,7 +394,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
        */
       if (drawFb->Visual.samples > 0) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glBlitFramebuffer(destination samples must be 0)");
+                     "%s(destination samples must be 0)", func);
          return;
       }
 
@@ -426,7 +414,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
           && (srcX0 != dstX0 || srcY0 != dstY0
               || srcX1 != dstX1 || srcY1 != dstY1)) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glBlitFramebuffer(bad src/dst multisample region)");
+                     "%s(bad src/dst multisample region)", func);
          return;
       }
    } else {
@@ -434,7 +422,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
           drawFb->Visual.samples > 0 &&
           readFb->Visual.samples != drawFb->Visual.samples) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glBlitFramebufferEXT(mismatched samples)");
+                     "%s(mismatched samples)", func);
          return;
       }
 
@@ -445,7 +433,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
          if (abs(srcX1 - srcX0) != abs(dstX1 - dstX0) ||
              abs(srcY1 - srcY0) != abs(dstY1 - dstY0)) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glBlitFramebufferEXT(bad src/dst multisample region sizes)");
+                        "%s(bad src/dst multisample region sizes)", func);
             return;
          }
       }
@@ -457,43 +445,44 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
       const struct gl_renderbuffer *colorDrawRb = NULL;
       GLuint i = 0;
 
-      printf("glBlitFramebuffer(%d, %d, %d, %d,  %d, %d, %d, %d,"
-	     " 0x%x, 0x%x)\n",
-	     srcX0, srcY0, srcX1, srcY1,
-	     dstX0, dstY0, dstX1, dstY1,
-	     mask, filter);
+      printf("%s(%d, %d, %d, %d,  %d, %d, %d, %d,"
+             " 0x%x, 0x%x)\n", func,
+             srcX0, srcY0, srcX1, srcY1,
+             dstX0, dstY0, dstX1, dstY1,
+             mask, filter);
+
       if (colorReadRb) {
          const struct gl_renderbuffer_attachment *att;
 
          att = find_attachment(readFb, colorReadRb);
          printf("  Src FBO %u  RB %u (%dx%d)  ",
-		readFb->Name, colorReadRb->Name,
-		colorReadRb->Width, colorReadRb->Height);
+                readFb->Name, colorReadRb->Name,
+                colorReadRb->Width, colorReadRb->Height);
          if (att && att->Texture) {
             printf("Tex %u  tgt 0x%x  level %u  face %u",
-		   att->Texture->Name,
-		   att->Texture->Target,
-		   att->TextureLevel,
-		   att->CubeMapFace);
+                   att->Texture->Name,
+                   att->Texture->Target,
+                   att->TextureLevel,
+                   att->CubeMapFace);
          }
          printf("\n");
 
          /* Print all active color render buffers */
-         for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
-            colorDrawRb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+         for (i = 0; i < drawFb->_NumColorDrawBuffers; i++) {
+            colorDrawRb = drawFb->_ColorDrawBuffers[i];
             if (!colorDrawRb)
                continue;
 
             att = find_attachment(drawFb, colorDrawRb);
             printf("  Dst FBO %u  RB %u (%dx%d)  ",
-		   drawFb->Name, colorDrawRb->Name,
-		   colorDrawRb->Width, colorDrawRb->Height);
+                   drawFb->Name, colorDrawRb->Name,
+                   colorDrawRb->Width, colorDrawRb->Height);
             if (att && att->Texture) {
                printf("Tex %u  tgt 0x%x  level %u  face %u",
-		      att->Texture->Name,
-		      att->Texture->Target,
-		      att->TextureLevel,
-		      att->CubeMapFace);
+                      att->Texture->Name,
+                      att->Texture->Target,
+                      att->TextureLevel,
+                      att->CubeMapFace);
             }
             printf("\n");
          }
@@ -507,8 +496,87 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
    }
 
    assert(ctx->Driver.BlitFramebuffer);
-   ctx->Driver.BlitFramebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer,
+   ctx->Driver.BlitFramebuffer(ctx, readFb, drawFb,
                                srcX0, srcY0, srcX1, srcY1,
                                dstX0, dstY0, dstX1, dstY1,
                                mask, filter);
 }
+
+
+/**
+ * Blit rectangular region, optionally from one framebuffer to another.
+ *
+ * Note, if the src buffer is multisampled and the dest is not, this is
+ * when the samples must be resolved to a single color.
+ */
+void GLAPIENTRY
+_mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                      GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                      GLbitfield mask, GLenum filter)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx,
+                  "glBlitFramebuffer(%d, %d, %d, %d, "
+                  " %d, %d, %d, %d, 0x%x, %s)\n",
+                  srcX0, srcY0, srcX1, srcY1,
+                  dstX0, dstY0, dstX1, dstY1,
+                  mask, _mesa_lookup_enum_by_nr(filter));
+
+   _mesa_blit_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer,
+                          srcX0, srcY0, srcX1, srcY1,
+                          dstX0, dstY0, dstX1, dstY1,
+                          mask, filter, "glBlitFramebuffer");
+}
+
+
+void GLAPIENTRY
+_mesa_BlitNamedFramebuffer(GLuint readFramebuffer, GLuint drawFramebuffer,
+                           GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                           GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                           GLbitfield mask, GLenum filter)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *readFb, *drawFb;
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx,
+                  "glBlitNamedFramebuffer(%u %u %d, %d, %d, %d, "
+                  " %d, %d, %d, %d, 0x%x, %s)\n",
+                  readFramebuffer, drawFramebuffer,
+                  srcX0, srcY0, srcX1, srcY1,
+                  dstX0, dstY0, dstX1, dstY1,
+                  mask, _mesa_lookup_enum_by_nr(filter));
+
+   /*
+    * According to PDF page 533 of the OpenGL 4.5 core spec (30.10.2014,
+    * Section 18.3 Copying Pixels):
+    *   "... if readFramebuffer or drawFramebuffer is zero (for
+    *   BlitNamedFramebuffer), then the default read or draw framebuffer is
+    *   used as the corresponding source or destination framebuffer,
+    *   respectively."
+    */
+   if (readFramebuffer) {
+      readFb = _mesa_lookup_framebuffer_err(ctx, readFramebuffer,
+                                            "glBlitNamedFramebuffer");
+      if (!readFb)
+         return;
+   }
+   else
+      readFb = ctx->WinSysReadBuffer;
+
+   if (drawFramebuffer) {
+      drawFb = _mesa_lookup_framebuffer_err(ctx, drawFramebuffer,
+                                            "glBlitNamedFramebuffer");
+      if (!drawFb)
+         return;
+   }
+   else
+      drawFb = ctx->WinSysDrawBuffer;
+
+   _mesa_blit_framebuffer(ctx, readFb, drawFb,
+                          srcX0, srcY0, srcX1, srcY1,
+                          dstX0, dstY0, dstX1, dstY1,
+                          mask, filter, "glBlitNamedFramebuffer");
+}
diff --git a/src/mesa/main/blit.h b/src/mesa/main/blit.h
index 01a958af5a2..54b946e3192 100644
--- a/src/mesa/main/blit.h
+++ b/src/mesa/main/blit.h
@@ -28,11 +28,24 @@
 
 #include "glheader.h"
 
+extern void
+_mesa_blit_framebuffer(struct gl_context *ctx,
+                       struct gl_framebuffer *readFb,
+                       struct gl_framebuffer *drawFb,
+                       GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                       GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                       GLbitfield mask, GLenum filter, const char *func);
 
 extern void GLAPIENTRY
 _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
                          GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
                          GLbitfield mask, GLenum filter);
 
+extern void GLAPIENTRY
+_mesa_BlitNamedFramebuffer(GLuint readFramebuffer, GLuint drawFramebuffer,
+                           GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                           GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                           GLbitfield mask, GLenum filter);
+
 
 #endif /* BLIT_H */
diff --git a/src/mesa/main/buffers.c b/src/mesa/main/buffers.c
index 37a9790923b..0536266d756 100644
--- a/src/mesa/main/buffers.c
+++ b/src/mesa/main/buffers.c
@@ -242,16 +242,16 @@ read_buffer_enum_to_index(GLenum buffer)
  *
  * See the GL_EXT_framebuffer_object spec for more info.
  */
-void GLAPIENTRY
-_mesa_DrawBuffer(GLenum buffer)
+void
+_mesa_draw_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
+                  GLenum buffer, const char *caller)
 {
    GLbitfield destMask;
-   GET_CURRENT_CONTEXT(ctx);
 
    FLUSH_VERTICES(ctx, 0);
 
    if (MESA_VERBOSE & VERBOSE_API) {
-      _mesa_debug(ctx, "glDrawBuffer %s\n", _mesa_lookup_enum_by_nr(buffer));
+      _mesa_debug(ctx, "%s %s\n", caller, _mesa_lookup_enum_by_nr(buffer));
    }
 
    if (buffer == GL_NONE) {
@@ -259,33 +259,60 @@ _mesa_DrawBuffer(GLenum buffer)
    }
    else {
       const GLbitfield supportedMask
-         = supported_buffer_bitmask(ctx, ctx->DrawBuffer);
+         = supported_buffer_bitmask(ctx, fb);
       destMask = draw_buffer_enum_to_bitmask(ctx, buffer);
       if (destMask == BAD_MASK) {
          /* totally bogus buffer */
-         _mesa_error(ctx, GL_INVALID_ENUM,
-                     "glDrawBuffer(buffer=0x%x)", buffer);
+         _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)", caller,
+                     _mesa_lookup_enum_by_nr(buffer));
          return;
       }
       destMask &= supportedMask;
       if (destMask == 0x0) {
          /* none of the named color buffers exist! */
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glDrawBuffer(buffer=0x%x)", buffer);
+         _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid buffer %s)",
+                     caller, _mesa_lookup_enum_by_nr(buffer));
          return;
       }
    }
 
    /* if we get here, there's no error so set new state */
-   _mesa_drawbuffers(ctx, 1, &buffer, &destMask);
+   _mesa_drawbuffers(ctx, fb, 1, &buffer, &destMask);
+
+   /* Call device driver function only if fb is the bound draw buffer */
+   if (fb == ctx->DrawBuffer) {
+      if (ctx->Driver.DrawBuffers)
+         ctx->Driver.DrawBuffers(ctx, 1, &buffer);
+      else if (ctx->Driver.DrawBuffer)
+         ctx->Driver.DrawBuffer(ctx, buffer);
+   }
+}
 
-   /*
-    * Call device driver function.
-    */
-   if (ctx->Driver.DrawBuffers)
-      ctx->Driver.DrawBuffers(ctx, 1, &buffer);
-   else if (ctx->Driver.DrawBuffer)
-      ctx->Driver.DrawBuffer(ctx, buffer);
+
+void GLAPIENTRY
+_mesa_DrawBuffer(GLenum buffer)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   _mesa_draw_buffer(ctx, ctx->DrawBuffer, buffer, "glDrawBuffer");
+}
+
+
+void GLAPIENTRY
+_mesa_NamedFramebufferDrawBuffer(GLuint framebuffer, GLenum buf)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+
+   if (framebuffer) {
+      fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                                        "glNamedFramebufferDrawBuffer");
+      if (!fb)
+         return;
+   }
+   else
+      fb = ctx->WinSysDrawBuffer;
+
+   _mesa_draw_buffer(ctx, fb, buf, "glNamedFramebufferDrawBuffer");
 }
 
 
@@ -298,13 +325,13 @@ _mesa_DrawBuffer(GLenum buffer)
  *                 names cannot specify more than one buffer.  For example,
  *                 GL_FRONT_AND_BACK is illegal.
  */
-void GLAPIENTRY
-_mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
+void
+_mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb,
+                   GLsizei n, const GLenum *buffers, const char *caller)
 {
    GLuint output;
    GLbitfield usedBufferMask, supportedMask;
    GLbitfield destMask[MAX_DRAW_BUFFERS];
-   GET_CURRENT_CONTEXT(ctx);
 
    FLUSH_VERTICES(ctx, 0);
 
@@ -315,12 +342,18 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
     * "An INVALID_VALUE error is generated if n is greater than
     *  MAX_DRAW_BUFFERS."
     */
-   if (n < 0 || n > (GLsizei) ctx->Const.MaxDrawBuffers) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glDrawBuffersARB(n)");
+   if (n < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(n < 0)", caller);
+      return;
+   }
+
+   if (n > (GLsizei) ctx->Const.MaxDrawBuffers) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(n > maximum number of draw buffers)", caller);
       return;
    }
 
-   supportedMask = supported_buffer_bitmask(ctx, ctx->DrawBuffer);
+   supportedMask = supported_buffer_bitmask(ctx, fb);
    usedBufferMask = 0x0;
 
    /* From the ES 3.0 specification, page 180:
@@ -328,9 +361,9 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
     *  and the constant must be BACK or NONE."
     * (same restriction applies with GL_EXT_draw_buffers specification)
     */
-   if (ctx->API == API_OPENGLES2 && _mesa_is_winsys_fbo(ctx->DrawBuffer) &&
+   if (ctx->API == API_OPENGLES2 && _mesa_is_winsys_fbo(fb) &&
        (n != 1 || (buffers[0] != GL_NONE && buffers[0] != GL_BACK))) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glDrawBuffers(buffer)");
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid buffers)", caller);
       return;
    }
 
@@ -362,9 +395,11 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
           *     or equal to the value of MAX_COLOR_ATTACHMENTS, then the error
           *     INVALID_OPERATION results."
           */
-         if (_mesa_is_user_fbo(ctx->DrawBuffer) && buffers[output] >=
+         if (_mesa_is_user_fbo(fb) && buffers[output] >=
              GL_COLOR_ATTACHMENT0 + ctx->Const.MaxDrawBuffers) {
-            _mesa_error(ctx, GL_INVALID_OPERATION, "glDrawBuffersARB(buffer)");
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "%s(buffers[%d] >= maximum number of draw buffers)",
+                        caller, output);
             return;
          }
 
@@ -375,9 +410,10 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
           *  4.5 or 4.6.  Otherwise, an INVALID_ENUM error is generated.
           */
          if (destMask[output] == BAD_MASK) {
-            _mesa_error(ctx, GL_INVALID_ENUM, "glDrawBuffersARB(buffer)");
+            _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)",
+                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
             return;
-         }         
+         }
 
          /* From the OpenGL 4.0 specification, page 256:
           * "For both the default framebuffer and framebuffer objects, the
@@ -390,7 +426,8 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
           *  but the Khronos conformance tests expect INVALID_ENUM.
           */
          if (_mesa_bitcount(destMask[output]) > 1) {
-            _mesa_error(ctx, GL_INVALID_ENUM, "glDrawBuffersARB(buffer)");
+            _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)",
+                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
             return;
          }
 
@@ -407,7 +444,8 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
          destMask[output] &= supportedMask;
          if (destMask[output] == 0) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glDrawBuffersARB(unsupported buffer)");
+                        "%s(unsupported buffer %s)",
+                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
             return;
          }
 
@@ -416,10 +454,12 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
           *  in bufs must be COLOR_ATTACHMENTi or NONE. [...] INVALID_OPERATION."
           * (same restriction applies with GL_EXT_draw_buffers specification)
           */
-         if (ctx->API == API_OPENGLES2 && _mesa_is_user_fbo(ctx->DrawBuffer) &&
+         if (ctx->API == API_OPENGLES2 && _mesa_is_user_fbo(fb) &&
              buffers[output] != GL_NONE &&
              buffers[output] != GL_COLOR_ATTACHMENT0 + output) {
-            _mesa_error(ctx, GL_INVALID_OPERATION, "glDrawBuffers(buffer)");
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "%s(unsupported buffer %s)",
+                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
             return;
          }
 
@@ -430,7 +470,8 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
           */
          if (destMask[output] & usedBufferMask) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glDrawBuffersARB(duplicated buffer)");
+                        "%s(duplicated buffer %s)",
+                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
             return;
          }
 
@@ -440,17 +481,48 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
    }
 
    /* OK, if we get here, there were no errors so set the new state */
-   _mesa_drawbuffers(ctx, n, buffers, destMask);
+   _mesa_drawbuffers(ctx, fb, n, buffers, destMask);
 
    /*
-    * Call device driver function.  Note that n can be equal to 0,
+    * Call device driver function if fb is the bound draw buffer.
+    * Note that n can be equal to 0,
     * in which case we don't want to reference buffers[0], which
     * may not be valid.
     */
-   if (ctx->Driver.DrawBuffers)
-      ctx->Driver.DrawBuffers(ctx, n, buffers);
-   else if (ctx->Driver.DrawBuffer)
-      ctx->Driver.DrawBuffer(ctx, n > 0 ? buffers[0] : GL_NONE);
+   if (fb == ctx->DrawBuffer) {
+      if (ctx->Driver.DrawBuffers)
+         ctx->Driver.DrawBuffers(ctx, n, buffers);
+      else if (ctx->Driver.DrawBuffer)
+         ctx->Driver.DrawBuffer(ctx, n > 0 ? buffers[0] : GL_NONE);
+   }
+}
+
+
+void GLAPIENTRY
+_mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   _mesa_draw_buffers(ctx, ctx->DrawBuffer, n, buffers, "glDrawBuffers");
+}
+
+
+void GLAPIENTRY
+_mesa_NamedFramebufferDrawBuffers(GLuint framebuffer, GLsizei n,
+                                  const GLenum *bufs)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+
+   if (framebuffer) {
+      fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                                        "glNamedFramebufferDrawBuffers");
+      if (!fb)
+         return;
+   }
+   else
+      fb = ctx->WinSysDrawBuffer;
+
+   _mesa_draw_buffers(ctx, fb, n, bufs, "glNamedFramebufferDrawBuffers");
 }
 
 
@@ -459,13 +531,11 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
  * actual change.
  */
 static void
-updated_drawbuffers(struct gl_context *ctx)
+updated_drawbuffers(struct gl_context *ctx, struct gl_framebuffer *fb)
 {
    FLUSH_VERTICES(ctx, _NEW_BUFFERS);
 
    if (ctx->API == API_OPENGL_COMPAT && !ctx->Extensions.ARB_ES2_compatibility) {
-      struct gl_framebuffer *fb = ctx->DrawBuffer;
-
       /* Flag the FBO as requiring validation. */
       if (_mesa_is_user_fbo(fb)) {
 	 fb->_Status = 0;
@@ -482,6 +552,7 @@ updated_drawbuffers(struct gl_context *ctx)
  * so nothing should go wrong at this point.
  *
  * \param ctx  current context
+ * \param fb   the desired draw buffer
  * \param n    number of color outputs to set
  * \param buffers  array[n] of colorbuffer names, like GL_LEFT.
  * \param destMask  array[n] of BUFFER_BIT_* bitmasks which correspond to the
@@ -489,10 +560,9 @@ updated_drawbuffers(struct gl_context *ctx)
  *                  BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_BACK_LEFT).
  */
 void
-_mesa_drawbuffers(struct gl_context *ctx, GLuint n, const GLenum *buffers,
-                  const GLbitfield *destMask)
+_mesa_drawbuffers(struct gl_context *ctx, struct gl_framebuffer *fb,
+                  GLuint n, const GLenum *buffers, const GLbitfield *destMask)
 {
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
    GLbitfield mask[MAX_DRAW_BUFFERS];
    GLuint buf;
 
@@ -518,7 +588,7 @@ _mesa_drawbuffers(struct gl_context *ctx, GLuint n, const GLenum *buffers,
       while (destMask0) {
          GLint bufIndex = ffs(destMask0) - 1;
          if (fb->_ColorDrawBufferIndexes[count] != bufIndex) {
-            updated_drawbuffers(ctx);
+            updated_drawbuffers(ctx, fb);
             fb->_ColorDrawBufferIndexes[count] = bufIndex;
          }
          count++;
@@ -535,14 +605,14 @@ _mesa_drawbuffers(struct gl_context *ctx, GLuint n, const GLenum *buffers,
             /* only one bit should be set in the destMask[buf] field */
             assert(_mesa_bitcount(destMask[buf]) == 1);
             if (fb->_ColorDrawBufferIndexes[buf] != bufIndex) {
-	       updated_drawbuffers(ctx);
+	       updated_drawbuffers(ctx, fb);
                fb->_ColorDrawBufferIndexes[buf] = bufIndex;
             }
             count = buf + 1;
          }
          else {
             if (fb->_ColorDrawBufferIndexes[buf] != -1) {
-	       updated_drawbuffers(ctx);
+	       updated_drawbuffers(ctx, fb);
                fb->_ColorDrawBufferIndexes[buf] = -1;
             }
          }
@@ -554,7 +624,7 @@ _mesa_drawbuffers(struct gl_context *ctx, GLuint n, const GLenum *buffers,
    /* set remaining outputs to -1 (GL_NONE) */
    for (buf = fb->_NumColorDrawBuffers; buf < ctx->Const.MaxDrawBuffers; buf++) {
       if (fb->_ColorDrawBufferIndexes[buf] != -1) {
-         updated_drawbuffers(ctx);
+         updated_drawbuffers(ctx, fb);
          fb->_ColorDrawBufferIndexes[buf] = -1;
       }
    }
@@ -566,7 +636,7 @@ _mesa_drawbuffers(struct gl_context *ctx, GLuint n, const GLenum *buffers,
       /* also set context drawbuffer state */
       for (buf = 0; buf < ctx->Const.MaxDrawBuffers; buf++) {
          if (ctx->Color.DrawBuffer[buf] != fb->ColorDrawBuffer[buf]) {
-	    updated_drawbuffers(ctx);
+	    updated_drawbuffers(ctx, fb);
             ctx->Color.DrawBuffer[buf] = fb->ColorDrawBuffer[buf];
          }
       }
@@ -585,7 +655,7 @@ _mesa_update_draw_buffers(struct gl_context *ctx)
    /* should be a window system FBO */
    assert(_mesa_is_winsys_fbo(ctx->DrawBuffer));
 
-   _mesa_drawbuffers(ctx, ctx->Const.MaxDrawBuffers,
+   _mesa_drawbuffers(ctx, ctx->DrawBuffer, ctx->Const.MaxDrawBuffers,
                      ctx->Color.DrawBuffer, NULL);
 }
 
@@ -598,11 +668,10 @@ _mesa_update_draw_buffers(struct gl_context *ctx)
  * \param bufferIndex  the numerical index corresponding to 'buffer'
  */
 void
-_mesa_readbuffer(struct gl_context *ctx, GLenum buffer, GLint bufferIndex)
+_mesa_readbuffer(struct gl_context *ctx, struct gl_framebuffer *fb,
+                 GLenum buffer, GLint bufferIndex)
 {
-   struct gl_framebuffer *fb = ctx->ReadBuffer;
-
-   if (_mesa_is_winsys_fbo(fb)) {
+   if ((fb == ctx->ReadBuffer) && _mesa_is_winsys_fbo(fb)) {
       /* Only update the per-context READ_BUFFER state if we're bound to
        * a window-system framebuffer.
        */
@@ -621,23 +690,17 @@ _mesa_readbuffer(struct gl_context *ctx, GLenum buffer, GLint bufferIndex)
  * Called by glReadBuffer to set the source renderbuffer for reading pixels.
  * \param mode color buffer such as GL_FRONT, GL_BACK, etc.
  */
-void GLAPIENTRY
-_mesa_ReadBuffer(GLenum buffer)
+void
+_mesa_read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
+                  GLenum buffer, const char *caller)
 {
-   struct gl_framebuffer *fb;
    GLbitfield supportedMask;
    GLint srcBuffer;
-   GET_CURRENT_CONTEXT(ctx);
 
    FLUSH_VERTICES(ctx, 0);
 
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glReadBuffer %s\n", _mesa_lookup_enum_by_nr(buffer));
-
-   fb = ctx->ReadBuffer;
-
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glReadBuffer %s\n", _mesa_lookup_enum_by_nr(buffer));
+      _mesa_debug(ctx, "%s %s\n", caller, _mesa_lookup_enum_by_nr(buffer));
 
    if (buffer == GL_NONE) {
       /* This is legal--it means that no buffer should be bound for reading. */
@@ -648,24 +711,53 @@ _mesa_ReadBuffer(GLenum buffer)
       srcBuffer = read_buffer_enum_to_index(buffer);
       if (srcBuffer == -1) {
          _mesa_error(ctx, GL_INVALID_ENUM,
-                     "glReadBuffer(buffer=0x%x)", buffer);
+                     "%s(invalid buffer %s)", caller,
+                     _mesa_lookup_enum_by_nr(buffer));
          return;
       }
       supportedMask = supported_buffer_bitmask(ctx, fb);
       if (((1 << srcBuffer) & supportedMask) == 0) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glReadBuffer(buffer=0x%x)", buffer);
+                     "%s(invalid buffer %s)", caller,
+                     _mesa_lookup_enum_by_nr(buffer));
          return;
       }
    }
 
    /* OK, all error checking has been completed now */
 
-   _mesa_readbuffer(ctx, buffer, srcBuffer);
+   _mesa_readbuffer(ctx, fb, buffer, srcBuffer);
 
-   /*
-    * Call device driver function.
-    */
-   if (ctx->Driver.ReadBuffer)
-      (*ctx->Driver.ReadBuffer)(ctx, buffer);
+   /* Call the device driver function only if fb is the bound read buffer */
+   if (fb == ctx->ReadBuffer) {
+      if (ctx->Driver.ReadBuffer)
+         (*ctx->Driver.ReadBuffer)(ctx, buffer);
+   }
+}
+
+
+void GLAPIENTRY
+_mesa_ReadBuffer(GLenum buffer)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   _mesa_read_buffer(ctx, ctx->ReadBuffer, buffer, "glReadBuffer");
+}
+
+
+void GLAPIENTRY
+_mesa_NamedFramebufferReadBuffer(GLuint framebuffer, GLenum src)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+
+   if (framebuffer) {
+      fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                                        "glNamedFramebufferReadBuffer");
+      if (!fb)
+         return;
+   }
+   else
+      fb = ctx->WinSysReadBuffer;
+
+   _mesa_read_buffer(ctx, fb, src, "glNamedFramebufferReadBuffer");
 }
diff --git a/src/mesa/main/buffers.h b/src/mesa/main/buffers.h
index ebcfa1c1e74..5aa79fda54b 100644
--- a/src/mesa/main/buffers.h
+++ b/src/mesa/main/buffers.h
@@ -36,26 +36,51 @@
 #include "glheader.h"
 
 struct gl_context;
+struct gl_framebuffer;
+
+extern void
+_mesa_draw_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
+                  GLenum buffer, const char *caller);
 
 extern void GLAPIENTRY
 _mesa_DrawBuffer( GLenum mode );
 
 extern void GLAPIENTRY
+_mesa_NamedFramebufferDrawBuffer(GLuint framebuffer, GLenum buf);
+
+extern void
+_mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb,
+                   GLsizei n, const GLenum *buffers, const char *caller);
+
+extern void GLAPIENTRY
 _mesa_DrawBuffers(GLsizei n, const GLenum *buffers);
 
+extern void GLAPIENTRY
+_mesa_NamedFramebufferDrawBuffers(GLuint framebuffer, GLsizei n,
+                                  const GLenum *bufs);
+
 extern void
-_mesa_drawbuffers(struct gl_context *ctx, GLuint n, const GLenum *buffers,
+_mesa_drawbuffers(struct gl_context *ctx, struct gl_framebuffer *fb,
+                  GLuint n, const GLenum *buffers,
                   const GLbitfield *destMask);
 
 extern void
-_mesa_readbuffer(struct gl_context *ctx, GLenum buffer, GLint bufferIndex);
+_mesa_readbuffer(struct gl_context *ctx, struct gl_framebuffer *fb,
+                 GLenum buffer, GLint bufferIndex);
 
 extern void
 _mesa_update_draw_buffers(struct gl_context *ctx);
 
 
+extern void
+_mesa_read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
+                  GLenum buffer, const char *caller);
+
 extern void GLAPIENTRY
 _mesa_ReadBuffer( GLenum mode );
 
+extern void GLAPIENTRY
+_mesa_NamedFramebufferReadBuffer(GLuint framebuffer, GLenum src);
+
 
 #endif
diff --git a/src/mesa/main/clear.c b/src/mesa/main/clear.c
index 8d707bc34a1..426caea4709 100644
--- a/src/mesa/main/clear.c
+++ b/src/mesa/main/clear.c
@@ -34,6 +34,8 @@
 #include "clear.h"
 #include "context.h"
 #include "enums.h"
+#include "fbobject.h"
+#include "get.h"
 #include "macros.h"
 #include "mtypes.h"
 #include "state.h"
@@ -400,6 +402,24 @@ _mesa_ClearBufferiv(GLenum buffer, GLint drawbuffer, const GLint *value)
 
 
 /**
+ * The ClearBuffer framework is so complicated and so riddled with the
+ * assumption that the framebuffer is bound that, for now, we will just fake
+ * direct state access clearing for the user.
+ */
+void GLAPIENTRY
+_mesa_ClearNamedFramebufferiv(GLuint framebuffer, GLenum buffer,
+                              GLint drawbuffer, const GLint *value)
+{
+   GLint oldfb;
+
+   _mesa_GetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &oldfb);
+   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer);
+   _mesa_ClearBufferiv(buffer, drawbuffer, value);
+   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, (GLuint) oldfb);
+}
+
+
+/**
  * New in GL 3.0
  * Clear unsigned integer color buffer (not depth, not stencil).
  */
@@ -472,6 +492,24 @@ _mesa_ClearBufferuiv(GLenum buffer, GLint drawbuffer, const GLuint *value)
 
 
 /**
+ * The ClearBuffer framework is so complicated and so riddled with the
+ * assumption that the framebuffer is bound that, for now, we will just fake
+ * direct state access clearing for the user.
+ */
+void GLAPIENTRY
+_mesa_ClearNamedFramebufferuiv(GLuint framebuffer, GLenum buffer,
+                               GLint drawbuffer, const GLuint *value)
+{
+   GLint oldfb;
+
+   _mesa_GetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &oldfb);
+   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer);
+   _mesa_ClearBufferuiv(buffer, drawbuffer, value);
+   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, (GLuint) oldfb);
+}
+
+
+/**
  * New in GL 3.0
  * Clear fixed-pt or float color buffer or depth buffer (not stencil).
  */
@@ -565,6 +603,24 @@ _mesa_ClearBufferfv(GLenum buffer, GLint drawbuffer, const GLfloat *value)
 
 
 /**
+ * The ClearBuffer framework is so complicated and so riddled with the
+ * assumption that the framebuffer is bound that, for now, we will just fake
+ * direct state access clearing for the user.
+ */
+void GLAPIENTRY
+_mesa_ClearNamedFramebufferfv(GLuint framebuffer, GLenum buffer,
+                              GLint drawbuffer, const GLfloat *value)
+{
+   GLint oldfb;
+
+   _mesa_GetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &oldfb);
+   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer);
+   _mesa_ClearBufferfv(buffer, drawbuffer, value);
+   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, (GLuint) oldfb);
+}
+
+
+/**
  * New in GL 3.0
  * Clear depth/stencil buffer only.
  */
@@ -626,3 +682,21 @@ _mesa_ClearBufferfi(GLenum buffer, GLint drawbuffer,
       ctx->Stencil.Clear = clearStencilSave;
    }
 }
+
+
+/**
+ * The ClearBuffer framework is so complicated and so riddled with the
+ * assumption that the framebuffer is bound that, for now, we will just fake
+ * direct state access clearing for the user.
+ */
+void GLAPIENTRY
+_mesa_ClearNamedFramebufferfi(GLuint framebuffer, GLenum buffer,
+                              GLfloat depth, GLint stencil)
+{
+   GLint oldfb;
+
+   _mesa_GetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &oldfb);
+   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer);
+   _mesa_ClearBufferfi(buffer, 0, depth, stencil);
+   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, (GLuint) oldfb);
+}
diff --git a/src/mesa/main/clear.h b/src/mesa/main/clear.h
index 96ce47b929e..c29850676ca 100644
--- a/src/mesa/main/clear.h
+++ b/src/mesa/main/clear.h
@@ -52,13 +52,29 @@ extern void GLAPIENTRY
 _mesa_ClearBufferiv(GLenum buffer, GLint drawbuffer, const GLint *value);
 
 extern void GLAPIENTRY
+_mesa_ClearNamedFramebufferiv(GLuint framebuffer, GLenum buffer,
+                              GLint drawbuffer, const GLint *value);
+
+extern void GLAPIENTRY
 _mesa_ClearBufferuiv(GLenum buffer, GLint drawbuffer, const GLuint *value);
 
 extern void GLAPIENTRY
+_mesa_ClearNamedFramebufferuiv(GLuint framebuffer, GLenum buffer,
+                               GLint drawbuffer, const GLuint *value);
+
+extern void GLAPIENTRY
 _mesa_ClearBufferfv(GLenum buffer, GLint drawbuffer, const GLfloat *value);
 
 extern void GLAPIENTRY
+_mesa_ClearNamedFramebufferfv(GLuint framebuffer, GLenum buffer,
+                              GLint drawbuffer, const GLfloat *value);
+
+extern void GLAPIENTRY
 _mesa_ClearBufferfi(GLenum buffer, GLint drawbuffer,
                     GLfloat depth, GLint stencil);
 
+extern void GLAPIENTRY
+_mesa_ClearNamedFramebufferfi(GLuint framebuffer, GLenum buffer,
+                              GLfloat depth, GLint stencil);
+
 #endif
diff --git a/src/mesa/main/config.h b/src/mesa/main/config.h
index 5a66a4eec90..9c3baf4c6aa 100644
--- a/src/mesa/main/config.h
+++ b/src/mesa/main/config.h
@@ -213,19 +213,10 @@
 /** For GL_ARB_fragment_program */
 /*@{*/
 #define MAX_FRAGMENT_PROGRAM_ADDRESS_REGS 0
+#define MAX_FRAGMENT_PROGRAM_PARAMS       64
+#define MAX_FRAGMENT_PROGRAM_INPUTS       12
 /*@}*/
 
-/** For GL_NV_fragment_program */
-/*@{*/
-#define MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS 1024 /* 72 for GL_ARB_f_p */
-#define MAX_NV_FRAGMENT_PROGRAM_TEMPS         96
-#define MAX_NV_FRAGMENT_PROGRAM_PARAMS        64
-#define MAX_NV_FRAGMENT_PROGRAM_INPUTS        12
-#define MAX_NV_FRAGMENT_PROGRAM_OUTPUTS        3
-#define MAX_NV_FRAGMENT_PROGRAM_WRITE_ONLYS    2
-/*@}*/
-
-
 /** For GL_ARB_vertex_shader */
 /*@{*/
 #define MAX_VERTEX_GENERIC_ATTRIBS 16
diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index 0a192de8c0a..79fa01849e0 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -489,8 +489,8 @@ init_program_limits(struct gl_constants *consts, gl_shader_stage stage,
       prog->MaxOutputComponents = 16 * 4; /* old limit not to break tnl and swrast */
       break;
    case MESA_SHADER_FRAGMENT:
-      prog->MaxParameters = MAX_NV_FRAGMENT_PROGRAM_PARAMS;
-      prog->MaxAttribs = MAX_NV_FRAGMENT_PROGRAM_INPUTS;
+      prog->MaxParameters = MAX_FRAGMENT_PROGRAM_PARAMS;
+      prog->MaxAttribs = MAX_FRAGMENT_PROGRAM_INPUTS;
       prog->MaxAddressRegs = MAX_FRAGMENT_PROGRAM_ADDRESS_REGS;
       prog->MaxUniformComponents = 4 * MAX_UNIFORMS;
       prog->MaxInputComponents = 16 * 4; /* old limit not to break tnl and swrast */
@@ -883,6 +883,19 @@ update_default_objects(struct gl_context *ctx)
 }
 
 
+/* XXX this is temporary and should be removed at some point in the
+ * future when there's a reasonable expectation that the libGL library
+ * contains the _glapi_new_nop_table() and _glapi_set_nop_handler()
+ * functions which were added in Mesa 10.6.
+ */
+#if !defined(_WIN32)
+/* Avoid libGL / driver ABI break */
+#define USE_GLAPI_NOP_FEATURES 0
+#else
+#define USE_GLAPI_NOP_FEATURES 1
+#endif
+
+
 /**
  * This function is called by the glapi no-op functions.  For each OpenGL
  * function/entrypoint there's a simple no-op function.  These "no-op"
@@ -898,6 +911,7 @@ update_default_objects(struct gl_context *ctx)
  *
  * \param name  the name of the OpenGL function
  */
+#if USE_GLAPI_NOP_FEATURES
 static void
 nop_handler(const char *name)
 {
@@ -914,6 +928,7 @@ nop_handler(const char *name)
    }
 #endif
 }
+#endif
 
 
 /**
@@ -923,12 +938,52 @@ nop_handler(const char *name)
 static void GLAPIENTRY
 nop_glFlush(void)
 {
-   /* don't record an error like we do in _mesa_generic_nop() */
+   /* don't record an error like we do in nop_handler() */
+}
+#endif
+
+
+#if !USE_GLAPI_NOP_FEATURES
+static int
+generic_nop(void)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   _mesa_error(ctx, GL_INVALID_OPERATION,
+               "unsupported function called "
+               "(unsupported extension or deprecated function?)");
+   return 0;
 }
 #endif
 
 
 /**
+ * Create a new API dispatch table in which all entries point to the
+ * generic_nop() function.  This will not work on Windows because of
+ * the __stdcall convention which requires the callee to clean up the
+ * call stack.  That's impossible with one generic no-op function.
+ */
+struct _glapi_table *
+_mesa_new_nop_table(unsigned numEntries)
+{
+   struct _glapi_table *table;
+
+#if !USE_GLAPI_NOP_FEATURES
+   table = malloc(numEntries * sizeof(_glapi_proc));
+   if (table) {
+      _glapi_proc *entry = (_glapi_proc *) table;
+      unsigned i;
+      for (i = 0; i < numEntries; i++) {
+         entry[i] = (_glapi_proc) generic_nop;
+      }
+   }
+#else
+   table = _glapi_new_nop_table(numEntries);
+#endif
+   return table;
+}
+
+
+/**
  * Allocate and initialize a new dispatch table.  The table will be
  * populated with pointers to "no-op" functions.  In turn, the no-op
  * functions will call nop_handler() above.
@@ -941,8 +996,9 @@ alloc_dispatch_table(void)
     * Mesa we do this to accommodate different versions of libGL and various
     * DRI drivers.
     */
-   GLint numEntries = MAX2(_glapi_get_dispatch_table_size(), _gloffset_COUNT);
-   struct _glapi_table *table = _glapi_new_nop_table(numEntries);
+   int numEntries = MAX2(_glapi_get_dispatch_table_size(), _gloffset_COUNT);
+
+   struct _glapi_table *table = _mesa_new_nop_table(numEntries);
 
 #if defined(_WIN32)
    if (table) {
@@ -966,7 +1022,9 @@ alloc_dispatch_table(void)
    }
 #endif
 
+#if USE_GLAPI_NOP_FEATURES
    _glapi_set_nop_handler(nop_handler);
+#endif
 
    return table;
 }
@@ -1111,9 +1169,7 @@ _mesa_initialize_context(struct gl_context *ctx,
       ctx->HasConfig = GL_FALSE;
    }
 
-   if (_mesa_is_desktop_gl(ctx)) {
-      _mesa_override_gl_version(ctx);
-   }
+   _mesa_override_gl_version(ctx);
 
    /* misc one-time initializations */
    one_time_init(ctx);
@@ -1275,7 +1331,6 @@ _mesa_free_context_data( struct gl_context *ctx )
    _mesa_reference_vertprog(ctx, &ctx->VertexProgram._Current, NULL);
    _mesa_reference_vertprog(ctx, &ctx->VertexProgram._TnlProgram, NULL);
 
-   _mesa_reference_geomprog(ctx, &ctx->GeometryProgram.Current, NULL);
    _mesa_reference_geomprog(ctx, &ctx->GeometryProgram._Current, NULL);
 
    _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current, NULL);
@@ -1565,7 +1620,8 @@ handle_first_current(struct gl_context *ctx)
          else
             buffer = GL_FRONT;
 
-         _mesa_drawbuffers(ctx, 1, &buffer, NULL /* destMask */);
+         _mesa_drawbuffers(ctx, ctx->DrawBuffer, 1, &buffer,
+                           NULL /* destMask */);
       }
 
       if (ctx->ReadBuffer != _mesa_get_incomplete_framebuffer()) {
@@ -1578,7 +1634,7 @@ handle_first_current(struct gl_context *ctx)
             bufferIndex = BUFFER_FRONT_LEFT;
          }
 
-         _mesa_readbuffer(ctx, buffer, bufferIndex);
+         _mesa_readbuffer(ctx, ctx->ReadBuffer, buffer, bufferIndex);
       }
    }
 
diff --git a/src/mesa/main/copyimage.c b/src/mesa/main/copyimage.c
index fd22f28892c..e8732c6175b 100644
--- a/src/mesa/main/copyimage.c
+++ b/src/mesa/main/copyimage.c
@@ -40,14 +40,25 @@ enum mesa_block_class {
    BLOCK_CLASS_64_BITS
 };
 
+/**
+ * Prepare the source or destination resource, including:
+ * - Error checking
+ * - Creating texture wrappers for renderbuffers
+ * \param name  the texture or renderbuffer name
+ * \param target  GL_TEXTURE target or GL_RENDERBUFFER.  For the later, will
+ *                be changed to a compatible GL_TEXTURE target.
+ * \param level  mipmap level
+ * \param tex_obj  returns a pointer to a texture object
+ * \param tex_image  returns a pointer to a texture image
+ * \param tmp_tex  returns temporary texture object name
+ * \return true if success, false if error
+ */
 static bool
 prepare_target(struct gl_context *ctx, GLuint name, GLenum *target, int level,
                struct gl_texture_object **tex_obj,
                struct gl_texture_image **tex_image, GLuint *tmp_tex,
                const char *dbg_prefix)
 {
-   struct gl_renderbuffer *rb;
-
    if (name == 0) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "glCopyImageSubData(%sName = %d)", dbg_prefix, name);
@@ -87,7 +98,7 @@ prepare_target(struct gl_context *ctx, GLuint name, GLenum *target, int level,
    }
 
    if (*target == GL_RENDERBUFFER) {
-      rb = _mesa_lookup_renderbuffer(ctx, name);
+      struct gl_renderbuffer *rb = _mesa_lookup_renderbuffer(ctx, name);
       if (!rb) {
          _mesa_error(ctx, GL_INVALID_VALUE,
                      "glCopyImageSubData(%sName = %u)", dbg_prefix, name);
@@ -169,8 +180,15 @@ prepare_target(struct gl_context *ctx, GLuint name, GLenum *target, int level,
    return true;
 }
 
+
+/**
+ * Check that the x,y,z,width,height,region is within the texture image
+ * dimensions.
+ * \return true if bounds OK, false if regions is out of bounds
+ */
 static bool
-check_region_bounds(struct gl_context *ctx, struct gl_texture_image *tex_image,
+check_region_bounds(struct gl_context *ctx,
+                    const struct gl_texture_image *tex_image,
                     int x, int y, int z, int width, int height, int depth,
                     const char *dbg_prefix)
 {
@@ -188,6 +206,7 @@ check_region_bounds(struct gl_context *ctx, struct gl_texture_image *tex_image,
       return false;
    }
 
+   /* Check X direction */
    if (x + width > tex_image->Width) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "glCopyImageSubData(%sX or %sWidth exceeds image bounds)",
@@ -195,6 +214,7 @@ check_region_bounds(struct gl_context *ctx, struct gl_texture_image *tex_image,
       return false;
    }
 
+   /* Check Y direction */
    switch (tex_image->TexObject->Target) {
    case GL_TEXTURE_1D:
    case GL_TEXTURE_1D_ARRAY:
@@ -215,6 +235,7 @@ check_region_bounds(struct gl_context *ctx, struct gl_texture_image *tex_image,
       break;
    }
 
+   /* Check Z direction */
    switch (tex_image->TexObject->Target) {
    case GL_TEXTURE_1D:
    case GL_TEXTURE_2D:
@@ -260,7 +281,7 @@ check_region_bounds(struct gl_context *ctx, struct gl_texture_image *tex_image,
 }
 
 static bool
-compressed_format_compatible(struct gl_context *ctx,
+compressed_format_compatible(const struct gl_context *ctx,
                              GLenum compressedFormat, GLenum otherFormat)
 {
    enum mesa_block_class compressedClass, otherClass;
@@ -348,8 +369,8 @@ compressed_format_compatible(struct gl_context *ctx,
 }
 
 static bool
-copy_format_compatible(struct gl_context *ctx,
-                                GLenum srcFormat, GLenum dstFormat)
+copy_format_compatible(const struct gl_context *ctx,
+                       GLenum srcFormat, GLenum dstFormat)
 {
    /*
     * From ARB_copy_image spec:
@@ -389,7 +410,7 @@ _mesa_CopyImageSubData(GLuint srcName, GLenum srcTarget, GLint srcLevel,
    struct gl_texture_object *srcTexObj, *dstTexObj;
    struct gl_texture_image *srcTexImage, *dstTexImage;
    GLuint src_bw, src_bh, dst_bw, dst_bh;
-   int i, srcNewZ, dstNewZ;
+   int i;
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glCopyImageSubData(%u, %s, %d, %d, %d, %d, "
@@ -447,6 +468,8 @@ _mesa_CopyImageSubData(GLuint srcName, GLenum srcTarget, GLint srcLevel,
    }
 
    for (i = 0; i < srcDepth; ++i) {
+      int srcNewZ, dstNewZ;
+
       if (srcTexObj->Target == GL_TEXTURE_CUBE_MAP) {
          srcTexImage = srcTexObj->Image[i + srcZ][srcLevel];
          srcNewZ = 0;
diff --git a/src/mesa/main/depth.c b/src/mesa/main/depth.c
index 29851ecb8a4..bb4591cf152 100644
--- a/src/mesa/main/depth.c
+++ b/src/mesa/main/depth.c
@@ -65,6 +65,9 @@ _mesa_DepthFunc( GLenum func )
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glDepthFunc %s\n", _mesa_lookup_enum_by_nr(func));
 
+   if (ctx->Depth.Func == func)
+      return;
+
    switch (func) {
    case GL_LESS:    /* (default) pass if incoming z < stored z */
    case GL_GEQUAL:
@@ -80,9 +83,6 @@ _mesa_DepthFunc( GLenum func )
       return;
    }
 
-   if (ctx->Depth.Func == func)
-      return;
-
    FLUSH_VERTICES(ctx, _NEW_DEPTH);
    ctx->Depth.Func = func;
 
diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c
index 431c4b48b79..aafe486fb60 100644
--- a/src/mesa/main/dlist.c
+++ b/src/mesa/main/dlist.c
@@ -7592,28 +7592,6 @@ save_FramebufferTexture(GLenum target, GLenum attachment,
    }
 }
 
-static void GLAPIENTRY
-save_FramebufferTextureFace(GLenum target, GLenum attachment,
-                            GLuint texture, GLint level, GLenum face)
-{
-   Node *n;
-   GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
-   n = alloc_instruction(ctx, OPCODE_FRAMEBUFFER_TEXTURE_FACE, 5);
-   if (n) {
-      n[1].e = target;
-      n[2].e = attachment;
-      n[3].ui = texture;
-      n[4].i = level;
-      n[5].e = face;
-   }
-   if (ctx->ExecuteFlag) {
-      CALL_FramebufferTextureFaceARB(ctx->Exec, (target, attachment, texture,
-                                                 level, face));
-   }
-}
-
-
 
 static void GLAPIENTRY
 save_WaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout)
@@ -8873,11 +8851,6 @@ execute_list(struct gl_context *ctx, GLuint list)
             CALL_FramebufferTexture(ctx->Exec, (n[1].e, n[2].e,
                                                    n[3].ui, n[4].i));
             break;
-         case OPCODE_FRAMEBUFFER_TEXTURE_FACE:
-            CALL_FramebufferTextureFaceARB(ctx->Exec, (n[1].e, n[2].e,
-                                                       n[3].ui, n[4].i, n[5].e));
-            break;
-
          /* GL_ARB_sync */
          case OPCODE_WAIT_SYNC:
             {
@@ -9644,10 +9617,9 @@ _mesa_initialize_save_table(const struct gl_context *ctx)
    SET_BlendEquationiARB(table, save_BlendEquationi);
    SET_BlendEquationSeparateiARB(table, save_BlendEquationSeparatei);
 
-   /* GL_ARB_geometry_shader4 */
+   /* OpenGL 3.2 */
    SET_ProgramParameteri(table, save_ProgramParameteri);
    SET_FramebufferTexture(table, save_FramebufferTexture);
-   SET_FramebufferTextureFaceARB(table, save_FramebufferTextureFace);
 
    /* GL_NV_conditional_render */
    SET_BeginConditionalRender(table, save_BeginConditionalRender);
diff --git a/src/mesa/main/errors.c b/src/mesa/main/errors.c
index 2aa1deb635f..b3406665d94 100644
--- a/src/mesa/main/errors.c
+++ b/src/mesa/main/errors.c
@@ -39,6 +39,7 @@
 #include "mtypes.h"
 #include "version.h"
 #include "util/hash_table.h"
+#include "util/simple_list.h"
 
 static mtx_t DynamicIDMutex = _MTX_INITIALIZER_NP;
 static GLuint NextDynamicID = 1;
@@ -1412,6 +1413,26 @@ should_output(struct gl_context *ctx, GLenum error, const char *fmtString)
 
 
 void
+_mesa_gl_vdebug(struct gl_context *ctx,
+                GLuint *id,
+                enum mesa_debug_source source,
+                enum mesa_debug_type type,
+                enum mesa_debug_severity severity,
+                const char *fmtString,
+                va_list args)
+{
+   char s[MAX_DEBUG_MESSAGE_LENGTH];
+   int len;
+
+   debug_get_id(id);
+
+   len = _mesa_vsnprintf(s, MAX_DEBUG_MESSAGE_LENGTH, fmtString, args);
+
+   log_msg(ctx, source, type, *id, severity, len, s);
+}
+
+
+void
 _mesa_gl_debug(struct gl_context *ctx,
                GLuint *id,
                enum mesa_debug_source source,
@@ -1419,17 +1440,10 @@ _mesa_gl_debug(struct gl_context *ctx,
                enum mesa_debug_severity severity,
                const char *fmtString, ...)
 {
-   char s[MAX_DEBUG_MESSAGE_LENGTH];
-   int len;
    va_list args;
-
-   debug_get_id(id);
-
    va_start(args, fmtString);
-   len = _mesa_vsnprintf(s, MAX_DEBUG_MESSAGE_LENGTH, fmtString, args);
+   _mesa_gl_vdebug(ctx, id, source, type, severity, fmtString, args);
    va_end(args);
-
-   log_msg(ctx, source, type, *id, severity, len, s);
 }
 
 
diff --git a/src/mesa/main/errors.h b/src/mesa/main/errors.h
index e6dc9b5f1b9..24f234f7f10 100644
--- a/src/mesa/main/errors.h
+++ b/src/mesa/main/errors.h
@@ -76,6 +76,15 @@ extern FILE *
 _mesa_get_log_file(void);
 
 extern void
+_mesa_gl_vdebug(struct gl_context *ctx,
+                GLuint *id,
+                enum mesa_debug_source source,
+                enum mesa_debug_type type,
+                enum mesa_debug_severity severity,
+                const char *fmtString,
+                va_list args);
+
+extern void
 _mesa_gl_debug(struct gl_context *ctx,
                GLuint *id,
                enum mesa_debug_source source,
diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index f7ce0642aef..4176a69ed7c 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -104,7 +104,7 @@ static const struct extension extension_table[] = {
    { "GL_ARB_depth_clamp",                         o(ARB_depth_clamp),                         GL,             2003 },
    { "GL_ARB_depth_texture",                       o(ARB_depth_texture),                       GLL,            2001 },
    { "GL_ARB_derivative_control",                  o(ARB_derivative_control),                  GL,             2014 },
-   { "GL_ARB_direct_state_access",                 o(dummy_false),                             GL,             2014 },
+   { "GL_ARB_direct_state_access",                 o(dummy_true),                              GLC,            2014 },
    { "GL_ARB_draw_buffers",                        o(dummy_true),                              GL,             2002 },
    { "GL_ARB_draw_buffers_blend",                  o(ARB_draw_buffers_blend),                  GL,             2009 },
    { "GL_ARB_draw_elements_base_vertex",           o(ARB_draw_elements_base_vertex),           GL,             2009 },
@@ -117,6 +117,7 @@ static const struct extension extension_table[] = {
    { "GL_ARB_fragment_program",                    o(ARB_fragment_program),                    GLL,            2002 },
    { "GL_ARB_fragment_program_shadow",             o(ARB_fragment_program_shadow),             GLL,            2003 },
    { "GL_ARB_fragment_shader",                     o(ARB_fragment_shader),                     GL,             2002 },
+   { "GL_ARB_framebuffer_no_attachments",          o(ARB_framebuffer_no_attachments),          GL,             2012 },
    { "GL_ARB_framebuffer_object",                  o(ARB_framebuffer_object),                  GL,             2005 },
    { "GL_ARB_framebuffer_sRGB",                    o(EXT_framebuffer_sRGB),                    GL,             1998 },
    { "GL_ARB_get_program_binary",                  o(dummy_true),                              GL,             2010 },
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index 27cf97f1778..f8dcf122d99 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -121,6 +121,27 @@ _mesa_lookup_renderbuffer(struct gl_context *ctx, GLuint id)
 
 
 /**
+ * A convenience function for direct state access that throws
+ * GL_INVALID_OPERATION if the renderbuffer doesn't exist.
+ */
+struct gl_renderbuffer *
+_mesa_lookup_renderbuffer_err(struct gl_context *ctx, GLuint id,
+                              const char *func)
+{
+   struct gl_renderbuffer *rb;
+
+   rb = _mesa_lookup_renderbuffer(ctx, id);
+   if (!rb || rb == &DummyRenderbuffer) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(non-existent renderbuffer %u)", func, id);
+      return NULL;
+   }
+
+   return rb;
+}
+
+
+/**
  * Helper routine for getting a gl_framebuffer.
  */
 struct gl_framebuffer *
@@ -138,6 +159,27 @@ _mesa_lookup_framebuffer(struct gl_context *ctx, GLuint id)
 
 
 /**
+ * A convenience function for direct state access that throws
+ * GL_INVALID_OPERATION if the framebuffer doesn't exist.
+ */
+struct gl_framebuffer *
+_mesa_lookup_framebuffer_err(struct gl_context *ctx, GLuint id,
+                             const char *func)
+{
+   struct gl_framebuffer *fb;
+
+   fb = _mesa_lookup_framebuffer(ctx, id);
+   if (!fb || fb == &DummyFramebuffer) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(non-existent framebuffer %u)", func, id);
+      return NULL;
+   }
+
+   return fb;
+}
+
+
+/**
  * Mark the given framebuffer as invalid.  This will force the
  * test for framebuffer completeness to be done before the framebuffer
  * is used.
@@ -423,7 +465,7 @@ set_texture_attachment(struct gl_context *ctx,
                        struct gl_framebuffer *fb,
                        struct gl_renderbuffer_attachment *att,
                        struct gl_texture_object *texObj,
-                       GLenum texTarget, GLuint level, GLuint zoffset,
+                       GLenum texTarget, GLuint level, GLuint layer,
                        GLboolean layered)
 {
    struct gl_renderbuffer *rb = att->Renderbuffer;
@@ -447,7 +489,7 @@ set_texture_attachment(struct gl_context *ctx,
    /* always update these fields */
    att->TextureLevel = level;
    att->CubeMapFace = _mesa_tex_target_to_face(texTarget);
-   att->Zoffset = zoffset;
+   att->Zoffset = layer;
    att->Layered = layered;
    att->Complete = GL_FALSE;
 
@@ -479,9 +521,10 @@ set_renderbuffer_attachment(struct gl_context *ctx,
  * Attach a renderbuffer object to a framebuffer object.
  */
 void
-_mesa_framebuffer_renderbuffer(struct gl_context *ctx,
-                               struct gl_framebuffer *fb,
-                               GLenum attachment, struct gl_renderbuffer *rb)
+_mesa_FramebufferRenderbuffer_sw(struct gl_context *ctx,
+                                 struct gl_framebuffer *fb,
+                                 GLenum attachment,
+                                 struct gl_renderbuffer *rb)
 {
    struct gl_renderbuffer_attachment *att;
 
@@ -914,6 +957,7 @@ _mesa_test_framebuffer_completeness(struct gl_context *ctx,
    fb->Height = 0;
    fb->_AllColorBuffersFixedPoint = GL_TRUE;
    fb->_HasSNormOrFloatColorBuffer = GL_FALSE;
+   fb->_HasAttachments = true;
 
    /* Start at -2 to more easily loop over all attachment points.
     *  -2: depth buffer
@@ -1112,14 +1156,48 @@ _mesa_test_framebuffer_completeness(struct gl_context *ctx,
       } else if (att_layer_count > max_layer_count) {
          max_layer_count = att_layer_count;
       }
+
+      /*
+       * The extension GL_ARB_framebuffer_no_attachments places additional
+       * requirement on each attachment. Those additional requirements are
+       * tighter that those of previous versions of GL. In interest of better
+       * compatibility, we will not enforce these restrictions. For the record
+       * those additional restrictions are quoted below:
+       *
+       * "The width and height of image are greater than zero and less than or
+       *  equal to the values of the implementation-dependent limits
+       *  MAX_FRAMEBUFFER_WIDTH and MAX_FRAMEBUFFER_HEIGHT, respectively."
+       *
+       * "If <image> is a three-dimensional texture or a one- or two-dimensional
+       *  array texture and the attachment is layered, the depth or layer count
+       *  of the texture is less than or equal to the implementation-dependent
+       *  limit MAX_FRAMEBUFFER_LAYERS."
+       *
+       * "If image has multiple samples, its sample count is less than or equal
+       *  to the value of the implementation-dependent limit
+       *  MAX_FRAMEBUFFER_SAMPLES."
+       *
+       * The same requirements are also in place for GL 4.5,
+       * Section 9.4.1 "Framebuffer Attachment Completeness", pg 310-311
+       */
    }
 
    fb->MaxNumLayers = max_layer_count;
 
    if (numImages == 0) {
-      fb->_Status = GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT_EXT;
-      fbo_incomplete(ctx, "no attachments", -1);
-      return;
+      fb->_HasAttachments = false;
+
+      if (!ctx->Extensions.ARB_framebuffer_no_attachments) {
+         fb->_Status = GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT_EXT;
+         fbo_incomplete(ctx, "no attachments", -1);
+         return;
+      }
+
+      if (fb->DefaultGeometry.Width == 0 || fb->DefaultGeometry.Height == 0) {
+         fb->_Status = GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT_EXT;
+         fbo_incomplete(ctx, "no attachments and default width or height is 0", -1);
+         return;
+      }
    }
 
    if (_mesa_is_desktop_gl(ctx) && !ctx->Extensions.ARB_ES2_compatibility) {
@@ -1184,8 +1262,10 @@ _mesa_test_framebuffer_completeness(struct gl_context *ctx,
        * renderbuffers/textures are different sizes, the framebuffer
        * width/height will be set to the smallest width/height.
        */
-      fb->Width = minWidth;
-      fb->Height = minHeight;
+      if (numImages != 0) {
+         fb->Width = minWidth;
+         fb->Height = minHeight;
+      }
 
       /* finally, update the visual info for the framebuffer */
       _mesa_update_framebuffer_visual(ctx, fb);
@@ -1291,6 +1371,131 @@ _mesa_BindRenderbufferEXT(GLenum target, GLuint renderbuffer)
    bind_renderbuffer(target, renderbuffer, true);
 }
 
+static void
+framebuffer_parameteri(struct gl_context *ctx, struct gl_framebuffer *fb,
+                       GLenum pname, GLint param, const char *func)
+{
+   switch (pname) {
+   case GL_FRAMEBUFFER_DEFAULT_WIDTH:
+      if (param < 0 || param > ctx->Const.MaxFramebufferWidth)
+        _mesa_error(ctx, GL_INVALID_VALUE, "%s", func);
+      else
+         fb->DefaultGeometry.Width = param;
+      break;
+   case GL_FRAMEBUFFER_DEFAULT_HEIGHT:
+      if (param < 0 || param > ctx->Const.MaxFramebufferHeight)
+        _mesa_error(ctx, GL_INVALID_VALUE, "%s", func);
+      else
+         fb->DefaultGeometry.Height = param;
+      break;
+   case GL_FRAMEBUFFER_DEFAULT_LAYERS:
+      if (param < 0 || param > ctx->Const.MaxFramebufferLayers)
+        _mesa_error(ctx, GL_INVALID_VALUE, "%s", func);
+      else
+         fb->DefaultGeometry.Layers = param;
+      break;
+   case GL_FRAMEBUFFER_DEFAULT_SAMPLES:
+      if (param < 0 || param > ctx->Const.MaxFramebufferSamples)
+        _mesa_error(ctx, GL_INVALID_VALUE, "%s", func);
+      else
+        fb->DefaultGeometry.NumSamples = param;
+      break;
+   case GL_FRAMEBUFFER_DEFAULT_FIXED_SAMPLE_LOCATIONS:
+      fb->DefaultGeometry.FixedSampleLocations = param;
+      break;
+   default:
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "%s(pname=0x%x)", func, pname);
+   }
+}
+
+void GLAPIENTRY
+_mesa_FramebufferParameteri(GLenum target, GLenum pname, GLint param)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+
+   if (!ctx->Extensions.ARB_framebuffer_no_attachments) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glFramebufferParameteriv not supported "
+                  "(ARB_framebuffer_no_attachments not implemented)");
+      return;
+   }
+
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glFramebufferParameteri(target=0x%x)", target);
+      return;
+   }
+
+   /* check framebuffer binding */
+   if (_mesa_is_winsys_fbo(fb)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glFramebufferParameteri");
+      return;
+   }
+
+   framebuffer_parameteri(ctx, fb, pname, param, "glFramebufferParameteri");
+}
+
+static void
+get_framebuffer_parameteriv(struct gl_context *ctx, struct gl_framebuffer *fb,
+                            GLenum pname, GLint *params, const char *func)
+{
+   switch (pname) {
+   case GL_FRAMEBUFFER_DEFAULT_WIDTH:
+      *params = fb->DefaultGeometry.Width;
+      break;
+   case GL_FRAMEBUFFER_DEFAULT_HEIGHT:
+      *params = fb->DefaultGeometry.Height;
+      break;
+   case GL_FRAMEBUFFER_DEFAULT_LAYERS:
+      *params = fb->DefaultGeometry.Layers;
+      break;
+   case GL_FRAMEBUFFER_DEFAULT_SAMPLES:
+      *params = fb->DefaultGeometry.NumSamples;
+      break;
+   case GL_FRAMEBUFFER_DEFAULT_FIXED_SAMPLE_LOCATIONS:
+      *params = fb->DefaultGeometry.FixedSampleLocations;
+      break;
+   default:
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "%s(pname=0x%x)", func, pname);
+   }
+}
+
+void GLAPIENTRY
+_mesa_GetFramebufferParameteriv(GLenum target, GLenum pname, GLint *params)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+
+   if (!ctx->Extensions.ARB_framebuffer_no_attachments) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glGetFramebufferParameteriv not supported "
+                  "(ARB_framebuffer_no_attachments not implemented)");
+      return;
+   }
+
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glGetFramebufferParameteriv(target=0x%x)", target);
+      return;
+   }
+
+   /* check framebuffer binding */
+   if (_mesa_is_winsys_fbo(fb)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glGetFramebufferParameteriv");
+      return;
+   }
+
+   get_framebuffer_parameteriv(ctx, fb, pname, params,
+                               "glGetFramebufferParameteriv");
+}
+
 
 /**
  * Remove the specified renderbuffer or texture from any attachment point in
@@ -2396,15 +2601,23 @@ _mesa_DeleteFramebuffers(GLsizei n, const GLuint *framebuffers)
 }
 
 
-void GLAPIENTRY
-_mesa_GenFramebuffers(GLsizei n, GLuint *framebuffers)
+/**
+ * This is the implementation for glGenFramebuffers and glCreateFramebuffers.
+ * It is not exposed to the rest of Mesa to encourage the use of
+ * nameless buffers in driver internals.
+ */
+static void
+create_framebuffers(GLsizei n, GLuint *framebuffers, bool dsa)
 {
    GET_CURRENT_CONTEXT(ctx);
    GLuint first;
    GLint i;
+   struct gl_framebuffer *fb;
+
+   const char *func = dsa ? "glCreateFramebuffers" : "glGenFramebuffers";
 
    if (n < 0) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glGenFramebuffersEXT(n)");
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(n < 0)", func);
       return;
    }
 
@@ -2416,31 +2629,43 @@ _mesa_GenFramebuffers(GLsizei n, GLuint *framebuffers)
    for (i = 0; i < n; i++) {
       GLuint name = first + i;
       framebuffers[i] = name;
-      /* insert dummy placeholder into hash table */
+
+      if (dsa) {
+         fb = ctx->Driver.NewFramebuffer(ctx, framebuffers[i]);
+         if (!fb) {
+            _mesa_error(ctx, GL_OUT_OF_MEMORY, "%s", func);
+            return;
+         }
+      }
+      else
+         fb = &DummyFramebuffer;
+
       mtx_lock(&ctx->Shared->Mutex);
-      _mesa_HashInsert(ctx->Shared->FrameBuffers, name, &DummyFramebuffer);
+      _mesa_HashInsert(ctx->Shared->FrameBuffers, name, fb);
       mtx_unlock(&ctx->Shared->Mutex);
    }
 }
 
 
-GLenum GLAPIENTRY
-_mesa_CheckFramebufferStatus(GLenum target)
+void GLAPIENTRY
+_mesa_GenFramebuffers(GLsizei n, GLuint *framebuffers)
 {
-   struct gl_framebuffer *buffer;
-   GET_CURRENT_CONTEXT(ctx);
+   create_framebuffers(n, framebuffers, false);
+}
 
-   ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, 0);
 
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glCheckFramebufferStatus(%s)\n",
-                  _mesa_lookup_enum_by_nr(target));
+void GLAPIENTRY
+_mesa_CreateFramebuffers(GLsizei n, GLuint *framebuffers)
+{
+   create_framebuffers(n, framebuffers, true);
+}
 
-   buffer = get_framebuffer_target(ctx, target);
-   if (!buffer) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glCheckFramebufferStatus(target)");
-      return 0;
-   }
+
+GLenum
+_mesa_check_framebuffer_status(struct gl_context *ctx,
+                               struct gl_framebuffer *buffer)
+{
+   ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, 0);
 
    if (_mesa_is_winsys_fbo(buffer)) {
       /* EGL_KHR_surfaceless_context allows the winsys FBO to be incomplete. */
@@ -2461,6 +2686,67 @@ _mesa_CheckFramebufferStatus(GLenum target)
 }
 
 
+GLenum GLAPIENTRY
+_mesa_CheckFramebufferStatus(GLenum target)
+{
+   struct gl_framebuffer *fb;
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glCheckFramebufferStatus(%s)\n",
+                  _mesa_lookup_enum_by_nr(target));
+
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glCheckFramebufferStatus(invalid target %s)",
+                  _mesa_lookup_enum_by_nr(target));
+      return 0;
+   }
+
+   return _mesa_check_framebuffer_status(ctx, fb);
+}
+
+
+GLenum GLAPIENTRY
+_mesa_CheckNamedFramebufferStatus(GLuint framebuffer, GLenum target)
+{
+   struct gl_framebuffer *fb;
+   GET_CURRENT_CONTEXT(ctx);
+
+   /* Validate the target (for conformance's sake) and grab a reference to the
+    * default framebuffer in case framebuffer = 0.
+    * Section 9.4 Framebuffer Completeness of the OpenGL 4.5 core spec
+    * (30.10.2014, PDF page 336) says:
+    *    "If framebuffer is zero, then the status of the default read or
+    *    draw framebuffer (as determined by target) is returned."
+    */
+   switch (target) {
+      case GL_DRAW_FRAMEBUFFER:
+      case GL_FRAMEBUFFER:
+         fb = ctx->WinSysDrawBuffer;
+         break;
+      case GL_READ_FRAMEBUFFER:
+         fb = ctx->WinSysReadBuffer;
+         break;
+      default:
+         _mesa_error(ctx, GL_INVALID_ENUM,
+                     "glCheckNamedFramebufferStatus(invalid target %s)",
+                     _mesa_lookup_enum_by_nr(target));
+         return 0;
+   }
+
+   if (framebuffer) {
+      fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                                        "glCheckNamedFramebufferStatus");
+      if (!fb)
+         return 0;
+   }
+
+   return _mesa_check_framebuffer_status(ctx, fb);
+}
+
+
 /**
  * Replicate the src attachment point. Used by framebuffer_texture() when
  * the same texture is attached at GL_DEPTH_ATTACHMENT and
@@ -2487,144 +2773,308 @@ reuse_framebuffer_texture_attachment(struct gl_framebuffer *fb,
 
 
 /**
- * Common code called by glFramebufferTexture1D/2D/3D() and
- * glFramebufferTextureLayer().
+ * Common code called by gl*FramebufferTexture*() to retrieve the correct
+ * texture object pointer.
  *
- * \param textarget is the textarget that was passed to the
- * glFramebufferTexture...() function, or 0 if the corresponding function
- * doesn't have a textarget parameter.
+ * \param texObj where the pointer to the texture object is returned.  Note
+ * that a successful call may return texObj = NULL.
  *
- * \param layered is true if this function was called from
- * glFramebufferTexture(), false otherwise.
+ * \return true if no errors, false if errors
  */
-static void
-framebuffer_texture(struct gl_context *ctx, const char *caller, GLenum target,
-                    GLenum attachment, GLenum textarget, GLuint texture,
-                    GLint level, GLuint zoffset, GLboolean layered)
+static bool
+get_texture_for_framebuffer(struct gl_context *ctx, GLuint texture,
+                            bool layered, const char *caller,
+                            struct gl_texture_object **texObj)
 {
-   struct gl_renderbuffer_attachment *att;
-   struct gl_texture_object *texObj = NULL;
-   struct gl_framebuffer *fb;
-   GLenum maxLevelsTarget;
+   *texObj = NULL; /* This will get returned if texture = 0. */
 
-   fb = get_framebuffer_target(ctx, target);
-   if (!fb) {
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glFramebufferTexture%s(target=0x%x)", caller, target);
-      return;
+   if (!texture)
+      return true;
+
+   *texObj = _mesa_lookup_texture(ctx, texture);
+   if (*texObj == NULL || (*texObj)->Target == 0) {
+      /* Can't render to a non-existent texture object.
+       *
+       * The OpenGL 4.5 core spec (02.02.2015) in Section 9.2 Binding and
+       * Managing Framebuffer Objects specifies a different error
+       * depending upon the calling function (PDF pages 325-328).
+       * *FramebufferTexture (where layered = GL_TRUE) throws invalid
+       * value, while the other commands throw invalid operation (where
+       * layered = GL_FALSE).
+       */
+      const GLenum error = layered ? GL_INVALID_VALUE :
+                           GL_INVALID_OPERATION;
+      _mesa_error(ctx, error,
+                  "%s(non-existent texture %u)", caller, texture);
+      return false;
    }
 
-   /* check framebuffer binding */
-   if (_mesa_is_winsys_fbo(fb)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glFramebufferTexture%s", caller);
-      return;
+   return true;
+}
+
+
+/**
+ * Common code called by gl*FramebufferTexture() to verify the texture target
+ * and decide whether or not the attachment should truly be considered
+ * layered.
+ *
+ * \param layered true if attachment should be considered layered, false if
+ * not
+ *
+ * \return true if no errors, false if errors
+ */
+static bool
+check_layered_texture_target(struct gl_context *ctx, GLenum target,
+                             const char *caller, GLboolean *layered)
+{
+   *layered = GL_TRUE;
+
+   switch (target) {
+   case GL_TEXTURE_3D:
+   case GL_TEXTURE_1D_ARRAY_EXT:
+   case GL_TEXTURE_2D_ARRAY_EXT:
+   case GL_TEXTURE_CUBE_MAP:
+   case GL_TEXTURE_CUBE_MAP_ARRAY:
+   case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
+      return true;
+   case GL_TEXTURE_1D:
+   case GL_TEXTURE_2D:
+   case GL_TEXTURE_RECTANGLE:
+   case GL_TEXTURE_2D_MULTISAMPLE:
+      /* These texture types are valid to pass to
+       * glFramebufferTexture(), but since they aren't layered, it
+       * is equivalent to calling glFramebufferTexture{1D,2D}().
+       */
+      *layered = GL_FALSE;
+      return true;
    }
 
-   /* The textarget, level, and zoffset parameters are only validated if
-    * texture is non-zero.
+   _mesa_error(ctx, GL_INVALID_OPERATION,
+               "%s(invalid texture target %s)", caller,
+               _mesa_lookup_enum_by_nr(target));
+   return false;
+}
+
+
+/**
+ * Common code called by gl*FramebufferTextureLayer() to verify the texture
+ * target.
+ *
+ * \return true if no errors, false if errors
+ */
+static bool
+check_texture_target(struct gl_context *ctx, GLenum target,
+                     const char *caller)
+{
+   /* We're being called by glFramebufferTextureLayer().
+    * The only legal texture types for that function are 3D,
+    * cube-map, and 1D/2D/cube-map array textures.
+    *
+    * We don't need to check for GL_ARB_texture_cube_map_array because the
+    * application wouldn't have been able to create a texture with a
+    * GL_TEXTURE_CUBE_MAP_ARRAY target if the extension were not enabled.
     */
-   if (texture) {
-      GLboolean err = GL_TRUE;
-
-      texObj = _mesa_lookup_texture(ctx, texture);
-      if (texObj != NULL) {
-         if (textarget == 0) {
-            if (layered) {
-               /* We're being called by glFramebufferTexture() and textarget
-                * is not used.
-                */
-               switch (texObj->Target) {
-               case GL_TEXTURE_3D:
-               case GL_TEXTURE_1D_ARRAY_EXT:
-               case GL_TEXTURE_2D_ARRAY_EXT:
-               case GL_TEXTURE_CUBE_MAP:
-               case GL_TEXTURE_CUBE_MAP_ARRAY:
-               case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
-                  err = false;
-                  break;
-               case GL_TEXTURE_1D:
-               case GL_TEXTURE_2D:
-               case GL_TEXTURE_RECTANGLE:
-               case GL_TEXTURE_2D_MULTISAMPLE:
-                  /* These texture types are valid to pass to
-                   * glFramebufferTexture(), but since they aren't layered, it
-                   * is equivalent to calling glFramebufferTexture{1D,2D}().
-                   */
-                  err = false;
-                  layered = false;
-                  textarget = texObj->Target;
-                  break;
-               default:
-                  err = true;
-                  break;
-               }
-            } else {
-               /* We're being called by glFramebufferTextureLayer() and
-                * textarget is not used.  The only legal texture types for
-                * that function are 3D and 1D/2D arrays textures.
-                */
-               err = (texObj->Target != GL_TEXTURE_3D) &&
-                  (texObj->Target != GL_TEXTURE_1D_ARRAY_EXT) &&
-                  (texObj->Target != GL_TEXTURE_2D_ARRAY_EXT) &&
-                  (texObj->Target != GL_TEXTURE_CUBE_MAP_ARRAY) &&
-                  (texObj->Target != GL_TEXTURE_2D_MULTISAMPLE_ARRAY);
-            }
-         }
-         else {
-            /* Make sure textarget is consistent with the texture's type */
-            err = (texObj->Target == GL_TEXTURE_CUBE_MAP)
-                ? !_mesa_is_cube_face(textarget)
-                : (texObj->Target != textarget);
-         }
+   switch (target) {
+   case GL_TEXTURE_3D:
+   case GL_TEXTURE_1D_ARRAY:
+   case GL_TEXTURE_2D_ARRAY:
+   case GL_TEXTURE_CUBE_MAP_ARRAY:
+   case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
+      return true;
+   case GL_TEXTURE_CUBE_MAP:
+      /* We don't need to check the extension (GL_ARB_direct_state_access) or
+       * GL version (4.5) for GL_TEXTURE_CUBE_MAP because DSA is always
+       * enabled in core profile.  This can be called from
+       * _mesa_FramebufferTextureLayer in compatibility profile (OpenGL 3.0),
+       * so we do have to check the profile.
+       */
+      return ctx->API == API_OPENGL_CORE;
+   }
+
+   _mesa_error(ctx, GL_INVALID_OPERATION,
+               "%s(invalid texture target %s)", caller,
+               _mesa_lookup_enum_by_nr(target));
+   return false;
+}
+
+
+/**
+ * Common code called by glFramebufferTexture*D() to verify the texture
+ * target.
+ *
+ * \return true if no errors, false if errors
+ */
+static bool
+check_textarget(struct gl_context *ctx, int dims, GLenum target,
+                GLenum textarget, const char *caller)
+{
+   bool err = false;
+
+   switch (dims) {
+   case 1:
+      switch (textarget) {
+      case GL_TEXTURE_1D:
+         break;
+      case GL_TEXTURE_1D_ARRAY:
+         err = !ctx->Extensions.EXT_texture_array;
+         break;
+      default:
+         err = true;
       }
-      else {
-         /* can't render to a non-existant texture */
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glFramebufferTexture%s(non existant texture)",
-                     caller);
-         return;
+      break;
+   case 2:
+      switch (textarget) {
+      case GL_TEXTURE_2D:
+         break;
+      case GL_TEXTURE_RECTANGLE:
+         err = _mesa_is_gles(ctx)
+            || !ctx->Extensions.NV_texture_rectangle;
+         break;
+      case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+      case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+      case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+      case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+      case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+      case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+         err = !ctx->Extensions.ARB_texture_cube_map;
+         break;
+      case GL_TEXTURE_2D_ARRAY:
+         err = (_mesa_is_gles(ctx) && ctx->Version < 30)
+               || !ctx->Extensions.EXT_texture_array;
+         break;
+      case GL_TEXTURE_2D_MULTISAMPLE:
+      case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
+         err = _mesa_is_gles(ctx)
+               || !ctx->Extensions.ARB_texture_multisample;
+         break;
+      default:
+         err = true;
       }
+      break;
+   case 3:
+      if (textarget != GL_TEXTURE_3D)
+         err = true;
+      break;
+   default:
+      err = true;
+   }
 
-      if (err) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glFramebufferTexture%s(texture target mismatch)",
-                     caller);
-         return;
-      }
+   if (err) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(invalid textarget %s)",
+                  caller, _mesa_lookup_enum_by_nr(textarget));
+      return false;
+   }
 
-      if (texObj->Target == GL_TEXTURE_3D) {
-         const GLuint maxSize = 1 << (ctx->Const.Max3DTextureLevels - 1);
-         if (zoffset >= maxSize) {
-            _mesa_error(ctx, GL_INVALID_VALUE,
-                        "glFramebufferTexture%s(zoffset)", caller);
-            return;
-         }
+   /* Make sure textarget is consistent with the texture's type */
+   err = (target == GL_TEXTURE_CUBE_MAP) ?
+          !_mesa_is_cube_face(textarget): (target != textarget);
+
+   if (err) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(mismatched texture target)", caller);
+      return false;
+   }
+
+   return true;
+}
+
+
+/**
+ * Common code called by gl*FramebufferTextureLayer() and
+ * glFramebufferTexture3D() to validate the layer.
+ *
+ * \return true if no errors, false if errors
+ */
+static bool
+check_layer(struct gl_context *ctx, GLenum target, GLint layer,
+            const char *caller)
+{
+   /* Page 306 (page 328 of the PDF) of the OpenGL 4.5 (Core Profile)
+    * spec says:
+    *
+    *    "An INVALID_VALUE error is generated if texture is non-zero
+    *     and layer is negative."
+    */
+   if (layer < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(layer %u < 0)", caller, layer);
+      return false;
+   }
+
+   if (target == GL_TEXTURE_3D) {
+      const GLuint maxSize = 1 << (ctx->Const.Max3DTextureLevels - 1);
+      if (layer >= maxSize) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(invalid layer %u)", caller, layer);
+         return false;
       }
-      else if ((texObj->Target == GL_TEXTURE_1D_ARRAY_EXT) ||
-               (texObj->Target == GL_TEXTURE_2D_ARRAY_EXT) ||
-               (texObj->Target == GL_TEXTURE_CUBE_MAP_ARRAY) ||
-               (texObj->Target == GL_TEXTURE_2D_MULTISAMPLE_ARRAY)) {
-         if (zoffset >= ctx->Const.MaxArrayTextureLayers) {
-            _mesa_error(ctx, GL_INVALID_VALUE,
-                        "glFramebufferTexture%s(layer)", caller);
-            return;
-         }
+   }
+   else if ((target == GL_TEXTURE_1D_ARRAY) ||
+            (target == GL_TEXTURE_2D_ARRAY) ||
+            (target == GL_TEXTURE_CUBE_MAP_ARRAY) ||
+            (target == GL_TEXTURE_2D_MULTISAMPLE_ARRAY)) {
+      if (layer >= ctx->Const.MaxArrayTextureLayers) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(layer %u >= GL_MAX_ARRAY_TEXTURE_LAYERS)",
+                     caller, layer);
+         return false;
       }
-
-      maxLevelsTarget = textarget ? textarget : texObj->Target;
-      if ((level < 0) ||
-          (level >= _mesa_max_texture_levels(ctx, maxLevelsTarget))) {
+   }
+   else if (target == GL_TEXTURE_CUBE_MAP) {
+      if (layer >= 6) {
          _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glFramebufferTexture%s(level)", caller);
-         return;
+                     "%s(layer %u >= 6)", caller, layer);
+         return false;
       }
    }
 
+   return true;
+}
+
+
+/**
+ * Common code called by all gl*FramebufferTexture*() entry points to verify
+ * the level.
+ *
+ * \return true if no errors, false if errors
+ */
+static bool
+check_level(struct gl_context *ctx, GLenum target, GLint level,
+            const char *caller)
+{
+   if ((level < 0) ||
+       (level >= _mesa_max_texture_levels(ctx, target))) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(invalid level %d)", caller, level);
+      return false;
+   }
+
+   return true;
+}
+
+
+void
+_mesa_framebuffer_texture(struct gl_context *ctx, struct gl_framebuffer *fb,
+                          GLenum attachment,
+                          struct gl_texture_object *texObj, GLenum textarget,
+                          GLint level, GLuint layer, GLboolean layered,
+                          const char *caller)
+{
+   struct gl_renderbuffer_attachment *att;
+
+   /* The window-system framebuffer object is immutable */
+   if (_mesa_is_winsys_fbo(fb)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(window-system framebuffer)",
+                  caller);
+      return;
+   }
+
+   /* Not a hash lookup, so we can afford to get the attachment here. */
    att = get_attachment(ctx, fb, attachment);
    if (att == NULL) {
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glFramebufferTexture%s(attachment)", caller);
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid attachment %s)", caller,
+                  _mesa_lookup_enum_by_nr(attachment));
       return;
    }
 
@@ -2637,7 +3087,7 @@ framebuffer_texture(struct gl_context *ctx, const char *caller, GLenum target,
           level == fb->Attachment[BUFFER_STENCIL].TextureLevel &&
           _mesa_tex_target_to_face(textarget) ==
           fb->Attachment[BUFFER_STENCIL].CubeMapFace &&
-          zoffset == fb->Attachment[BUFFER_STENCIL].Zoffset) {
+          layer == fb->Attachment[BUFFER_STENCIL].Zoffset) {
          /* The texture object is already attached to the stencil attachment
           * point. Don't create a new renderbuffer; just reuse the stencil
           * attachment's. This is required to prevent a GL error in
@@ -2650,13 +3100,14 @@ framebuffer_texture(struct gl_context *ctx, const char *caller, GLenum target,
                  level == fb->Attachment[BUFFER_DEPTH].TextureLevel &&
                  _mesa_tex_target_to_face(textarget) ==
                  fb->Attachment[BUFFER_DEPTH].CubeMapFace &&
-                 zoffset == fb->Attachment[BUFFER_DEPTH].Zoffset) {
+                 layer == fb->Attachment[BUFFER_DEPTH].Zoffset) {
          /* As above, but with depth and stencil transposed. */
          reuse_framebuffer_texture_attachment(fb, BUFFER_STENCIL,
                                               BUFFER_DEPTH);
       } else {
          set_texture_attachment(ctx, fb, att, texObj, textarget,
-                                      level, zoffset, layered);
+                                level, layer, layered);
+
          if (attachment == GL_DEPTH_STENCIL_ATTACHMENT) {
             /* Above we created a new renderbuffer and attached it to the
              * depth attachment point. Now attach it to the stencil attachment
@@ -2692,116 +3143,157 @@ framebuffer_texture(struct gl_context *ctx, const char *caller, GLenum target,
 }
 
 
-void GLAPIENTRY
-_mesa_FramebufferTexture1D(GLenum target, GLenum attachment,
-                           GLenum textarget, GLuint texture, GLint level)
+static void
+framebuffer_texture_with_dims(int dims, GLenum target,
+                              GLenum attachment, GLenum textarget,
+                              GLuint texture, GLint level, GLint layer,
+                              const char *caller)
 {
    GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+   struct gl_texture_object *texObj;
+
+   /* Get the framebuffer object */
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", caller,
+                  _mesa_lookup_enum_by_nr(target));
+      return;
+   }
 
-   if (texture != 0) {
-      GLboolean error;
+   /* Get the texture object */
+   if (!get_texture_for_framebuffer(ctx, texture, false, caller, &texObj))
+      return;
 
-      switch (textarget) {
-      case GL_TEXTURE_1D:
-         error = GL_FALSE;
-         break;
-      case GL_TEXTURE_1D_ARRAY:
-         error = !ctx->Extensions.EXT_texture_array;
-         break;
-      default:
-         error = GL_TRUE;
-      }
+   if (texObj) {
+      if (!check_textarget(ctx, dims, texObj->Target, textarget, caller))
+         return;
 
-      if (error) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glFramebufferTexture1D(textarget=%s)",
-                     _mesa_lookup_enum_by_nr(textarget));
+      if ((dims == 3) && !check_layer(ctx, texObj->Target, layer, caller))
          return;
-      }
    }
 
-   framebuffer_texture(ctx, "1D", target, attachment, textarget, texture,
-                       level, 0, GL_FALSE);
+   if (!check_level(ctx, textarget, level, caller))
+      return;
+
+   _mesa_framebuffer_texture(ctx, fb, attachment, texObj, textarget, level,
+                             layer, GL_FALSE, caller);
 }
 
 
 void GLAPIENTRY
-_mesa_FramebufferTexture2D(GLenum target, GLenum attachment,
+_mesa_FramebufferTexture1D(GLenum target, GLenum attachment,
                            GLenum textarget, GLuint texture, GLint level)
 {
-   GET_CURRENT_CONTEXT(ctx);
-
-   if (texture != 0) {
-      GLboolean error;
-
-      switch (textarget) {
-      case GL_TEXTURE_2D:
-         error = GL_FALSE;
-         break;
-      case GL_TEXTURE_RECTANGLE:
-         error = _mesa_is_gles(ctx)
-            || !ctx->Extensions.NV_texture_rectangle;
-         break;
-      case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-      case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-      case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-      case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-      case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-      case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-         error = !ctx->Extensions.ARB_texture_cube_map;
-         break;
-      case GL_TEXTURE_2D_ARRAY:
-         error = (_mesa_is_gles(ctx) && ctx->Version < 30)
-            || !ctx->Extensions.EXT_texture_array;
-         break;
-      case GL_TEXTURE_2D_MULTISAMPLE:
-      case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
-         error = _mesa_is_gles(ctx)
-            || !ctx->Extensions.ARB_texture_multisample;
-         break;
-      default:
-         error = GL_TRUE;
-      }
+   framebuffer_texture_with_dims(1, target, attachment, textarget, texture,
+                                 level, 0, "glFramebufferTexture1D");
+}
 
-      if (error) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glFramebufferTexture2D(textarget=%s)",
-                     _mesa_lookup_enum_by_nr(textarget));
-         return;
-      }
-   }
 
-   framebuffer_texture(ctx, "2D", target, attachment, textarget, texture,
-                       level, 0, GL_FALSE);
+void GLAPIENTRY
+_mesa_FramebufferTexture2D(GLenum target, GLenum attachment,
+                           GLenum textarget, GLuint texture, GLint level)
+{
+   framebuffer_texture_with_dims(2, target, attachment, textarget, texture,
+                                 level, 0, "glFramebufferTexture2D");
 }
 
 
 void GLAPIENTRY
 _mesa_FramebufferTexture3D(GLenum target, GLenum attachment,
                            GLenum textarget, GLuint texture,
-                           GLint level, GLint zoffset)
+                           GLint level, GLint layer)
+{
+   framebuffer_texture_with_dims(3, target, attachment, textarget, texture,
+                                 level, layer, "glFramebufferTexture3D");
+}
+
+
+void GLAPIENTRY
+_mesa_FramebufferTextureLayer(GLenum target, GLenum attachment,
+                              GLuint texture, GLint level, GLint layer)
 {
    GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+   struct gl_texture_object *texObj;
+   GLenum textarget = 0;
 
-   if ((texture != 0) && (textarget != GL_TEXTURE_3D)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glFramebufferTexture3D(textarget)");
+   const char *func = "glFramebufferTextureLayer";
+
+   /* Get the framebuffer object */
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glFramebufferTextureLayer(invalid target %s)",
+                  _mesa_lookup_enum_by_nr(target));
       return;
    }
 
-   framebuffer_texture(ctx, "3D", target, attachment, textarget, texture,
-                       level, zoffset, GL_FALSE);
+   /* Get the texture object */
+   if (!get_texture_for_framebuffer(ctx, texture, false, func, &texObj))
+      return;
+
+   if (texObj) {
+      if (!check_texture_target(ctx, texObj->Target, func))
+         return;
+
+      if (!check_layer(ctx, texObj->Target, layer, func))
+         return;
+
+      if (!check_level(ctx, texObj->Target, level, func))
+         return;
+
+      if (texObj->Target == GL_TEXTURE_CUBE_MAP) {
+         assert(layer >= 0 && layer < 6);
+         textarget = GL_TEXTURE_CUBE_MAP_POSITIVE_X + layer;
+         layer = 0;
+      }
+   }
+
+   _mesa_framebuffer_texture(ctx, fb, attachment, texObj, textarget, level,
+                             layer, GL_FALSE, func);
 }
 
 
 void GLAPIENTRY
-_mesa_FramebufferTextureLayer(GLenum target, GLenum attachment,
-                              GLuint texture, GLint level, GLint layer)
+_mesa_NamedFramebufferTextureLayer(GLuint framebuffer, GLenum attachment,
+                                   GLuint texture, GLint level, GLint layer)
 {
    GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+   struct gl_texture_object *texObj;
+   GLenum textarget = 0;
+
+   const char *func = "glNamedFramebufferTextureLayer";
+
+   /* Get the framebuffer object */
+   fb = _mesa_lookup_framebuffer_err(ctx, framebuffer, func);
+   if (!fb)
+      return;
+
+   /* Get the texture object */
+   if (!get_texture_for_framebuffer(ctx, texture, false, func, &texObj))
+      return;
+
+   if (texObj) {
+      if (!check_texture_target(ctx, texObj->Target, func))
+         return;
 
-   framebuffer_texture(ctx, "Layer", target, attachment, 0, texture,
-                       level, layer, GL_FALSE);
+      if (!check_layer(ctx, texObj->Target, layer, func))
+         return;
+
+      if (!check_level(ctx, texObj->Target, level, func))
+         return;
+
+      if (texObj->Target == GL_TEXTURE_CUBE_MAP) {
+         assert(layer >= 0 && layer < 6);
+         textarget = GL_TEXTURE_CUBE_MAP_POSITIVE_X + layer;
+         layer = 0;
+      }
+   }
+
+   _mesa_framebuffer_texture(ctx, fb, attachment, texObj, textarget, level,
+                             layer, GL_FALSE, func);
 }
 
 
@@ -2810,82 +3302,115 @@ _mesa_FramebufferTexture(GLenum target, GLenum attachment,
                          GLuint texture, GLint level)
 {
    GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+   struct gl_texture_object *texObj;
+   GLboolean layered;
 
-   if (_mesa_has_geometry_shaders(ctx)) {
-      framebuffer_texture(ctx, "", target, attachment, 0, texture,
-                          level, 0, GL_TRUE);
-   } else {
+   const char *func = "FramebufferTexture";
+
+   if (!_mesa_has_geometry_shaders(ctx)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "unsupported function (glFramebufferTexture) called");
+      return;
    }
+
+   /* Get the framebuffer object */
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glFramebufferTexture(invalid target %s)",
+                  _mesa_lookup_enum_by_nr(target));
+      return;
+   }
+
+   /* Get the texture object */
+   if (!get_texture_for_framebuffer(ctx, texture, true, func, &texObj))
+      return;
+
+   if (texObj) {
+      if (!check_layered_texture_target(ctx, texObj->Target, func, &layered))
+         return;
+
+      if (!check_level(ctx, texObj->Target, level, func))
+         return;
+   }
+
+   _mesa_framebuffer_texture(ctx, fb, attachment, texObj, 0, level,
+                             0, layered, func);
 }
 
 
 void GLAPIENTRY
-_mesa_FramebufferRenderbuffer(GLenum target, GLenum attachment,
-                              GLenum renderbufferTarget,
-                              GLuint renderbuffer)
+_mesa_NamedFramebufferTexture(GLuint framebuffer, GLenum attachment,
+                              GLuint texture, GLint level)
 {
-   struct gl_renderbuffer_attachment *att;
-   struct gl_framebuffer *fb;
-   struct gl_renderbuffer *rb;
    GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+   struct gl_texture_object *texObj;
+   GLboolean layered;
 
-   fb = get_framebuffer_target(ctx, target);
-   if (!fb) {
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glFramebufferRenderbuffer(target)");
+   const char *func = "glNamedFramebufferTexture";
+
+   if (!_mesa_has_geometry_shaders(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "unsupported function (glNamedFramebufferTexture) called");
       return;
    }
 
-   if (renderbufferTarget != GL_RENDERBUFFER_EXT) {
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glFramebufferRenderbuffer(renderbufferTarget)");
+   /* Get the framebuffer object */
+   fb = _mesa_lookup_framebuffer_err(ctx, framebuffer, func);
+   if (!fb)
       return;
+
+   /* Get the texture object */
+   if (!get_texture_for_framebuffer(ctx, texture, true, func, &texObj))
+      return;
+
+   if (texObj) {
+      if (!check_layered_texture_target(ctx, texObj->Target, func,
+                                        &layered))
+         return;
+
+      if (!check_level(ctx, texObj->Target, level, func))
+         return;
    }
 
+   _mesa_framebuffer_texture(ctx, fb, attachment, texObj, 0, level,
+                             0, layered, func);
+}
+
+
+void
+_mesa_framebuffer_renderbuffer(struct gl_context *ctx,
+                               struct gl_framebuffer *fb,
+                               GLenum attachment,
+                               struct gl_renderbuffer *rb,
+                               const char *func)
+{
+   struct gl_renderbuffer_attachment *att;
+
    if (_mesa_is_winsys_fbo(fb)) {
       /* Can't attach new renderbuffers to a window system framebuffer */
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glFramebufferRenderbuffer");
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(window-system framebuffer)", func);
       return;
    }
 
    att = get_attachment(ctx, fb, attachment);
    if (att == NULL) {
       _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glFramebufferRenderbuffer(invalid attachment %s)",
+                  "%s(invalid attachment %s)", func,
                   _mesa_lookup_enum_by_nr(attachment));
       return;
    }
 
-   if (renderbuffer) {
-      rb = _mesa_lookup_renderbuffer(ctx, renderbuffer);
-      if (!rb) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glFramebufferRenderbuffer(non-existant"
-                     " renderbuffer %u)", renderbuffer);
-         return;
-      }
-      else if (rb == &DummyRenderbuffer) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glFramebufferRenderbuffer(renderbuffer %u)",
-                     renderbuffer);
-         return;
-      }
-   }
-   else {
-      /* remove renderbuffer attachment */
-      rb = NULL;
-   }
-
    if (attachment == GL_DEPTH_STENCIL_ATTACHMENT &&
        rb && rb->Format != MESA_FORMAT_NONE) {
       /* make sure the renderbuffer is a depth/stencil format */
       const GLenum baseFormat = _mesa_get_format_base_format(rb->Format);
       if (baseFormat != GL_DEPTH_STENCIL) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glFramebufferRenderbuffer(renderbuffer"
-                     " is not DEPTH_STENCIL format)");
+                     "%s(renderbuffer is not DEPTH_STENCIL format)", func);
          return;
       }
    }
@@ -2903,24 +3428,94 @@ _mesa_FramebufferRenderbuffer(GLenum target, GLenum attachment,
 
 
 void GLAPIENTRY
-_mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
-                                          GLenum pname, GLint *params)
+_mesa_FramebufferRenderbuffer(GLenum target, GLenum attachment,
+                              GLenum renderbuffertarget,
+                              GLuint renderbuffer)
 {
-   const struct gl_renderbuffer_attachment *att;
-   struct gl_framebuffer *buffer;
-   GLenum err;
+   struct gl_framebuffer *fb;
+   struct gl_renderbuffer *rb;
    GET_CURRENT_CONTEXT(ctx);
 
-   /* The error differs in GL and GLES. */
-   err = _mesa_is_desktop_gl(ctx) ? GL_INVALID_OPERATION : GL_INVALID_ENUM;
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glFramebufferRenderbuffer(invalid target %s)",
+                  _mesa_lookup_enum_by_nr(target));
+      return;
+   }
 
-   buffer = get_framebuffer_target(ctx, target);
-   if (!buffer) {
+   if (renderbuffertarget != GL_RENDERBUFFER) {
       _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glGetFramebufferAttachmentParameteriv(target)");
+                  "glFramebufferRenderbuffer(renderbuffertarget is not "
+                  "GL_RENDERBUFFER)");
       return;
    }
 
+   if (renderbuffer) {
+      rb = _mesa_lookup_renderbuffer_err(ctx, renderbuffer,
+                                         "glFramebufferRenderbuffer");
+      if (!rb)
+         return;
+   }
+   else {
+      /* remove renderbuffer attachment */
+      rb = NULL;
+   }
+
+   _mesa_framebuffer_renderbuffer(ctx, fb, attachment, rb,
+                                  "glFramebufferRenderbuffer");
+}
+
+
+void GLAPIENTRY
+_mesa_NamedFramebufferRenderbuffer(GLuint framebuffer, GLenum attachment,
+                                   GLenum renderbuffertarget,
+                                   GLuint renderbuffer)
+{
+   struct gl_framebuffer *fb;
+   struct gl_renderbuffer *rb;
+   GET_CURRENT_CONTEXT(ctx);
+
+   fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                                     "glNamedFramebufferRenderbuffer");
+   if (!fb)
+      return;
+
+   if (renderbuffertarget != GL_RENDERBUFFER) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glNamedFramebufferRenderbuffer(renderbuffertarget is not "
+                  "GL_RENDERBUFFER)");
+      return;
+   }
+
+   if (renderbuffer) {
+      rb = _mesa_lookup_renderbuffer_err(ctx, renderbuffer,
+                                         "glNamedFramebufferRenderbuffer");
+      if (!rb)
+         return;
+   }
+   else {
+      /* remove renderbuffer attachment */
+      rb = NULL;
+   }
+
+   _mesa_framebuffer_renderbuffer(ctx, fb, attachment, rb,
+                                  "glNamedFramebufferRenderbuffer");
+}
+
+
+void
+_mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
+                                           struct gl_framebuffer *buffer,
+                                           GLenum attachment, GLenum pname,
+                                           GLint *params, const char *caller)
+{
+   const struct gl_renderbuffer_attachment *att;
+   GLenum err;
+
+   /* The error differs in GL and GLES. */
+   err = _mesa_is_desktop_gl(ctx) ? GL_INVALID_OPERATION : GL_INVALID_ENUM;
+
    if (_mesa_is_winsys_fbo(buffer)) {
       /* Page 126 (page 136 of the PDF) of the OpenGL ES 2.0.25 spec
        * says:
@@ -2936,14 +3531,15 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
            !ctx->Extensions.ARB_framebuffer_object)
           && !_mesa_is_gles3(ctx)) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetFramebufferAttachmentParameteriv(bound FBO = 0)");
+                     "%s(window-system framebuffer)", caller);
          return;
       }
 
       if (_mesa_is_gles3(ctx) && attachment != GL_BACK &&
           attachment != GL_DEPTH && attachment != GL_STENCIL) {
          _mesa_error(ctx, GL_INVALID_ENUM,
-                     "glGetFramebufferAttachmentParameteriv(attachment)");
+                     "%s(invalid attachment %s)", caller,
+                     _mesa_lookup_enum_by_nr(attachment));
          return;
       }
       /* the default / window-system FBO */
@@ -2955,8 +3551,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
    }
 
    if (att == NULL) {
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glGetFramebufferAttachmentParameteriv(attachment)");
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid attachment %s)", caller,
+                  _mesa_lookup_enum_by_nr(attachment));
       return;
    }
 
@@ -2970,9 +3566,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
           *    attachment, since it does not have a single format."
           */
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetFramebufferAttachmentParameteriv("
-                     "GL_FRAMEBUFFER_ATTACHMENT_COMPONENT_TYPE"
-                     " is invalid for depth+stencil attachment)");
+                     "%s(GL_FRAMEBUFFER_ATTACHMENT_COMPONENT_TYPE"
+                     " is invalid for depth+stencil attachment)", caller);
          return;
       }
       /* the depth and stencil attachments must point to the same buffer */
@@ -2980,8 +3575,7 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
       stencilAtt = get_attachment(ctx, buffer, GL_STENCIL_ATTACHMENT);
       if (depthAtt->Renderbuffer != stencilAtt->Renderbuffer) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetFramebufferAttachmentParameteriv(DEPTH/STENCIL"
-                     " attachments differ)");
+                     "%s(DEPTH/STENCIL attachments differ)", caller);
          return;
       }
    }
@@ -3014,8 +3608,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
          *params = att->TextureLevel;
       }
       else if (att->Type == GL_NONE) {
-         _mesa_error(ctx, err,
-                     "glGetFramebufferAttachmentParameteriv(pname)");
+         _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
+                     _mesa_lookup_enum_by_nr(pname));
       }
       else {
          goto invalid_pname_enum;
@@ -3031,8 +3625,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
          }
       }
       else if (att->Type == GL_NONE) {
-         _mesa_error(ctx, err,
-                     "glGetFramebufferAttachmentParameteriv(pname)");
+         _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
+                     _mesa_lookup_enum_by_nr(pname));
       }
       else {
          goto invalid_pname_enum;
@@ -3042,8 +3636,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
       if (ctx->API == API_OPENGLES) {
          goto invalid_pname_enum;
       } else if (att->Type == GL_NONE) {
-         _mesa_error(ctx, err,
-                     "glGetFramebufferAttachmentParameteriv(pname)");
+         _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
+                     _mesa_lookup_enum_by_nr(pname));
       } else if (att->Type == GL_TEXTURE) {
          if (att->Texture && (att->Texture->Target == GL_TEXTURE_3D ||
              att->Texture->Target == GL_TEXTURE_2D_ARRAY)) {
@@ -3064,8 +3658,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
          goto invalid_pname_enum;
       }
       else if (att->Type == GL_NONE) {
-         _mesa_error(ctx, err,
-                     "glGetFramebufferAttachmentParameteriv(pname)");
+         _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
+                     _mesa_lookup_enum_by_nr(pname));
       }
       else {
          if (ctx->Extensions.EXT_framebuffer_sRGB) {
@@ -3087,8 +3681,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
          goto invalid_pname_enum;
       }
       else if (att->Type == GL_NONE) {
-         _mesa_error(ctx, err,
-                     "glGetFramebufferAttachmentParameteriv(pname)");
+         _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
+                     _mesa_lookup_enum_by_nr(pname));
       }
       else {
          mesa_format format = att->Renderbuffer->Format;
@@ -3103,9 +3697,9 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
          if (_mesa_is_gles3(ctx) &&
              attachment == GL_DEPTH_STENCIL_ATTACHMENT) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glGetFramebufferAttachmentParameteriv(cannot query "
+                        "%s(cannot query "
                         "GL_FRAMEBUFFER_ATTACHMENT_COMPONENT_TYPE of "
-                        "GL_DEPTH_STENCIL_ATTACHMENT");
+                        "GL_DEPTH_STENCIL_ATTACHMENT)", caller);
             return;
          }
 
@@ -3139,8 +3733,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
          goto invalid_pname_enum;
       }
       else if (att->Type == GL_NONE) {
-         _mesa_error(ctx, err,
-                     "glGetFramebufferAttachmentParameteriv(pname)");
+         _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
+                     _mesa_lookup_enum_by_nr(pname));
       }
       else if (att->Texture) {
          const struct gl_texture_image *texImage =
@@ -3159,8 +3753,7 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
                                       att->Renderbuffer->Format);
       }
       else {
-         _mesa_problem(ctx, "glGetFramebufferAttachmentParameterivEXT:"
-                       " invalid FBO attachment structure");
+         _mesa_problem(ctx, "%s: invalid FBO attachment structure", caller);
       }
       return;
    case GL_FRAMEBUFFER_ATTACHMENT_LAYERED:
@@ -3169,8 +3762,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
       } else if (att->Type == GL_TEXTURE) {
          *params = att->Layered;
       } else if (att->Type == GL_NONE) {
-         _mesa_error(ctx, err,
-                     "glGetFramebufferAttachmentParameteriv(pname)");
+         _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
+                     _mesa_lookup_enum_by_nr(pname));
       } else {
          goto invalid_pname_enum;
       }
@@ -3182,30 +3775,144 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
    return;
 
 invalid_pname_enum:
-   _mesa_error(ctx, GL_INVALID_ENUM,
-               "glGetFramebufferAttachmentParameteriv(pname)");
+   _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid pname %s)", caller,
+               _mesa_lookup_enum_by_nr(pname));
    return;
 }
 
 
+void GLAPIENTRY
+_mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
+                                          GLenum pname, GLint *params)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *buffer;
+
+   buffer = get_framebuffer_target(ctx, target);
+   if (!buffer) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glGetFramebufferAttachmentParameteriv(invalid target %s)",
+                  _mesa_lookup_enum_by_nr(target));
+      return;
+   }
+
+   _mesa_get_framebuffer_attachment_parameter(ctx, buffer, attachment, pname,
+                                              params,
+                                    "glGetFramebufferAttachmentParameteriv");
+}
+
+
+void GLAPIENTRY
+_mesa_GetNamedFramebufferAttachmentParameteriv(GLuint framebuffer,
+                                               GLenum attachment,
+                                               GLenum pname, GLint *params)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *buffer;
+
+   if (framebuffer) {
+      buffer = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                              "glGetNamedFramebufferAttachmentParameteriv");
+      if (!buffer)
+         return;
+   }
+   else {
+      /*
+       * Section 9.2 Binding and Managing Framebuffer Objects of the OpenGL
+       * 4.5 core spec (30.10.2014, PDF page 314):
+       *    "If framebuffer is zero, then the default draw framebuffer is
+       *    queried."
+       */
+      buffer = ctx->WinSysDrawBuffer;
+   }
+
+   _mesa_get_framebuffer_attachment_parameter(ctx, buffer, attachment, pname,
+                                              params,
+                              "glGetNamedFramebufferAttachmentParameteriv");
+}
+
+
+void GLAPIENTRY
+_mesa_NamedFramebufferParameteri(GLuint framebuffer, GLenum pname,
+                                 GLint param)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb = NULL;
+
+   if (!ctx->Extensions.ARB_framebuffer_no_attachments) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glNamedFramebufferParameteri("
+                  "ARB_framebuffer_no_attachments not implemented)");
+      return;
+   }
+
+   fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                                     "glNamedFramebufferParameteri");
+
+   if (fb) {
+      framebuffer_parameteri(ctx, fb, pname, param,
+                             "glNamedFramebufferParameteriv");
+   }
+}
+
+
+void GLAPIENTRY
+_mesa_GetNamedFramebufferParameteriv(GLuint framebuffer, GLenum pname,
+                                     GLint *param)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+
+   if (!ctx->Extensions.ARB_framebuffer_no_attachments) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glNamedFramebufferParameteriv("
+                  "ARB_framebuffer_no_attachments not implemented)");
+      return;
+   }
+
+   if (framebuffer) {
+      fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                                        "glGetNamedFramebufferParameteriv");
+   } else {
+      fb = ctx->WinSysDrawBuffer;
+   }
+
+   if (fb) {
+      get_framebuffer_parameteriv(ctx, fb, pname, param,
+                                  "glGetNamedFramebufferParameteriv");
+   }
+}
+
+
 static void
-invalidate_framebuffer_storage(GLenum target, GLsizei numAttachments,
+invalidate_framebuffer_storage(struct gl_context *ctx,
+                               struct gl_framebuffer *fb,
+                               GLsizei numAttachments,
                                const GLenum *attachments, GLint x, GLint y,
                                GLsizei width, GLsizei height, const char *name)
 {
    int i;
-   struct gl_framebuffer *fb;
-   GET_CURRENT_CONTEXT(ctx);
 
-   fb = get_framebuffer_target(ctx, target);
-   if (!fb) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "%s(target)", name);
+   /* Section 17.4 Whole Framebuffer Operations of the OpenGL 4.5 Core
+    * Spec (2.2.2015, PDF page 522) says:
+    *    "An INVALID_VALUE error is generated if numAttachments, width, or
+    *    height is negative."
+    */
+   if (numAttachments < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(numAttachments < 0)", name);
       return;
    }
 
-   if (numAttachments < 0) {
+   if (width < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE,
-                  "%s(numAttachments < 0)", name);
+                  "%s(width < 0)", name);
+      return;
+   }
+
+   if (height < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(height < 0)", name);
       return;
    }
 
@@ -3301,7 +4008,8 @@ invalidate_framebuffer_storage(GLenum target, GLsizei numAttachments,
    return;
 
 invalid_enum:
-   _mesa_error(ctx, GL_INVALID_ENUM, "%s(attachment)", name);
+   _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid attachment %s)", name,
+               _mesa_lookup_enum_by_nr(attachments[i]));
    return;
 }
 
@@ -3311,16 +4019,67 @@ _mesa_InvalidateSubFramebuffer(GLenum target, GLsizei numAttachments,
                                const GLenum *attachments, GLint x, GLint y,
                                GLsizei width, GLsizei height)
 {
-   invalidate_framebuffer_storage(target, numAttachments, attachments,
+   struct gl_framebuffer *fb;
+   GET_CURRENT_CONTEXT(ctx);
+
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glInvalidateSubFramebuffer(invalid target %s)",
+                  _mesa_lookup_enum_by_nr(target));
+      return;
+   }
+
+   invalidate_framebuffer_storage(ctx, fb, numAttachments, attachments,
                                   x, y, width, height,
                                   "glInvalidateSubFramebuffer");
 }
 
 
 void GLAPIENTRY
+_mesa_InvalidateNamedFramebufferSubData(GLuint framebuffer,
+                                        GLsizei numAttachments,
+                                        const GLenum *attachments,
+                                        GLint x, GLint y,
+                                        GLsizei width, GLsizei height)
+{
+   struct gl_framebuffer *fb;
+   GET_CURRENT_CONTEXT(ctx);
+
+   /* The OpenGL 4.5 core spec (02.02.2015) says (in Section 17.4 Whole
+    * Framebuffer Operations, PDF page 522): "If framebuffer is zero, the
+    * default draw framebuffer is affected."
+    */
+   if (framebuffer) {
+      fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                                        "glInvalidateNamedFramebufferSubData");
+      if (!fb)
+         return;
+   }
+   else
+      fb = ctx->WinSysDrawBuffer;
+
+   invalidate_framebuffer_storage(ctx, fb, numAttachments, attachments,
+                                  x, y, width, height,
+                                  "glInvalidateNamedFramebufferSubData");
+}
+
+
+void GLAPIENTRY
 _mesa_InvalidateFramebuffer(GLenum target, GLsizei numAttachments,
                             const GLenum *attachments)
 {
+   struct gl_framebuffer *fb;
+   GET_CURRENT_CONTEXT(ctx);
+
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glInvalidateFramebuffer(invalid target %s)",
+                  _mesa_lookup_enum_by_nr(target));
+      return;
+   }
+
    /* The GL_ARB_invalidate_subdata spec says:
     *
     *     "The command
@@ -3333,7 +4092,7 @@ _mesa_InvalidateFramebuffer(GLenum target, GLsizei numAttachments,
     *     <width>, <height> equal to 0, 0, <MAX_VIEWPORT_DIMS[0]>,
     *     <MAX_VIEWPORT_DIMS[1]> respectively."
     */
-   invalidate_framebuffer_storage(target, numAttachments, attachments,
+   invalidate_framebuffer_storage(ctx, fb, numAttachments, attachments,
                                   0, 0,
                                   MAX_VIEWPORT_WIDTH, MAX_VIEWPORT_HEIGHT,
                                   "glInvalidateFramebuffer");
@@ -3341,6 +4100,46 @@ _mesa_InvalidateFramebuffer(GLenum target, GLsizei numAttachments,
 
 
 void GLAPIENTRY
+_mesa_InvalidateNamedFramebufferData(GLuint framebuffer,
+                                     GLsizei numAttachments,
+                                     const GLenum *attachments)
+{
+   struct gl_framebuffer *fb;
+   GET_CURRENT_CONTEXT(ctx);
+
+   /* The OpenGL 4.5 core spec (02.02.2015) says (in Section 17.4 Whole
+    * Framebuffer Operations, PDF page 522): "If framebuffer is zero, the
+    * default draw framebuffer is affected."
+    */
+   if (framebuffer) {
+      fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                                        "glInvalidateNamedFramebufferData");
+      if (!fb)
+         return;
+   }
+   else
+      fb = ctx->WinSysDrawBuffer;
+
+   /* The GL_ARB_invalidate_subdata spec says:
+    *
+    *     "The command
+    *
+    *        void InvalidateFramebuffer(enum target,
+    *                                   sizei numAttachments,
+    *                                   const enum *attachments);
+    *
+    *     is equivalent to the command InvalidateSubFramebuffer with <x>, <y>,
+    *     <width>, <height> equal to 0, 0, <MAX_VIEWPORT_DIMS[0]>,
+    *     <MAX_VIEWPORT_DIMS[1]> respectively."
+    */
+   invalidate_framebuffer_storage(ctx, fb, numAttachments, attachments,
+                                  0, 0,
+                                  MAX_VIEWPORT_WIDTH, MAX_VIEWPORT_HEIGHT,
+                                  "glInvalidateNamedFramebufferData");
+}
+
+
+void GLAPIENTRY
 _mesa_DiscardFramebufferEXT(GLenum target, GLsizei numAttachments,
                             const GLenum *attachments)
 {
diff --git a/src/mesa/main/fbobject.h b/src/mesa/main/fbobject.h
index 61aa1f50308..8dad0ff34e7 100644
--- a/src/mesa/main/fbobject.h
+++ b/src/mesa/main/fbobject.h
@@ -64,9 +64,17 @@ _mesa_get_incomplete_framebuffer(void);
 extern struct gl_renderbuffer *
 _mesa_lookup_renderbuffer(struct gl_context *ctx, GLuint id);
 
+extern struct gl_renderbuffer *
+_mesa_lookup_renderbuffer_err(struct gl_context *ctx, GLuint id,
+                              const char *func);
+
 extern struct gl_framebuffer *
 _mesa_lookup_framebuffer(struct gl_context *ctx, GLuint id);
 
+extern struct gl_framebuffer *
+_mesa_lookup_framebuffer_err(struct gl_context *ctx, GLuint id,
+                             const char *func);
+
 
 void
 _mesa_update_texture_renderbuffer(struct gl_context *ctx,
@@ -74,9 +82,17 @@ _mesa_update_texture_renderbuffer(struct gl_context *ctx,
                                   struct gl_renderbuffer_attachment *att);
 
 extern void
+_mesa_FramebufferRenderbuffer_sw(struct gl_context *ctx,
+                                 struct gl_framebuffer *fb,
+                                 GLenum attachment,
+                                 struct gl_renderbuffer *rb);
+
+extern void
 _mesa_framebuffer_renderbuffer(struct gl_context *ctx,
                                struct gl_framebuffer *fb,
-                               GLenum attachment, struct gl_renderbuffer *rb);
+                               GLenum attachment,
+                               struct gl_renderbuffer *rb,
+                               const char *func);
 
 extern void
 _mesa_validate_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb);
@@ -99,6 +115,24 @@ _mesa_detach_renderbuffer(struct gl_context *ctx,
                           struct gl_framebuffer *fb,
                           const void *att);
 
+extern void
+_mesa_framebuffer_texture(struct gl_context *ctx, struct gl_framebuffer *fb,
+                          GLenum attachment,
+                          struct gl_texture_object *texObj, GLenum textarget,
+                          GLint level, GLuint layer, GLboolean layered,
+                          const char *caller);
+
+extern GLenum
+_mesa_check_framebuffer_status(struct gl_context *ctx,
+                               struct gl_framebuffer *fb);
+
+extern void
+_mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
+                                           struct gl_framebuffer *buffer,
+                                           GLenum attachment, GLenum pname,
+                                           GLint *params, const char *caller);
+
+
 extern GLboolean GLAPIENTRY
 _mesa_IsRenderbuffer(GLuint renderbuffer);
 
@@ -165,9 +199,15 @@ _mesa_DeleteFramebuffers(GLsizei n, const GLuint *framebuffers);
 extern void GLAPIENTRY
 _mesa_GenFramebuffers(GLsizei n, GLuint *framebuffers);
 
+extern void GLAPIENTRY
+_mesa_CreateFramebuffers(GLsizei n, GLuint *framebuffers);
+
 extern GLenum GLAPIENTRY
 _mesa_CheckFramebufferStatus(GLenum target);
 
+extern GLenum GLAPIENTRY
+_mesa_CheckNamedFramebufferStatus(GLuint framebuffer, GLenum target);
+
 extern void GLAPIENTRY
 _mesa_FramebufferTexture1D(GLenum target, GLenum attachment,
                               GLenum textarget, GLuint texture, GLint level);
@@ -179,24 +219,49 @@ _mesa_FramebufferTexture2D(GLenum target, GLenum attachment,
 extern void GLAPIENTRY
 _mesa_FramebufferTexture3D(GLenum target, GLenum attachment,
                               GLenum textarget, GLuint texture,
-                              GLint level, GLint zoffset);
+                              GLint level, GLint layer);
 
 extern void GLAPIENTRY
 _mesa_FramebufferTextureLayer(GLenum target, GLenum attachment,
                                  GLuint texture, GLint level, GLint layer);
 
 extern void GLAPIENTRY
+_mesa_NamedFramebufferTextureLayer(GLuint framebuffer, GLenum attachment,
+                                   GLuint texture, GLint level, GLint layer);
+
+extern void GLAPIENTRY
 _mesa_FramebufferTexture(GLenum target, GLenum attachment,
                          GLuint texture, GLint level);
 
 extern void GLAPIENTRY
+_mesa_NamedFramebufferTexture(GLuint framebuffer, GLenum attachment,
+                              GLuint texture, GLint level);
+
+extern void GLAPIENTRY
 _mesa_FramebufferRenderbuffer(GLenum target, GLenum attachment,
                                  GLenum renderbuffertarget,
                                  GLuint renderbuffer);
 
 extern void GLAPIENTRY
+_mesa_NamedFramebufferRenderbuffer(GLuint framebuffer, GLenum attachment,
+                                   GLenum renderbuffertarget,
+                                   GLuint renderbuffer);
+
+extern void GLAPIENTRY
 _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
                                              GLenum pname, GLint *params);
+extern void GLAPIENTRY
+_mesa_GetNamedFramebufferAttachmentParameteriv(GLuint framebuffer,
+                                               GLenum attachment,
+                                               GLenum pname, GLint *params);
+
+extern void GLAPIENTRY
+_mesa_NamedFramebufferParameteri(GLuint framebuffer, GLenum pname,
+                                 GLint param);
+
+extern void GLAPIENTRY
+_mesa_GetNamedFramebufferParameteriv(GLuint framebuffer, GLenum pname,
+                                     GLint *param);
 
 extern void GLAPIENTRY
 _mesa_InvalidateSubFramebuffer(GLenum target, GLsizei numAttachments,
@@ -204,11 +269,29 @@ _mesa_InvalidateSubFramebuffer(GLenum target, GLsizei numAttachments,
                                GLsizei width, GLsizei height);
 
 extern void GLAPIENTRY
+_mesa_InvalidateNamedFramebufferSubData(GLuint framebuffer,
+                                        GLsizei numAttachments,
+                                        const GLenum *attachments,
+                                        GLint x, GLint y,
+                                        GLsizei width, GLsizei height);
+
+extern void GLAPIENTRY
 _mesa_InvalidateFramebuffer(GLenum target, GLsizei numAttachments,
                             const GLenum *attachments);
 
 extern void GLAPIENTRY
+_mesa_InvalidateNamedFramebufferData(GLuint framebuffer,
+                                     GLsizei numAttachments,
+                                     const GLenum *attachments);
+
+extern void GLAPIENTRY
 _mesa_DiscardFramebufferEXT(GLenum target, GLsizei numAttachments,
                             const GLenum *attachments);
 
+extern void GLAPIENTRY
+_mesa_FramebufferParameteri(GLenum target, GLenum pname, GLint param);
+
+extern void GLAPIENTRY
+_mesa_GetFramebufferParameteriv(GLenum target, GLenum pname, GLint *params);
+
 #endif /* FBOBJECT_H */
diff --git a/src/mesa/main/formats.c b/src/mesa/main/formats.c
index 8af44e90520..baeb1bfe5de 100644
--- a/src/mesa/main/formats.c
+++ b/src/mesa/main/formats.c
@@ -397,6 +397,11 @@ format_array_format_table_init(void)
    format_array_format_table = _mesa_hash_table_create(NULL, NULL,
                                                        array_formats_equal);
 
+   if (!format_array_format_table) {
+      _mesa_error_no_memory(__func__);
+      return;
+   }
+
    for (f = 1; f < MESA_FORMAT_COUNT; ++f) {
       info = _mesa_get_format_info(f);
       if (!info->ArrayFormat)
@@ -432,6 +437,12 @@ _mesa_format_from_array_format(uint32_t array_format)
 
    call_once(&format_array_format_table_exists, format_array_format_table_init);
 
+   if (!format_array_format_table) {
+      static const once_flag once_flag_init = ONCE_FLAG_INIT;
+      format_array_format_table_exists = once_flag_init;
+      return MESA_FORMAT_NONE;
+   }
+
    entry = _mesa_hash_table_search_pre_hashed(format_array_format_table,
                                               array_format,
                                               (void *)(intptr_t)array_format);
diff --git a/src/mesa/main/framebuffer.c b/src/mesa/main/framebuffer.c
index 4f7736a64d0..77c04b8dab8 100644
--- a/src/mesa/main/framebuffer.c
+++ b/src/mesa/main/framebuffer.c
@@ -157,6 +157,7 @@ _mesa_initialize_window_framebuffer(struct gl_framebuffer *fb,
    fb->_Status = GL_FRAMEBUFFER_COMPLETE_EXT;
    fb->_AllColorBuffersFixedPoint = !visual->floatMode;
    fb->_HasSNormOrFloatColorBuffer = visual->floatMode;
+   fb->_HasAttachments = true;
 
    compute_depth_max(fb);
 }
@@ -312,7 +313,7 @@ _mesa_resize_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb,
 
    if (ctx) {
       /* update scissor / window bounds */
-      _mesa_update_draw_buffer_bounds(ctx);
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
       /* Signal new buffer state so that swrast will update its clipping
        * info (the CLIP_BIT flag).
        */
@@ -356,30 +357,20 @@ update_framebuffer_size(struct gl_context *ctx, struct gl_framebuffer *fb)
 }
 
 
+
 /**
- * Calculate the inclusive bounding box for the scissor of a specific viewport
+ * Given a bounding box, intersect the bounding box with the scissor of
+ * a specified vieport.
  *
  * \param ctx     GL context.
- * \param buffer  Framebuffer to be checked against
  * \param idx     Index of the desired viewport
  * \param bbox    Bounding box for the scissored viewport.  Stored as xmin,
  *                xmax, ymin, ymax.
- *
- * \warning This function assumes that the framebuffer dimensions are up to
- * date (e.g., update_framebuffer_size has been recently called on \c buffer).
- *
- * \sa _mesa_clip_to_region
  */
 void
-_mesa_scissor_bounding_box(const struct gl_context *ctx,
-                           const struct gl_framebuffer *buffer,
-                           unsigned idx, int *bbox)
+_mesa_intersect_scissor_bounding_box(const struct gl_context *ctx,
+                                     unsigned idx, int *bbox)
 {
-   bbox[0] = 0;
-   bbox[2] = 0;
-   bbox[1] = buffer->Width;
-   bbox[3] = buffer->Height;
-
    if (ctx->Scissor.EnableFlags & (1u << idx)) {
       if (ctx->Scissor.ScissorArray[idx].X > bbox[0]) {
          bbox[0] = ctx->Scissor.ScissorArray[idx].X;
@@ -401,6 +392,33 @@ _mesa_scissor_bounding_box(const struct gl_context *ctx,
          bbox[2] = bbox[3];
       }
    }
+}
+
+/**
+ * Calculate the inclusive bounding box for the scissor of a specific viewport
+ *
+ * \param ctx     GL context.
+ * \param buffer  Framebuffer to be checked against
+ * \param idx     Index of the desired viewport
+ * \param bbox    Bounding box for the scissored viewport.  Stored as xmin,
+ *                xmax, ymin, ymax.
+ *
+ * \warning This function assumes that the framebuffer dimensions are up to
+ * date (e.g., update_framebuffer_size has been recently called on \c buffer).
+ *
+ * \sa _mesa_clip_to_region
+ */
+void
+_mesa_scissor_bounding_box(const struct gl_context *ctx,
+                           const struct gl_framebuffer *buffer,
+                           unsigned idx, int *bbox)
+{
+   bbox[0] = 0;
+   bbox[2] = 0;
+   bbox[1] = buffer->Width;
+   bbox[3] = buffer->Height;
+
+   _mesa_intersect_scissor_bounding_box(ctx, idx, bbox);
 
    assert(bbox[0] <= bbox[1]);
    assert(bbox[2] <= bbox[3]);
@@ -413,9 +431,9 @@ _mesa_scissor_bounding_box(const struct gl_context *ctx,
  * \param ctx  the GL context.
  */
 void
-_mesa_update_draw_buffer_bounds(struct gl_context *ctx)
+_mesa_update_draw_buffer_bounds(struct gl_context *ctx,
+                                struct gl_framebuffer *buffer)
 {
-   struct gl_framebuffer *buffer = ctx->DrawBuffer;
    int bbox[4];
 
    if (!buffer)
@@ -652,7 +670,7 @@ update_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb)
        * context state (GL_READ_BUFFER too).
        */
       if (fb->ColorDrawBuffer[0] != ctx->Color.DrawBuffer[0]) {
-         _mesa_drawbuffers(ctx, ctx->Const.MaxDrawBuffers,
+         _mesa_drawbuffers(ctx, fb, ctx->Const.MaxDrawBuffers,
                            ctx->Color.DrawBuffer, NULL);
       }
    }
@@ -678,24 +696,21 @@ update_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb)
 
 
 /**
- * Update state related to the current draw/read framebuffers.
+ * Update state related to the draw/read framebuffers.
  */
 void
-_mesa_update_framebuffer(struct gl_context *ctx)
+_mesa_update_framebuffer(struct gl_context *ctx,
+                         struct gl_framebuffer *readFb,
+                         struct gl_framebuffer *drawFb)
 {
-   struct gl_framebuffer *drawFb;
-   struct gl_framebuffer *readFb;
-
    assert(ctx);
-   drawFb = ctx->DrawBuffer;
-   readFb = ctx->ReadBuffer;
 
    update_framebuffer(ctx, drawFb);
    if (readFb != drawFb)
       update_framebuffer(ctx, readFb);
 
-   _mesa_update_clamp_vertex_color(ctx);
-   _mesa_update_clamp_fragment_color(ctx);
+   _mesa_update_clamp_vertex_color(ctx, drawFb);
+   _mesa_update_clamp_fragment_color(ctx, drawFb);
 }
 
 
diff --git a/src/mesa/main/framebuffer.h b/src/mesa/main/framebuffer.h
index a4274216ec2..08e43222045 100644
--- a/src/mesa/main/framebuffer.h
+++ b/src/mesa/main/framebuffer.h
@@ -75,16 +75,50 @@ extern void
 _mesa_scissor_bounding_box(const struct gl_context *ctx,
                            const struct gl_framebuffer *buffer,
                            unsigned idx, int *bbox);
+extern void
+_mesa_intersect_scissor_bounding_box(const struct gl_context *ctx,
+                                     unsigned idx, int *bbox);
+
+static inline GLuint
+_mesa_geometric_width(const struct gl_framebuffer *buffer)
+{
+   return buffer->_HasAttachments ?
+      buffer->Width : buffer->DefaultGeometry.Width;
+}
+
+static inline GLuint
+_mesa_geometric_height(const struct gl_framebuffer *buffer)
+{
+   return buffer->_HasAttachments ?
+      buffer->Height : buffer->DefaultGeometry.Height;
+}
+
+static inline GLuint
+_mesa_geometric_samples(const struct gl_framebuffer *buffer)
+{
+   return buffer->_HasAttachments ?
+      buffer->Visual.samples : buffer->DefaultGeometry.NumSamples;
+}
+
+static inline GLuint
+_mesa_geometric_layers(const struct gl_framebuffer *buffer)
+{
+   return buffer->_HasAttachments ?
+      buffer->MaxNumLayers : buffer->DefaultGeometry.Layers;
+}
 
 extern void 
-_mesa_update_draw_buffer_bounds(struct gl_context *ctx);
+_mesa_update_draw_buffer_bounds(struct gl_context *ctx,
+                                struct gl_framebuffer *drawFb);
 
 extern void
 _mesa_update_framebuffer_visual(struct gl_context *ctx,
 				struct gl_framebuffer *fb);
 
 extern void
-_mesa_update_framebuffer(struct gl_context *ctx);
+_mesa_update_framebuffer(struct gl_context *ctx,
+                         struct gl_framebuffer *readFb,
+                         struct gl_framebuffer *drawFb);
 
 extern GLboolean
 _mesa_source_buffer_exists(struct gl_context *ctx, GLenum format);
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index a881bc589ba..3d6d63916b3 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -138,6 +138,7 @@ enum value_extra {
    EXTRA_API_GL_CORE,
    EXTRA_API_ES2,
    EXTRA_API_ES3,
+   EXTRA_API_ES31,
    EXTRA_NEW_BUFFERS, 
    EXTRA_NEW_FRAG_CLAMP,
    EXTRA_VALID_DRAW_BUFFER,
@@ -348,6 +349,12 @@ static const int extra_ARB_shader_image_load_store_and_geometry_shader[] = {
    EXTRA_END
 };
 
+static const int extra_ARB_draw_indirect_es31[] = {
+   EXT(ARB_draw_indirect),
+   EXTRA_API_ES31,
+   EXTRA_END
+};
+
 EXTRA_EXT(ARB_texture_cube_map);
 EXTRA_EXT(EXT_texture_array);
 EXTRA_EXT(NV_fog_distance);
@@ -393,6 +400,7 @@ EXTRA_EXT(INTEL_performance_query);
 EXTRA_EXT(ARB_explicit_uniform_location);
 EXTRA_EXT(ARB_clip_control);
 EXTRA_EXT(EXT_polygon_offset_clamp);
+EXTRA_EXT(ARB_framebuffer_no_attachments);
 
 static const int
 extra_ARB_color_buffer_float_or_glcore[] = {
@@ -909,13 +917,13 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu
       break;
 
    case GL_FOG_COLOR:
-      if (_mesa_get_clamp_fragment_color(ctx))
+      if (_mesa_get_clamp_fragment_color(ctx, ctx->DrawBuffer))
          COPY_4FV(v->value_float_4, ctx->Fog.Color);
       else
          COPY_4FV(v->value_float_4, ctx->Fog.ColorUnclamped);
       break;
    case GL_COLOR_CLEAR_VALUE:
-      if (_mesa_get_clamp_fragment_color(ctx)) {
+      if (_mesa_get_clamp_fragment_color(ctx, ctx->DrawBuffer)) {
          v->value_float_4[0] = CLAMP(ctx->Color.ClearColor.f[0], 0.0F, 1.0F);
          v->value_float_4[1] = CLAMP(ctx->Color.ClearColor.f[1], 0.0F, 1.0F);
          v->value_float_4[2] = CLAMP(ctx->Color.ClearColor.f[2], 0.0F, 1.0F);
@@ -924,13 +932,13 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu
          COPY_4FV(v->value_float_4, ctx->Color.ClearColor.f);
       break;
    case GL_BLEND_COLOR_EXT:
-      if (_mesa_get_clamp_fragment_color(ctx))
+      if (_mesa_get_clamp_fragment_color(ctx, ctx->DrawBuffer))
          COPY_4FV(v->value_float_4, ctx->Color.BlendColor);
       else
          COPY_4FV(v->value_float_4, ctx->Color.BlendColorUnclamped);
       break;
    case GL_ALPHA_TEST_REF:
-      if (_mesa_get_clamp_fragment_color(ctx))
+      if (_mesa_get_clamp_fragment_color(ctx, ctx->DrawBuffer))
          v->value_float = ctx->Color.AlphaRef;
       else
          v->value_float = ctx->Color.AlphaRefUnclamped;
@@ -1078,6 +1086,11 @@ check_extra(struct gl_context *ctx, const char *func, const struct value_desc *d
          if (_mesa_is_gles3(ctx))
             api_found = GL_TRUE;
 	 break;
+      case EXTRA_API_ES31:
+         api_check = GL_TRUE;
+         if (_mesa_is_gles31(ctx))
+            api_found = GL_TRUE;
+	 break;
       case EXTRA_API_GL:
          api_check = GL_TRUE;
          if (_mesa_is_desktop_gl(ctx))
@@ -1911,6 +1924,7 @@ find_value_indexed(const char *func, GLenum pname, GLuint index, union value *v)
       if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs)
           goto invalid_value;
       v->value_int = ctx->Array.VAO->VertexBinding[VERT_ATTRIB_GENERIC(index)].Stride;
+      return TYPE_INT;
 
    /* ARB_shader_image_load_store */
    case GL_IMAGE_BINDING_NAME: {
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 41cb2c17b60..74ff3ba6619 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -409,6 +409,12 @@ descriptor=[
   [ "SAMPLER_BINDING", "LOC_CUSTOM, TYPE_INT, GL_SAMPLER_BINDING, NO_EXTRA" ],
 ]},
 
+# Enums in OpenGL Core profile and ES 3.1
+{ "apis": ["GL_CORE", "GLES3"], "params": [
+# GL_ARB_draw_indirect / GLES 3.1
+  [ "DRAW_INDIRECT_BUFFER_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_draw_indirect_es31" ],
+]},
+
 # Remaining enums are only in OpenGL
 { "apis": ["GL", "GL_CORE"], "params": [
   [ "ACCUM_RED_BITS", "BUFFER_INT(Visual.accumRedBits), NO_EXTRA" ],
@@ -793,19 +799,20 @@ descriptor=[
   [ "MAX_COMPUTE_UNIFORM_COMPONENTS", "CONST(MAX_COMPUTE_UNIFORM_COMPONENTS), extra_ARB_compute_shader" ],
   [ "MAX_COMPUTE_IMAGE_UNIFORMS", "CONST(MAX_COMPUTE_IMAGE_UNIFORMS), extra_ARB_compute_shader" ],
 
-# GL_ARB_gpu_shader5
-  [ "MAX_GEOMETRY_SHADER_INVOCATIONS", "CONST(MAX_GEOMETRY_SHADER_INVOCATIONS), extra_ARB_gpu_shader5" ],
-  [ "MIN_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MinFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
-  [ "MAX_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MaxFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
-  [ "FRAGMENT_INTERPOLATION_OFFSET_BITS", "CONST(FRAGMENT_INTERPOLATION_OFFSET_BITS), extra_ARB_gpu_shader5" ],
+# GL_ARB_framebuffer_no_attachments
+  ["MAX_FRAMEBUFFER_WIDTH", "CONTEXT_INT(Const.MaxFramebufferWidth), extra_ARB_framebuffer_no_attachments"],
+  ["MAX_FRAMEBUFFER_HEIGHT", "CONTEXT_INT(Const.MaxFramebufferHeight), extra_ARB_framebuffer_no_attachments"],
+  ["MAX_FRAMEBUFFER_LAYERS", "CONTEXT_INT(Const.MaxFramebufferLayers), extra_ARB_framebuffer_no_attachments"],
+  ["MAX_FRAMEBUFFER_SAMPLES", "CONTEXT_INT(Const.MaxFramebufferSamples), extra_ARB_framebuffer_no_attachments"],
+
+# GL_EXT_polygon_offset_clamp
+  [ "POLYGON_OFFSET_CLAMP_EXT", "CONTEXT_FLOAT(Polygon.OffsetClamp), extra_EXT_polygon_offset_clamp" ],
 ]},
 
 # Enums restricted to OpenGL Core profile
 { "apis": ["GL_CORE"], "params": [
 # GL_ARB_texture_buffer_range
   [ "TEXTURE_BUFFER_OFFSET_ALIGNMENT", "CONTEXT_INT(Const.TextureBufferOffsetAlignment), extra_ARB_texture_buffer_range" ],
-# GL_ARB_draw_indirect
-  [ "DRAW_INDIRECT_BUFFER_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_draw_indirect" ],
 
 # GL_ARB_viewport_array
   [ "MAX_VIEWPORTS", "CONTEXT_INT(Const.MaxViewports), extra_ARB_viewport_array" ],
@@ -814,8 +821,11 @@ descriptor=[
   [ "LAYER_PROVOKING_VERTEX", "CONTEXT_ENUM(Light.ProvokingVertex), extra_ARB_viewport_array" ],
   [ "VIEWPORT_INDEX_PROVOKING_VERTEX", "CONTEXT_ENUM(Light.ProvokingVertex), extra_ARB_viewport_array" ],
 
-# GL_EXT_polygon_offset_clamp
-  [ "POLYGON_OFFSET_CLAMP_EXT", "CONTEXT_FLOAT(Polygon.OffsetClamp), extra_EXT_polygon_offset_clamp" ],
+# GL_ARB_gpu_shader5
+  [ "MAX_GEOMETRY_SHADER_INVOCATIONS", "CONST(MAX_GEOMETRY_SHADER_INVOCATIONS), extra_ARB_gpu_shader5" ],
+  [ "MIN_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MinFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
+  [ "MAX_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MaxFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
+  [ "FRAGMENT_INTERPOLATION_OFFSET_BITS", "CONST(FRAGMENT_INTERPOLATION_OFFSET_BITS), extra_ARB_gpu_shader5" ],
 ]}
 
 ]
diff --git a/src/mesa/main/getstring.c b/src/mesa/main/getstring.c
index 1b2c7f054f6..72d99ca4e22 100644
--- a/src/mesa/main/getstring.c
+++ b/src/mesa/main/getstring.c
@@ -72,10 +72,18 @@ shading_language_version(struct gl_context *ctx)
       break;
 
    case API_OPENGLES2:
-      return (ctx->Version < 30)
-         ? (const GLubyte *) "OpenGL ES GLSL ES 1.0.16"
-         : (const GLubyte *) "OpenGL ES GLSL ES 3.00";
-
+      switch (ctx->Version) {
+      case 20:
+         return (const GLubyte *) "OpenGL ES GLSL ES 1.0.16";
+      case 30:
+         return (const GLubyte *) "OpenGL ES GLSL ES 3.00";
+      case 31:
+         return (const GLubyte *) "OpenGL ES GLSL ES 3.10";
+      default:
+         _mesa_problem(ctx,
+                       "Invalid OpenGL ES version in shading_language_version()");
+         return (const GLubyte *) 0;
+      }
    case API_OPENGLES:
       /* fall-through */
 
diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
index 8ced5794938..ac69fabccaa 100644
--- a/src/mesa/main/glformats.c
+++ b/src/mesa/main/glformats.c
@@ -1200,7 +1200,7 @@ _mesa_is_depth_or_stencil_format(GLenum format)
  * \return GL_TRUE if compressed, GL_FALSE if uncompressed
  */
 GLboolean
-_mesa_is_compressed_format(struct gl_context *ctx, GLenum format)
+_mesa_is_compressed_format(const struct gl_context *ctx, GLenum format)
 {
    switch (format) {
    case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
@@ -1678,6 +1678,10 @@ _mesa_error_check_format_and_type(const struct gl_context *ctx,
       case GL_LUMINANCE:
       case GL_ALPHA:
          return GL_NO_ERROR;
+      case GL_RG:
+      case GL_RED:
+	 if (_mesa_is_gles3(ctx) || ctx->Extensions.ARB_texture_rg)
+            return GL_NO_ERROR;
       default:
          return GL_INVALID_OPERATION;
       }
@@ -2292,8 +2296,18 @@ _mesa_es3_error_check_format_and_type(const struct gl_context *ctx,
          break;
 
       case GL_HALF_FLOAT:
-         if (internalFormat != GL_RG16F)
-            return GL_INVALID_OPERATION;
+      case GL_HALF_FLOAT_OES:
+         switch (internalFormat) {
+            case GL_RG16F:
+               break;
+            case GL_RG:
+               if (ctx->Extensions.ARB_texture_rg &&
+                   ctx->Extensions.OES_texture_half_float)
+                  break;
+            /* fallthrough */
+            default:
+               return GL_INVALID_OPERATION;
+         }
          break;
 
       case GL_FLOAT:
@@ -2301,6 +2315,11 @@ _mesa_es3_error_check_format_and_type(const struct gl_context *ctx,
          case GL_RG16F:
          case GL_RG32F:
             break;
+         case GL_RG:
+            if (ctx->Extensions.ARB_texture_rg &&
+                ctx->Extensions.OES_texture_float)
+               break;
+            /* fallthrough */
          default:
             return GL_INVALID_OPERATION;
          }
@@ -2361,8 +2380,19 @@ _mesa_es3_error_check_format_and_type(const struct gl_context *ctx,
          break;
 
       case GL_HALF_FLOAT:
-         if (internalFormat != GL_R16F)
+      case GL_HALF_FLOAT_OES:
+         switch (internalFormat) {
+         case GL_R16F:
+            break;
+         case GL_RG:
+         case GL_RED:
+            if (ctx->Extensions.ARB_texture_rg &&
+                ctx->Extensions.OES_texture_half_float)
+               break;
+            /* fallthrough */
+         default:
             return GL_INVALID_OPERATION;
+         }
          break;
 
       case GL_FLOAT:
@@ -2370,6 +2400,11 @@ _mesa_es3_error_check_format_and_type(const struct gl_context *ctx,
          case GL_R16F:
          case GL_R32F:
             break;
+         case GL_RED:
+            if (ctx->Extensions.ARB_texture_rg &&
+                ctx->Extensions.OES_texture_float)
+               break;
+            /* fallthrough */
          default:
             return GL_INVALID_OPERATION;
          }
diff --git a/src/mesa/main/glformats.h b/src/mesa/main/glformats.h
index e1ecd64d5f9..8881cb7d86b 100644
--- a/src/mesa/main/glformats.h
+++ b/src/mesa/main/glformats.h
@@ -96,7 +96,7 @@ extern GLboolean
 _mesa_is_depth_or_stencil_format(GLenum format);
 
 extern GLboolean
-_mesa_is_compressed_format(struct gl_context *ctx, GLenum format);
+_mesa_is_compressed_format(const struct gl_context *ctx, GLenum format);
 
 extern GLenum
 _mesa_base_format_to_integer_format(GLenum format);
diff --git a/src/mesa/main/glheader.h b/src/mesa/main/glheader.h
index 7f7f9a39b3b..a2d98d4ddff 100644
--- a/src/mesa/main/glheader.h
+++ b/src/mesa/main/glheader.h
@@ -135,12 +135,6 @@ typedef void *GLeglImageOES;
 #define GL_SHADER_PROGRAM_MESA 0x9999
 
 
-/**
- * Internal token for geometry programs.
- * Use the value for GL_GEOMETRY_PROGRAM_NV for now.
- */
-#define MESA_GEOMETRY_PROGRAM 0x8c26
-
 /* Several fields of struct gl_config can take these as values.  Since
  * GLX header files may not be available everywhere they need to be used,
  * redefine them here.
diff --git a/src/mesa/main/hash.c b/src/mesa/main/hash.c
index d04cccd94d2..315b5d64004 100644
--- a/src/mesa/main/hash.c
+++ b/src/mesa/main/hash.c
@@ -389,34 +389,6 @@ _mesa_HashDeleteAll(struct _mesa_HashTable *table,
 
 
 /**
- * Clone all entries in a hash table, into a new table.
- *
- * \param table  the hash table to clone
- */
-struct _mesa_HashTable *
-_mesa_HashClone(const struct _mesa_HashTable *table)
-{
-   /* cast-away const */
-   struct _mesa_HashTable *table2 = (struct _mesa_HashTable *) table;
-   struct hash_entry *entry;
-   struct _mesa_HashTable *clonetable;
-
-   assert(table);
-   mtx_lock(&table2->Mutex);
-
-   clonetable = _mesa_NewHashTable();
-   assert(clonetable);
-   hash_table_foreach(table->ht, entry) {
-      _mesa_HashInsert(clonetable, (GLint)(uintptr_t)entry->key, entry->data);
-   }
-
-   mtx_unlock(&table2->Mutex);
-
-   return clonetable;
-}
-
-
-/**
  * Walk over all entries in a hash table, calling callback function for each.
  * Note: we use a separate mutex in this function to avoid a recursive
  * locking deadlock (in case the callback calls _mesa_HashRemove()) and to
diff --git a/src/mesa/main/hash.h b/src/mesa/main/hash.h
index e3e8f492e8b..da3b9973d24 100644
--- a/src/mesa/main/hash.h
+++ b/src/mesa/main/hash.h
@@ -59,9 +59,6 @@ _mesa_HashDeleteAll(struct _mesa_HashTable *table,
                     void (*callback)(GLuint key, void *data, void *userData),
                     void *userData);
 
-extern struct _mesa_HashTable *
-_mesa_HashClone(const struct _mesa_HashTable *table);
-
 extern void
 _mesa_HashWalk(const struct _mesa_HashTable *table,
                void (*callback)(GLuint key, void *data, void *userData),
diff --git a/src/mesa/main/imports.h b/src/mesa/main/imports.h
index c4d917ebba4..9ffe3decd0f 100644
--- a/src/mesa/main/imports.h
+++ b/src/mesa/main/imports.h
@@ -230,38 +230,6 @@ static inline int IFLOOR(float f)
 }
 
 
-/** Return (as an integer) ceiling of float */
-static inline int ICEIL(float f)
-{
-#if defined(USE_X86_ASM) && defined(__GNUC__) && defined(__i386__)
-   /*
-    * IEEE ceil for computers that round to nearest or even.
-    * 'f' must be between -4194304 and 4194303.
-    * This ceil operation is done by "(iround(f + .5) + iround(f - .5) + 1) >> 1",
-    * but uses some IEEE specific tricks for better speed.
-    * Contributed by Josh Vanderhoof
-    */
-   int ai, bi;
-   double af, bf;
-   af = (3 << 22) + 0.5 + (double)f;
-   bf = (3 << 22) + 0.5 - (double)f;
-   /* GCC generates an extra fstp/fld without this. */
-   __asm__ ("fstps %0" : "=m" (ai) : "t" (af) : "st");
-   __asm__ ("fstps %0" : "=m" (bi) : "t" (bf) : "st");
-   return (ai - bi + 1) >> 1;
-#else
-   int ai, bi;
-   double af, bf;
-   fi_type u;
-   af = (3 << 22) + 0.5 + (double)f;
-   bf = (3 << 22) + 0.5 - (double)f;
-   u.f = (float) af; ai = u.i;
-   u.f = (float) bf; bi = u.i;
-   return (ai - bi + 1) >> 1;
-#endif
-}
-
-
 /**
  * Is x a power of two?
  */
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index bd84113ea91..481fd5e7fdf 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -43,7 +43,6 @@
 #include "glapi/glapi.h"
 #include "math/m_matrix.h"	/* GLmatrix */
 #include "glsl/shader_enums.h"
-#include "util/simple_list.h"	/* struct simple_node */
 #include "main/formats.h"       /* MESA_FORMAT_COUNT */
 
 
@@ -398,7 +397,6 @@ struct gl_config
 {
    GLboolean rgbMode;
    GLboolean floatMode;
-   GLboolean colorIndexMode;  /* XXX is this used anywhere? */
    GLuint doubleBufferMode;
    GLuint stereoMode;
 
@@ -2099,8 +2097,6 @@ struct gl_program
    GLbitfield64 DoubleInputsRead;     /**< Bitmask of which input regs are read  and are doubles */
    GLbitfield64 OutputsWritten; /**< Bitmask of which output regs are written */
    GLbitfield SystemValuesRead;   /**< Bitmask of SYSTEM_VALUE_x inputs used */
-   GLbitfield InputFlags[MAX_PROGRAM_INPUTS];   /**< PROG_PARAM_BIT_x flags */
-   GLbitfield OutputFlags[MAX_PROGRAM_OUTPUTS]; /**< PROG_PARAM_BIT_x flags */
    GLbitfield TexturesUsed[MAX_COMBINED_TEXTURE_IMAGE_UNITS];  /**< TEXTURE_x_BIT bitmask */
    GLbitfield SamplersUsed;   /**< Bitfield of which samplers are used */
    GLbitfield ShadowSamplers; /**< Texture units used for shadow sampling. */
@@ -2275,16 +2271,10 @@ struct gl_vertex_program_state
  */
 struct gl_geometry_program_state
 {
-   GLboolean Enabled;               /**< GL_ARB_GEOMETRY_SHADER4 */
-   GLboolean _Enabled;              /**< Enabled and valid program? */
-   struct gl_geometry_program *Current;  /**< user-bound geometry program */
-
    /** Currently enabled and valid program (including internal programs
     * and compiled shader programs).
     */
    struct gl_geometry_program *_Current;
-
-   GLfloat Parameters[MAX_PROGRAM_ENV_PARAMS][4]; /**< Env params */
 };
 
 /**
@@ -2320,8 +2310,6 @@ struct gl_fragment_program_state
  */
 struct gl_compute_program_state
 {
-   struct gl_compute_program *Current;  /**< user-bound compute program */
-
    /** Currently enabled and valid program (including internal programs
     * and compiled shader programs).
     */
@@ -2733,7 +2721,7 @@ struct gl_shader_program
    } Comp;
 
    /* post-link info: */
-   unsigned NumUserUniformStorage;
+   unsigned NumUniformStorage;
    unsigned NumHiddenUniforms;
    struct gl_uniform_storage *UniformStorage;
 
@@ -2832,6 +2820,8 @@ struct gl_pipeline_object
 
    mtx_t Mutex;
 
+   GLchar *Label;   /**< GL_KHR_debug */
+
    /**
     * Programs used for rendering
     *
@@ -3009,7 +2999,6 @@ struct gl_shared_state
    struct _mesa_HashTable *Programs; /**< All vertex/fragment programs */
    struct gl_vertex_program *DefaultVertexProgram;
    struct gl_fragment_program *DefaultFragmentProgram;
-   struct gl_geometry_program *DefaultGeometryProgram;
    /*@}*/
 
    /* GL_ATI_fragment_shader */
@@ -3151,12 +3140,29 @@ struct gl_framebuffer
     */
    struct gl_config Visual;
 
-   GLuint Width, Height;	/**< size of frame buffer in pixels */
+   /**
+    * Size of frame buffer in pixels. If there are no attachments, then both
+    * of these are 0.
+    */
+   GLuint Width, Height;
 
-   /** \name  Drawing bounds (Intersection of buffer size and scissor box) */
+   /**
+    * In the case that the framebuffer has no attachment (i.e.
+    * GL_ARB_framebuffer_no_attachments) then the geometry of
+    * the framebuffer is specified by the default values.
+    */
+   struct {
+     GLuint Width, Height, Layers, NumSamples;
+     GLboolean FixedSampleLocations;
+   } DefaultGeometry;
+
+   /** \name  Drawing bounds (Intersection of buffer size and scissor box)
+    * The drawing region is given by [_Xmin, _Xmax) x [_Ymin, _Ymax),
+    * (inclusive for _Xmin and _Ymin while exclusive for _Xmax and _Ymax)
+    */
    /*@{*/
-   GLint _Xmin, _Xmax;  /**< inclusive */
-   GLint _Ymin, _Ymax;  /**< exclusive */
+   GLint _Xmin, _Xmax;
+   GLint _Ymin, _Ymax;
    /*@}*/
 
    /** \name  Derived Z buffer stuff */
@@ -3169,6 +3175,22 @@ struct gl_framebuffer
    /** One of the GL_FRAMEBUFFER_(IN)COMPLETE_* tokens */
    GLenum _Status;
 
+   /** Whether one of Attachment has Type != GL_NONE
+    * NOTE: the values for Width and Height are set to 0 in case of having
+    * no attachments, a backend driver supporting the extension
+    * GL_ARB_framebuffer_no_attachments must check for the flag _HasAttachments
+    * and if GL_FALSE, must then use the values in DefaultGeometry to initialize
+    * its viewport, scissor and so on (in particular _Xmin, _Xmax, _Ymin and
+    * _Ymax do NOT take into account _HasAttachments being false). To get the
+    * geometry of the framebuffer, the  helper functions
+    *   _mesa_geometric_width(),
+    *   _mesa_geometric_height(),
+    *   _mesa_geometric_samples() and
+    *   _mesa_geometric_layers()
+    * are available that check _HasAttachments.
+    */
+   bool _HasAttachments;
+
    /** Integer color values */
    GLboolean _IntegerColor;
 
@@ -3179,7 +3201,9 @@ struct gl_framebuffer
    /**
     * The maximum number of layers in the framebuffer, or 0 if the framebuffer
     * is not layered.  For cube maps and cube map arrays, each cube face
-    * counts as a layer.
+    * counts as a layer. As the case for Width, Height a backend driver
+    * supporting GL_ARB_framebuffer_no_attachments must use DefaultGeometry
+    * in the case that _HasAttachments is false
     */
    GLuint MaxNumLayers;
 
@@ -3358,6 +3382,14 @@ struct gl_constants
    GLuint MaxRenderbufferSize;   /**< GL_EXT_framebuffer_object */
    GLuint MaxSamples;            /**< GL_ARB_framebuffer_object */
 
+   /**
+    * GL_ARB_framebuffer_no_attachments
+    */
+   GLuint MaxFramebufferWidth;
+   GLuint MaxFramebufferHeight;
+   GLuint MaxFramebufferLayers;
+   GLuint MaxFramebufferSamples;
+
    /** Number of varying vectors between any two shader stages. */
    GLuint MaxVarying;
 
@@ -3635,6 +3667,7 @@ struct gl_extensions
    GLboolean ARB_fragment_program;
    GLboolean ARB_fragment_program_shadow;
    GLboolean ARB_fragment_shader;
+   GLboolean ARB_framebuffer_no_attachments;
    GLboolean ARB_framebuffer_object;
    GLboolean ARB_explicit_attrib_location;
    GLboolean ARB_explicit_uniform_location;
@@ -4422,7 +4455,12 @@ enum _debug
    DEBUG_INCOMPLETE_FBO         = (1 << 3)
 };
 
-
+static inline bool
+_mesa_active_fragment_shader_has_atomic_ops(const struct gl_context *ctx)
+{
+   return ctx->Shader._CurrentFragmentProgram != NULL &&
+      ctx->Shader._CurrentFragmentProgram->NumAtomicBuffers > 0;
+}
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/main/objectlabel.c b/src/mesa/main/objectlabel.c
index aecb5b1fa51..5626054687b 100644
--- a/src/mesa/main/objectlabel.c
+++ b/src/mesa/main/objectlabel.c
@@ -30,6 +30,7 @@
 #include "enums.h"
 #include "fbobject.h"
 #include "objectlabel.h"
+#include "pipelineobj.h"
 #include "queryobj.h"
 #include "samplerobj.h"
 #include "shaderobj.h"
@@ -214,8 +215,13 @@ get_label_pointer(struct gl_context *ctx, GLenum identifier, GLuint name,
       }
       break;
    case GL_PROGRAM_PIPELINE:
-      /* requires GL 4.2 */
-      goto invalid_enum;
+      {
+         struct gl_pipeline_object *pipe =
+            _mesa_lookup_pipeline_object(ctx, name);
+         if (pipe)
+            labelPtr = &pipe->Label;
+      }
+      break;
    default:
       goto invalid_enum;
    }
diff --git a/src/mesa/main/pipelineobj.c b/src/mesa/main/pipelineobj.c
index 0fefa7d568b..279ae2078fe 100644
--- a/src/mesa/main/pipelineobj.c
+++ b/src/mesa/main/pipelineobj.c
@@ -65,6 +65,7 @@ _mesa_delete_pipeline_object(struct gl_context *ctx,
 
    _mesa_reference_shader_program(ctx, &obj->ActiveProgram, NULL);
    mtx_destroy(&obj->Mutex);
+   free(obj->Label);
    ralloc_free(obj);
 }
 
@@ -136,8 +137,8 @@ _mesa_free_pipeline_data(struct gl_context *ctx)
  * a non-existent ID.  The spec defines ID 0 as being technically
  * non-existent.
  */
-static inline struct gl_pipeline_object *
-lookup_pipeline_object(struct gl_context *ctx, GLuint id)
+struct gl_pipeline_object *
+_mesa_lookup_pipeline_object(struct gl_context *ctx, GLuint id)
 {
    if (id == 0)
       return NULL;
@@ -225,7 +226,7 @@ _mesa_UseProgramStages(GLuint pipeline, GLbitfield stages, GLuint program)
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   struct gl_pipeline_object *pipe = lookup_pipeline_object(ctx, pipeline);
+   struct gl_pipeline_object *pipe = _mesa_lookup_pipeline_object(ctx, pipeline);
    struct gl_shader_program *shProg = NULL;
    GLbitfield any_valid_stages;
 
@@ -337,7 +338,7 @@ _mesa_ActiveShaderProgram(GLuint pipeline, GLuint program)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct gl_shader_program *shProg = NULL;
-   struct gl_pipeline_object *pipe = lookup_pipeline_object(ctx, pipeline);
+   struct gl_pipeline_object *pipe = _mesa_lookup_pipeline_object(ctx, pipeline);
 
    if (program != 0) {
       shProg = _mesa_lookup_shader_program_err(ctx, program,
@@ -399,7 +400,7 @@ _mesa_BindProgramPipeline(GLuint pipeline)
     */
    if (pipeline) {
       /* non-default pipeline object */
-      newObj = lookup_pipeline_object(ctx, pipeline);
+      newObj = _mesa_lookup_pipeline_object(ctx, pipeline);
       if (!newObj) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glBindProgramPipeline(non-gen name)");
@@ -468,7 +469,7 @@ _mesa_DeleteProgramPipelines(GLsizei n, const GLuint *pipelines)
 
    for (i = 0; i < n; i++) {
       struct gl_pipeline_object *obj =
-         lookup_pipeline_object(ctx, pipelines[i]);
+         _mesa_lookup_pipeline_object(ctx, pipelines[i]);
 
       if (obj) {
          assert(obj->Name == pipelines[i]);
@@ -568,7 +569,7 @@ _mesa_IsProgramPipeline(GLuint pipeline)
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   struct gl_pipeline_object *obj = lookup_pipeline_object(ctx, pipeline);
+   struct gl_pipeline_object *obj = _mesa_lookup_pipeline_object(ctx, pipeline);
    if (obj == NULL)
       return GL_FALSE;
 
@@ -582,7 +583,7 @@ void GLAPIENTRY
 _mesa_GetProgramPipelineiv(GLuint pipeline, GLenum pname, GLint *params)
 {
    GET_CURRENT_CONTEXT(ctx);
-   struct gl_pipeline_object *pipe = lookup_pipeline_object(ctx, pipeline);
+   struct gl_pipeline_object *pipe = _mesa_lookup_pipeline_object(ctx, pipeline);
 
    /* Are geometry shaders available in this context?
     */
@@ -673,6 +674,38 @@ program_stages_all_active(struct gl_pipeline_object *pipe,
    return status;
 }
 
+static bool
+program_stages_interleaved_illegally(const struct gl_pipeline_object *pipe)
+{
+   struct gl_shader_program *prev = NULL;
+   unsigned i, j;
+
+   /* Look for programs bound to stages: A -> B -> A, with any intervening
+    * sequence of unrelated programs or empty stages.
+    */
+   for (i = 0; i < MESA_SHADER_STAGES; i++) {
+      struct gl_shader_program *cur = pipe->CurrentProgram[i];
+
+      /* Empty stages anywhere in the pipe are OK */
+      if (!cur || cur == prev)
+         continue;
+
+      if (prev) {
+         /* We've seen an A -> B transition; look at the rest of the pipe
+          * to see if we ever see A again.
+          */
+         for (j = i + 1; j < MESA_SHADER_STAGES; j++) {
+            if (pipe->CurrentProgram[j] == prev)
+               return true;
+         }
+      }
+
+      prev = cur;
+   }
+
+   return false;
+}
+
 extern GLboolean
 _mesa_validate_program_pipeline(struct gl_context* ctx,
                                 struct gl_pipeline_object *pipe,
@@ -721,24 +754,13 @@ _mesa_validate_program_pipeline(struct gl_context* ctx,
     *         - One program object is active for at least two shader stages
     *           and a second program is active for a shader stage between two
     *           stages for which the first program was active."
-    *
-    * Without Tesselation, the only case where this can occur is the geometry
-    * shader between the fragment shader and vertex shader.
     */
-   if (pipe->CurrentProgram[MESA_SHADER_GEOMETRY]
-       && pipe->CurrentProgram[MESA_SHADER_FRAGMENT]
-       && pipe->CurrentProgram[MESA_SHADER_VERTEX]) {
-      if (pipe->CurrentProgram[MESA_SHADER_VERTEX]->Name == pipe->CurrentProgram[MESA_SHADER_FRAGMENT]->Name &&
-          pipe->CurrentProgram[MESA_SHADER_GEOMETRY]->Name != pipe->CurrentProgram[MESA_SHADER_VERTEX]->Name) {
-         pipe->InfoLog =
-            ralloc_asprintf(pipe,
-                            "Program %d is active for geometry stage between "
-                            "two stages for which another program %d is "
-                            "active",
-                            pipe->CurrentProgram[MESA_SHADER_GEOMETRY]->Name,
-                            pipe->CurrentProgram[MESA_SHADER_VERTEX]->Name);
-         goto err;
-      }
+   if (program_stages_interleaved_illegally(pipe)) {
+      pipe->InfoLog =
+         ralloc_strdup(pipe,
+                       "Program is active for multiple shader stages with an "
+                       "intervening stage provided by another program");
+      goto err;
    }
 
    /* Section 2.11.11 (Shader Execution), subheading "Validation," of the
@@ -820,7 +842,7 @@ _mesa_ValidateProgramPipeline(GLuint pipeline)
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   struct gl_pipeline_object *pipe = lookup_pipeline_object(ctx, pipeline);
+   struct gl_pipeline_object *pipe = _mesa_lookup_pipeline_object(ctx, pipeline);
 
    if (!pipe) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
@@ -838,7 +860,7 @@ _mesa_GetProgramPipelineInfoLog(GLuint pipeline, GLsizei bufSize,
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   struct gl_pipeline_object *pipe = lookup_pipeline_object(ctx, pipeline);
+   struct gl_pipeline_object *pipe = _mesa_lookup_pipeline_object(ctx, pipeline);
 
    if (!pipe) {
       _mesa_error(ctx, GL_INVALID_VALUE,
diff --git a/src/mesa/main/pipelineobj.h b/src/mesa/main/pipelineobj.h
index b57bcb99e5c..6dee775ab5e 100644
--- a/src/mesa/main/pipelineobj.h
+++ b/src/mesa/main/pipelineobj.h
@@ -45,6 +45,9 @@ _mesa_init_pipeline(struct gl_context *ctx);
 extern void
 _mesa_free_pipeline_data(struct gl_context *ctx);
 
+extern struct gl_pipeline_object *
+_mesa_lookup_pipeline_object(struct gl_context *ctx, GLuint id);
+
 extern void
 _mesa_reference_pipeline_object_(struct gl_context *ctx,
                                  struct gl_pipeline_object **ptr,
diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c
index b15a13210c0..d857b84e60d 100644
--- a/src/mesa/main/program_resource.c
+++ b/src/mesa/main/program_resource.c
@@ -220,12 +220,12 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface,
    case GL_PROGRAM_INPUT:
    case GL_PROGRAM_OUTPUT:
    case GL_UNIFORM:
-   case GL_UNIFORM_BLOCK:
    case GL_TRANSFORM_FEEDBACK_VARYING:
-      /* Validate name syntax for arrays. */
+      /* Validate name syntax for array variables */
       if (!valid_program_resource_index_name(name))
          return GL_INVALID_INDEX;
-
+      /* fall-through */
+   case GL_UNIFORM_BLOCK:
       res = _mesa_program_resource_find_name(shProg, programInterface, name);
       if (!res)
          return GL_INVALID_INDEX;
diff --git a/src/mesa/main/readpix.c b/src/mesa/main/readpix.c
index ed0104c9e46..a3357cd6419 100644
--- a/src/mesa/main/readpix.c
+++ b/src/mesa/main/readpix.c
@@ -46,15 +46,18 @@
 /**
  * Return true if the conversion L=R+G+B is needed.
  */
-static GLboolean
-need_rgb_to_luminance_conversion(mesa_format texFormat, GLenum format)
+GLboolean
+_mesa_need_rgb_to_luminance_conversion(mesa_format texFormat, GLenum format)
 {
    GLenum baseTexFormat = _mesa_get_format_base_format(texFormat);
 
    return (baseTexFormat == GL_RG ||
            baseTexFormat == GL_RGB ||
            baseTexFormat == GL_RGBA) &&
-          (format == GL_LUMINANCE || format == GL_LUMINANCE_ALPHA);
+          (format == GL_LUMINANCE ||
+           format == GL_LUMINANCE_ALPHA ||
+           format == GL_LUMINANCE_INTEGER_EXT ||
+           format == GL_LUMINANCE_ALPHA_INTEGER_EXT);
 }
 
 
@@ -83,7 +86,7 @@ get_readpixels_transfer_ops(const struct gl_context *ctx, mesa_format texFormat,
    if (uses_blit) {
       /* For blit-based ReadPixels packing, the clamping is done automatically
        * unless the type is float. */
-      if (_mesa_get_clamp_read_color(ctx) &&
+      if (_mesa_get_clamp_read_color(ctx, ctx->ReadBuffer) &&
           (type == GL_FLOAT || type == GL_HALF_FLOAT)) {
          transferOps |= IMAGE_CLAMP_BIT;
       }
@@ -91,7 +94,7 @@ get_readpixels_transfer_ops(const struct gl_context *ctx, mesa_format texFormat,
    else {
       /* For CPU-based ReadPixels packing, the clamping must always be done
        * for non-float types, */
-      if (_mesa_get_clamp_read_color(ctx) ||
+      if (_mesa_get_clamp_read_color(ctx, ctx->ReadBuffer) ||
           (type != GL_FLOAT && type != GL_HALF_FLOAT)) {
          transferOps |= IMAGE_CLAMP_BIT;
       }
@@ -102,7 +105,7 @@ get_readpixels_transfer_ops(const struct gl_context *ctx, mesa_format texFormat,
     * have any effect anyway.
     */
    if (_mesa_get_format_datatype(texFormat) == GL_UNSIGNED_NORMALIZED &&
-       !need_rgb_to_luminance_conversion(texFormat, format)) {
+       !_mesa_need_rgb_to_luminance_conversion(texFormat, format)) {
       transferOps &= ~IMAGE_CLAMP_BIT;
    }
 
@@ -146,7 +149,7 @@ _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format,
 
    default:
       /* Color formats. */
-      if (need_rgb_to_luminance_conversion(rb->Format, format)) {
+      if (_mesa_need_rgb_to_luminance_conversion(rb->Format, format)) {
          return GL_TRUE;
       }
 
@@ -418,7 +421,7 @@ read_rgba_pixels( struct gl_context *ctx,
                   const struct gl_pixelstore_attrib *packing )
 {
    GLbitfield transferOps;
-   bool dst_is_integer, dst_is_luminance, needs_rebase;
+   bool dst_is_integer, convert_rgb_to_lum, needs_rebase;
    int dst_stride, src_stride, rb_stride;
    uint32_t dst_format, src_format;
    GLubyte *dst, *map;
@@ -439,10 +442,8 @@ read_rgba_pixels( struct gl_context *ctx,
    dst_is_integer = _mesa_is_enum_format_integer(format);
    dst_stride = _mesa_image_row_stride(packing, width, format, type);
    dst_format = _mesa_format_from_format_and_type(format, type);
-   dst_is_luminance = format == GL_LUMINANCE ||
-                      format == GL_LUMINANCE_ALPHA ||
-                      format == GL_LUMINANCE_INTEGER_EXT ||
-                      format == GL_LUMINANCE_ALPHA_INTEGER_EXT;
+   convert_rgb_to_lum =
+      _mesa_need_rgb_to_luminance_conversion(rb->Format, format);
    dst = (GLubyte *) _mesa_image_address2d(packing, pixels, width, height,
                                            format, type, 0, 0);
 
@@ -490,7 +491,7 @@ read_rgba_pixels( struct gl_context *ctx,
     */
    assert(!transferOps || (transferOps && !dst_is_integer));
 
-   needs_rgba = transferOps || dst_is_luminance;
+   needs_rgba = transferOps || convert_rgb_to_lum;
    rgba = NULL;
    if (needs_rgba) {
       uint32_t rgba_format;
@@ -563,7 +564,7 @@ read_rgba_pixels( struct gl_context *ctx,
     * If the dst format is Luminance, we need to do the conversion by computing
     * L=R+G+B values.
     */
-   if (!dst_is_luminance) {
+   if (!convert_rgb_to_lum) {
       _mesa_format_convert(dst, dst_format, dst_stride,
                            src, src_format, src_stride,
                            width, height,
diff --git a/src/mesa/main/readpix.h b/src/mesa/main/readpix.h
index 4bb35e17e4d..1636dd9ce3e 100644
--- a/src/mesa/main/readpix.h
+++ b/src/mesa/main/readpix.h
@@ -37,6 +37,9 @@ extern GLboolean
 _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format,
                                  GLenum type, GLboolean uses_blit);
 
+extern GLboolean
+_mesa_need_rgb_to_luminance_conversion(mesa_format texFormat, GLenum format);
+
 extern void
 _mesa_readpixels(struct gl_context *ctx,
                  GLint x, GLint y, GLsizei width, GLsizei height,
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index 6e46553724b..a6246a39aad 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -28,6 +28,7 @@
  * \author Ian Romanick <[email protected]>
  */
 
+#include "main/context.h"
 #include "main/core.h"
 #include "glsl_symbol_table.h"
 #include "ir.h"
@@ -478,12 +479,20 @@ _mesa_GetFragDataLocation(GLuint program, const GLchar *name)
 const char*
 _mesa_program_resource_name(struct gl_program_resource *res)
 {
+   const ir_variable *var;
    switch (res->Type) {
    case GL_UNIFORM_BLOCK:
       return RESOURCE_UBO(res)->Name;
    case GL_TRANSFORM_FEEDBACK_VARYING:
       return RESOURCE_XFB(res)->Name;
    case GL_PROGRAM_INPUT:
+      var = RESOURCE_VAR(res);
+      /* Special case gl_VertexIDMESA -> gl_VertexID. */
+      if (var->data.mode == ir_var_system_value &&
+          var->data.location == SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) {
+         return "gl_VertexID";
+      }
+   /* fallthrough */
    case GL_PROGRAM_OUTPUT:
       return RESOURCE_VAR(res)->name;
    case GL_UNIFORM:
@@ -538,6 +547,17 @@ struct gl_program_resource *
 _mesa_program_resource_find_name(struct gl_shader_program *shProg,
                                  GLenum programInterface, const char *name)
 {
+   GET_CURRENT_CONTEXT(ctx);
+   const char *full_name = name;
+
+   /* When context has 'VertexID_is_zero_based' set, gl_VertexID has been
+    * lowered to gl_VertexIDMESA.
+    */
+   if (name && ctx->Const.VertexID_is_zero_based) {
+      if (strcmp(name, "gl_VertexID") == 0)
+         full_name = "gl_VertexIDMESA";
+   }
+
    struct gl_program_resource *res = shProg->ProgramResourceList;
    for (unsigned i = 0; i < shProg->NumProgramResourceList; i++, res++) {
       if (res->Type != programInterface)
@@ -562,7 +582,7 @@ _mesa_program_resource_find_name(struct gl_shader_program *shProg,
          break;
       case GL_PROGRAM_INPUT:
       case GL_PROGRAM_OUTPUT:
-         if (array_index_of_resource(res, name) >= 0)
+         if (array_index_of_resource(res, full_name) >= 0)
             return res;
          break;
       default:
@@ -727,6 +747,10 @@ program_resource_location(struct gl_shader_program *shProg,
          return -1;
    }
 
+   /* Built-in locations should report GL_INVALID_INDEX. */
+   if (is_gl_identifier(name))
+      return GL_INVALID_INDEX;
+
    /* VERT_ATTRIB_GENERIC0 and FRAG_RESULT_DATA0 are decremented as these
     * offsets are used internally to differentiate between built-in attributes
     * and user-defined attributes.
@@ -986,8 +1010,9 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
    case GL_ACTIVE_VARIABLES:
       return get_buffer_property(shProg, res, prop, val, caller);
    case GL_REFERENCED_BY_COMPUTE_SHADER:
-      if (!ctx->Extensions.ARB_compute_shader)
+      if (!_mesa_has_compute_shaders(ctx))
          goto invalid_enum;
+      /* fallthrough */
    case GL_REFERENCED_BY_VERTEX_SHADER:
    case GL_REFERENCED_BY_GEOMETRY_SHADER:
    case GL_REFERENCED_BY_FRAGMENT_SHADER:
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index a04b28711f7..a4296adf799 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -532,7 +532,7 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname,
    /* True if geometry shaders (of the form that was adopted into GLSL 1.50
     * and GL 3.2) are available in this context
     */
-   const bool has_core_gs = _mesa_is_desktop_gl(ctx) && ctx->Version >= 32;
+   const bool has_core_gs = _mesa_has_geometry_shaders(ctx);
 
    /* Are uniform buffer objects available in this context?
     */
@@ -569,13 +569,13 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname,
       *params = _mesa_longest_attribute_name_length(shProg);
       return;
    case GL_ACTIVE_UNIFORMS:
-      *params = shProg->NumUserUniformStorage - shProg->NumHiddenUniforms;
+      *params = shProg->NumUniformStorage - shProg->NumHiddenUniforms;
       return;
    case GL_ACTIVE_UNIFORM_MAX_LENGTH: {
       unsigned i;
       GLint max_len = 0;
       const unsigned num_uniforms =
-         shProg->NumUserUniformStorage - shProg->NumHiddenUniforms;
+         shProg->NumUniformStorage - shProg->NumHiddenUniforms;
 
       for (i = 0; i < num_uniforms; i++) {
 	 /* Add one for the terminating NUL character for a non-array, and
diff --git a/src/mesa/main/shaderobj.c b/src/mesa/main/shaderobj.c
index e428960362d..110a18e1e2c 100644
--- a/src/mesa/main/shaderobj.c
+++ b/src/mesa/main/shaderobj.c
@@ -282,10 +282,10 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
    unsigned i;
 
    if (shProg->UniformStorage) {
-      for (i = 0; i < shProg->NumUserUniformStorage; ++i)
+      for (i = 0; i < shProg->NumUniformStorage; ++i)
          _mesa_uniform_detach_all_driver_storage(&shProg->UniformStorage[i]);
       ralloc_free(shProg->UniformStorage);
-      shProg->NumUserUniformStorage = 0;
+      shProg->NumUniformStorage = 0;
       shProg->UniformStorage = NULL;
    }
 
diff --git a/src/mesa/main/shared.c b/src/mesa/main/shared.c
index 0b76cc01218..d5ac9f1fb13 100644
--- a/src/mesa/main/shared.c
+++ b/src/mesa/main/shared.c
@@ -313,7 +313,6 @@ free_shared_state(struct gl_context *ctx, struct gl_shared_state *shared)
    _mesa_DeleteHashTable(shared->Programs);
 
    _mesa_reference_vertprog(ctx, &shared->DefaultVertexProgram, NULL);
-   _mesa_reference_geomprog(ctx, &shared->DefaultGeometryProgram, NULL);
    _mesa_reference_fragprog(ctx, &shared->DefaultFragmentProgram, NULL);
 
    _mesa_HashDeleteAll(shared->ATIShaders, delete_fragshader_cb, ctx);
diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index 99db37bafd7..bede7fe1d0e 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -225,7 +225,7 @@ update_program(struct gl_context *ctx)
    if (ctx->GeometryProgram._Current != prevGP) {
       new_state |= _NEW_PROGRAM;
       if (ctx->Driver.BindProgram) {
-         ctx->Driver.BindProgram(ctx, MESA_GEOMETRY_PROGRAM,
+         ctx->Driver.BindProgram(ctx, GL_GEOMETRY_PROGRAM_NV,
                             (struct gl_program *) ctx->GeometryProgram._Current);
       }
    }
@@ -266,15 +266,9 @@ update_program_constants(struct gl_context *ctx)
       }
    }
 
-   if (ctx->GeometryProgram._Current) {
-      const struct gl_program_parameter_list *params =
-         ctx->GeometryProgram._Current->Base.Parameters;
-      /*FIXME: StateFlags is always 0 because we have unnamed constant
-       *       not state changes */
-      if (params /*&& params->StateFlags & ctx->NewState*/) {
-         new_state |= _NEW_PROGRAM_CONSTANTS;
-      }
-   }
+   /* Don't handle geometry shaders here. They don't use any state
+    * constants.
+    */
 
    if (ctx->VertexProgram._Current) {
       const struct gl_program_parameter_list *params =
@@ -389,10 +383,10 @@ _mesa_update_state_locked( struct gl_context *ctx )
       update_frontbit( ctx );
 
    if (new_state & _NEW_BUFFERS)
-      _mesa_update_framebuffer(ctx);
+      _mesa_update_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer);
 
    if (new_state & (_NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT))
-      _mesa_update_draw_buffer_bounds( ctx );
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
 
    if (new_state & _NEW_LIGHT)
       _mesa_update_lighting( ctx );
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index ccd0124a2bb..800720b798e 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -68,10 +68,13 @@ struct function {
    int offset;
 };
 
+extern const struct function common_desktop_functions_possible[];
+extern const struct function gl_compatibility_functions_possible[];
 extern const struct function gl_core_functions_possible[];
 extern const struct function gles11_functions_possible[];
 extern const struct function gles2_functions_possible[];
 extern const struct function gles3_functions_possible[];
+extern const struct function gles31_functions_possible[];
 
 class DispatchSanity_test : public ::testing::Test {
 public:
@@ -96,7 +99,7 @@ DispatchSanity_test::SetUp()
    _mesa_init_driver_functions(&driver_functions);
 
    const unsigned size = _glapi_get_dispatch_table_size();
-   nop_table = (_glapi_proc *) _glapi_new_nop_table(size);
+   nop_table = (_glapi_proc *) _mesa_new_nop_table(size);
 }
 
 void
@@ -175,10 +178,19 @@ validate_nops(struct gl_context *ctx, const _glapi_proc *nop_table)
 TEST_F(DispatchSanity_test, GL31_CORE)
 {
    SetUpCtx(API_OPENGL_CORE, 31);
+   validate_functions(&ctx, common_desktop_functions_possible, nop_table);
    validate_functions(&ctx, gl_core_functions_possible, nop_table);
    validate_nops(&ctx, nop_table);
 }
 
+TEST_F(DispatchSanity_test, GL30)
+{
+   SetUpCtx(API_OPENGL_COMPAT, 30);
+   validate_functions(&ctx, common_desktop_functions_possible, nop_table);
+   validate_functions(&ctx, gl_compatibility_functions_possible, nop_table);
+   validate_nops(&ctx, nop_table);
+}
+
 TEST_F(DispatchSanity_test, GLES11)
 {
    SetUpCtx(API_OPENGLES, 11);
@@ -201,7 +213,16 @@ TEST_F(DispatchSanity_test, GLES3)
    validate_nops(&ctx, nop_table);
 }
 
-const struct function gl_core_functions_possible[] = {
+TEST_F(DispatchSanity_test, GLES31)
+{
+   SetUpCtx(API_OPENGLES2, 31);
+   validate_functions(&ctx, gles2_functions_possible, nop_table);
+   validate_functions(&ctx, gles3_functions_possible, nop_table);
+   validate_functions(&ctx, gles31_functions_possible, nop_table);
+   validate_nops(&ctx, nop_table);
+}
+
+const struct function common_desktop_functions_possible[] = {
    { "glCullFace", 10, -1 },
    { "glFrontFace", 10, -1 },
    { "glHint", 10, -1 },
@@ -213,8 +234,8 @@ const struct function gl_core_functions_possible[] = {
    { "glTexParameterfv", 10, -1 },
    { "glTexParameteri", 10, -1 },
    { "glTexParameteriv", 10, -1 },
-   { "glTexImage1D", 10, -1 },
-   { "glTexImage2D", 10, -1 },
+   { "glTexImage1D", 10, _gloffset_TexImage1D },
+   { "glTexImage2D", 10, _gloffset_TexImage2D },
    { "glDrawBuffer", 10, -1 },
    { "glClear", 10, -1 },
    { "glClearColor", 10, -1 },
@@ -482,7 +503,6 @@ const struct function gl_core_functions_possible[] = {
    /* GL 3.1 */
    { "glDrawArraysInstanced", 31, -1 },
    { "glDrawElementsInstanced", 31, -1 },
-   { "glTexBuffer", 31, -1 },
    { "glPrimitiveRestartIndex", 31, -1 },
 
    /* GL_ARB_shader_objects */
@@ -535,12 +555,8 @@ const struct function gl_core_functions_possible[] = {
    { "glGetInteger64i_v", 32, -1 },
    { "glGetBufferParameteri64v", 32, -1 },
    { "glFramebufferTexture", 32, -1 },
-
-   /* GL_ARB_geometry_shader4 */
-   { "glProgramParameteriARB", 32, -1 },
-   { "glFramebufferTextureARB", 32, -1 },
-   { "glFramebufferTextureLayerARB", 32, -1 },
-   { "glFramebufferTextureFaceARB", 32, -1 },
+   { "glProgramParameteri", 32, -1 },
+   { "glFramebufferTextureLayer", 32, -1 },
 
    /* GL 3.3 */
    { "glVertexAttribDivisor", 33, -1 },
@@ -673,34 +689,6 @@ const struct function gl_core_functions_possible[] = {
    { "glVertexAttribP4uiv", 43, -1 },
    { "glDrawArraysIndirect", 43, -1 },
    { "glDrawElementsIndirect", 43, -1 },
-   { "glUniform1d", 40, -1 },
-   { "glUniform2d", 40, -1 },
-   { "glUniform3d", 40, -1 },
-   { "glUniform4d", 40, -1 },
-   { "glUniform1dv", 40, -1 },
-   { "glUniform2dv", 40, -1 },
-   { "glUniform3dv", 40, -1 },
-   { "glUniform4dv", 40, -1 },
-   { "glUniformMatrix2dv", 40, -1 },
-   { "glUniformMatrix3dv", 40, -1 },
-   { "glUniformMatrix4dv", 40, -1 },
-   { "glUniformMatrix2x3dv", 40, -1 },
-   { "glUniformMatrix2x4dv", 40, -1 },
-   { "glUniformMatrix3x2dv", 40, -1 },
-   { "glUniformMatrix3x4dv", 40, -1 },
-   { "glUniformMatrix4x2dv", 40, -1 },
-   { "glUniformMatrix4x3dv", 40, -1 },
-   { "glGetUniformdv", 43, -1 },
-// { "glGetSubroutineUniformLocation", 43, -1 },        // XXX: Add to xml
-// { "glGetSubroutineIndex", 43, -1 },                  // XXX: Add to xml
-// { "glGetActiveSubroutineUniformiv", 43, -1 },        // XXX: Add to xml
-// { "glGetActiveSubroutineUniformName", 43, -1 },      // XXX: Add to xml
-// { "glGetActiveSubroutineName", 43, -1 },             // XXX: Add to xml
-// { "glUniformSubroutinesuiv", 43, -1 },               // XXX: Add to xml
-// { "glGetUniformSubroutineuiv", 43, -1 },             // XXX: Add to xml
-// { "glGetProgramStageiv", 43, -1 },                   // XXX: Add to xml
-// { "glPatchParameteri", 43, -1 },                     // XXX: Add to xml
-// { "glPatchParameterfv", 43, -1 },                    // XXX: Add to xml
    { "glBindTransformFeedback", 43, -1 },
    { "glDeleteTransformFeedbacks", 43, -1 },
    { "glGenTransformFeedbacks", 43, -1 },
@@ -728,12 +716,12 @@ const struct function gl_core_functions_possible[] = {
    { "glGenProgramPipelines", 43, -1 },
    { "glIsProgramPipeline", 43, -1 },
    { "glGetProgramPipelineiv", 43, -1 },
+   { "glProgramUniform1d", 43, -1 },
+   { "glProgramUniform1dv", 43, -1 },
    { "glProgramUniform1i", 43, -1 },
    { "glProgramUniform1iv", 43, -1 },
    { "glProgramUniform1f", 43, -1 },
    { "glProgramUniform1fv", 43, -1 },
-   { "glProgramUniform1d", 40, -1 },
-   { "glProgramUniform1dv", 40, -1 },
    { "glProgramUniform1ui", 43, -1 },
    { "glProgramUniform1uiv", 43, -1 },
    { "glProgramUniform2i", 43, -1 },
@@ -754,50 +742,32 @@ const struct function gl_core_functions_possible[] = {
    { "glProgramUniform3uiv", 43, -1 },
    { "glProgramUniform4i", 43, -1 },
    { "glProgramUniform4iv", 43, -1 },
+   { "glProgramUniform4d", 43, -1 },
+   { "glProgramUniform4dv", 43, -1 },
    { "glProgramUniform4f", 43, -1 },
    { "glProgramUniform4fv", 43, -1 },
-   { "glProgramUniform4d", 40, -1 },
-   { "glProgramUniform4dv", 40, -1 },
    { "glProgramUniform4ui", 43, -1 },
    { "glProgramUniform4uiv", 43, -1 },
+   { "glProgramUniformMatrix2dv", 43, -1 },
    { "glProgramUniformMatrix2fv", 43, -1 },
+   { "glProgramUniformMatrix3dv", 43, -1 },
    { "glProgramUniformMatrix3fv", 43, -1 },
+   { "glProgramUniformMatrix4dv", 43, -1 },
    { "glProgramUniformMatrix4fv", 43, -1 },
-   { "glProgramUniformMatrix2dv", 40, -1 },
-   { "glProgramUniformMatrix3dv", 40, -1 },
-   { "glProgramUniformMatrix4dv", 40, -1 },
+   { "glProgramUniformMatrix2x3dv", 43, -1 },
    { "glProgramUniformMatrix2x3fv", 43, -1 },
+   { "glProgramUniformMatrix3x2dv", 43, -1 },
    { "glProgramUniformMatrix3x2fv", 43, -1 },
+   { "glProgramUniformMatrix2x4dv", 43, -1 },
    { "glProgramUniformMatrix2x4fv", 43, -1 },
+   { "glProgramUniformMatrix4x2dv", 43, -1 },
    { "glProgramUniformMatrix4x2fv", 43, -1 },
+   { "glProgramUniformMatrix3x4dv", 43, -1 },
    { "glProgramUniformMatrix3x4fv", 43, -1 },
+   { "glProgramUniformMatrix4x3dv", 43, -1 },
    { "glProgramUniformMatrix4x3fv", 43, -1 },
-   { "glProgramUniformMatrix2x3dv", 40, -1 },
-   { "glProgramUniformMatrix3x2dv", 40, -1 },
-   { "glProgramUniformMatrix2x4dv", 40, -1 },
-   { "glProgramUniformMatrix4x2dv", 40, -1 },
-   { "glProgramUniformMatrix3x4dv", 40, -1 },
-   { "glProgramUniformMatrix4x3dv", 40, -1 },
    { "glValidateProgramPipeline", 43, -1 },
    { "glGetProgramPipelineInfoLog", 43, -1 },
-   { "glVertexAttribL1d", 41, -1 },
-   { "glVertexAttribL2d", 41, -1 },
-   { "glVertexAttribL3d", 41, -1 },
-   { "glVertexAttribL4d", 41, -1 },
-   { "glVertexAttribL1dv", 41, -1 },
-   { "glVertexAttribL2dv", 41, -1 },
-   { "glVertexAttribL3dv", 41, -1 },
-   { "glVertexAttribL4dv", 41, -1 },
-   { "glVertexAttribLPointer", 41, -1 },
-   { "glGetVertexAttribLdv", 41, -1 },
-   { "glViewportArrayv", 43, -1 },
-   { "glViewportIndexedf", 43, -1 },
-   { "glViewportIndexedfv", 43, -1 },
-   { "glScissorArrayv", 43, -1 },
-   { "glScissorIndexed", 43, -1 },
-   { "glScissorIndexedv", 43, -1 },
-   { "glDepthRangeArrayv", 43, -1 },
-   { "glDepthRangeIndexed", 43, -1 },
    { "glGetFloati_v", 43, -1 },
    { "glGetDoublei_v", 43, -1 },
 // { "glCreateSyncFromCLeventARB", 43, -1 },            // XXX: Add to xml
@@ -840,8 +810,6 @@ const struct function gl_core_functions_possible[] = {
    { "glClearBufferSubData", 43, -1 },
 // { "glClearNamedBufferDataEXT", 43, -1 },             // XXX: Add to xml
 // { "glClearNamedBufferSubDataEXT", 43, -1 },          // XXX: Add to xml
-   { "glDispatchCompute", 43, -1 },
-   { "glDispatchComputeIndirect", 43, -1 },
    { "glCopyImageSubData", 43, -1 },
    { "glTextureView", 43, -1 },
    { "glBindVertexBuffer", 43, -1 },
@@ -853,11 +821,10 @@ const struct function gl_core_functions_possible[] = {
 // { "glVertexArrayBindVertexBufferEXT", 43, -1 },      // XXX: Add to xml
 // { "glVertexArrayVertexAttribFormatEXT", 43, -1 },    // XXX: Add to xml
 // { "glVertexArrayVertexAttribIFormatEXT", 43, -1 },   // XXX: Add to xml
-// { "glVertexArrayVertexAttribLFormatEXT", 43, -1 },   // XXX: Add to xml
 // { "glVertexArrayVertexAttribBindingEXT", 43, -1 },   // XXX: Add to xml
 // { "glVertexArrayVertexBindingDivisorEXT", 43, -1 },  // XXX: Add to xml
-// { "glFramebufferParameteri", 43, -1 },               // XXX: Add to xml
-// { "glGetFramebufferParameteriv", 43, -1 },           // XXX: Add to xml
+   { "glFramebufferParameteri", 43, -1 },
+   { "glGetFramebufferParameteriv", 43, -1 },
 // { "glNamedFramebufferParameteriEXT", 43, -1 },       // XXX: Add to xml
 // { "glGetNamedFramebufferParameterivEXT", 43, -1 },   // XXX: Add to xml
 // { "glGetInternalformati64v", 43, -1 },               // XXX: Add to xml
@@ -876,7 +843,6 @@ const struct function gl_core_functions_possible[] = {
    { "glGetProgramResourceLocation", 43, -1 },
    { "glGetProgramResourceLocationIndex", 43, -1 },
 // { "glShaderStorageBlockBinding", 43, -1 },           // XXX: Add to xml
-   { "glTexBufferRange", 43, -1 },
 // { "glTextureBufferRangeEXT", 43, -1 },               // XXX: Add to xml
    { "glTexStorage2DMultisample", 43, -1 },
    { "glTexStorage3DMultisample", 43, -1 },
@@ -958,6 +924,814 @@ const struct function gl_core_functions_possible[] = {
    /* GL_ARB_clip_control */
    { "glClipControl", 45, -1 },
 
+   /* GL_ARB_compute_shader */
+   { "glDispatchCompute", 43, -1 },
+   { "glDispatchComputeIndirect", 43, -1 },
+
+   /* GL_EXT_polygon_offset_clamp */
+   { "glPolygonOffsetClampEXT", 11, -1 },
+   { NULL, 0, -1 }
+};
+
+const struct function gl_compatibility_functions_possible[] = {
+   { "glBindVertexArrayAPPLE", 10, -1 },
+   { "glGenVertexArraysAPPLE", 10, -1 },
+   { "glBindRenderbufferEXT", 10, -1 },
+   { "glBindFramebufferEXT", 10, -1 },
+   { "glNewList", 10, _gloffset_NewList },
+   { "glEndList", 10, _gloffset_EndList },
+   { "glCallList", 10, _gloffset_CallList },
+   { "glCallLists", 10, _gloffset_CallLists },
+   { "glDeleteLists", 10, _gloffset_DeleteLists },
+   { "glGenLists", 10, _gloffset_GenLists },
+   { "glListBase", 10, _gloffset_ListBase },
+   { "glBegin", 10, _gloffset_Begin },
+   { "glBitmap", 10, _gloffset_Bitmap },
+   { "glColor3b", 10, _gloffset_Color3b },
+   { "glColor3bv", 10, _gloffset_Color3bv },
+   { "glColor3d", 10, _gloffset_Color3d },
+   { "glColor3dv", 10, _gloffset_Color3dv },
+   { "glColor3f", 10, _gloffset_Color3f },
+   { "glColor3fv", 10, _gloffset_Color3fv },
+   { "glColor3i", 10, _gloffset_Color3i },
+   { "glColor3iv", 10, _gloffset_Color3iv },
+   { "glColor3s", 10, _gloffset_Color3s },
+   { "glColor3sv", 10, _gloffset_Color3sv },
+   { "glColor3ub", 10, _gloffset_Color3ub },
+   { "glColor3ubv", 10, _gloffset_Color3ubv },
+   { "glColor3ui", 10, _gloffset_Color3ui },
+   { "glColor3uiv", 10, _gloffset_Color3uiv },
+   { "glColor3us", 10, _gloffset_Color3us },
+   { "glColor3usv", 10, _gloffset_Color3usv },
+   { "glColor4b", 10, _gloffset_Color4b },
+   { "glColor4bv", 10, _gloffset_Color4bv },
+   { "glColor4d", 10, _gloffset_Color4d },
+   { "glColor4dv", 10, _gloffset_Color4dv },
+   { "glColor4f", 10, _gloffset_Color4f },
+   { "glColor4fv", 10, _gloffset_Color4fv },
+   { "glColor4i", 10, _gloffset_Color4i },
+   { "glColor4iv", 10, _gloffset_Color4iv },
+   { "glColor4s", 10, _gloffset_Color4s },
+   { "glColor4sv", 10, _gloffset_Color4sv },
+   { "glColor4ub", 10, _gloffset_Color4ub },
+   { "glColor4ubv", 10, _gloffset_Color4ubv },
+   { "glColor4ui", 10, _gloffset_Color4ui },
+   { "glColor4uiv", 10, _gloffset_Color4uiv },
+   { "glColor4us", 10, _gloffset_Color4us },
+   { "glColor4usv", 10, _gloffset_Color4usv },
+   { "glEdgeFlag", 10, _gloffset_EdgeFlag },
+   { "glEdgeFlagv", 10, _gloffset_EdgeFlagv },
+   { "glEnd", 10, _gloffset_End },
+   { "glIndexd", 10, _gloffset_Indexd },
+   { "glIndexdv", 10, _gloffset_Indexdv },
+   { "glIndexf", 10, _gloffset_Indexf },
+   { "glIndexfv", 10, _gloffset_Indexfv },
+   { "glIndexi", 10, _gloffset_Indexi },
+   { "glIndexiv", 10, _gloffset_Indexiv },
+   { "glIndexs", 10, _gloffset_Indexs },
+   { "glIndexsv", 10, _gloffset_Indexsv },
+   { "glNormal3b", 10, _gloffset_Normal3b },
+   { "glNormal3bv", 10, _gloffset_Normal3bv },
+   { "glNormal3d", 10, _gloffset_Normal3d },
+   { "glNormal3dv", 10, _gloffset_Normal3dv },
+   { "glNormal3f", 10, _gloffset_Normal3f },
+   { "glNormal3fv", 10, _gloffset_Normal3fv },
+   { "glNormal3i", 10, _gloffset_Normal3i },
+   { "glNormal3iv", 10, _gloffset_Normal3iv },
+   { "glNormal3s", 10, _gloffset_Normal3s },
+   { "glNormal3sv", 10, _gloffset_Normal3sv },
+   { "glRasterPos2d", 10, _gloffset_RasterPos2d },
+   { "glRasterPos2dv", 10, _gloffset_RasterPos2dv },
+   { "glRasterPos2f", 10, _gloffset_RasterPos2f },
+   { "glRasterPos2fv", 10, _gloffset_RasterPos2fv },
+   { "glRasterPos2i", 10, _gloffset_RasterPos2i },
+   { "glRasterPos2iv", 10, _gloffset_RasterPos2iv },
+   { "glRasterPos2s", 10, _gloffset_RasterPos2s },
+   { "glRasterPos2sv", 10, _gloffset_RasterPos2sv },
+   { "glRasterPos3d", 10, _gloffset_RasterPos3d },
+   { "glRasterPos3dv", 10, _gloffset_RasterPos3dv },
+   { "glRasterPos3f", 10, _gloffset_RasterPos3f },
+   { "glRasterPos3fv", 10, _gloffset_RasterPos3fv },
+   { "glRasterPos3i", 10, _gloffset_RasterPos3i },
+   { "glRasterPos3iv", 10, _gloffset_RasterPos3iv },
+   { "glRasterPos3s", 10, _gloffset_RasterPos3s },
+   { "glRasterPos3sv", 10, _gloffset_RasterPos3sv },
+   { "glRasterPos4d", 10, _gloffset_RasterPos4d },
+   { "glRasterPos4dv", 10, _gloffset_RasterPos4dv },
+   { "glRasterPos4f", 10, _gloffset_RasterPos4f },
+   { "glRasterPos4fv", 10, _gloffset_RasterPos4fv },
+   { "glRasterPos4i", 10, _gloffset_RasterPos4i },
+   { "glRasterPos4iv", 10, _gloffset_RasterPos4iv },
+   { "glRasterPos4s", 10, _gloffset_RasterPos4s },
+   { "glRasterPos4sv", 10, _gloffset_RasterPos4sv },
+   { "glRectd", 10, _gloffset_Rectd },
+   { "glRectdv", 10, _gloffset_Rectdv },
+   { "glRectf", 10, _gloffset_Rectf },
+   { "glRectfv", 10, _gloffset_Rectfv },
+   { "glRecti", 10, _gloffset_Recti },
+   { "glRectiv", 10, _gloffset_Rectiv },
+   { "glRects", 10, _gloffset_Rects },
+   { "glRectsv", 10, _gloffset_Rectsv },
+   { "glTexCoord1d", 10, _gloffset_TexCoord1d },
+   { "glTexCoord1dv", 10, _gloffset_TexCoord1dv },
+   { "glTexCoord1f", 10, _gloffset_TexCoord1f },
+   { "glTexCoord1fv", 10, _gloffset_TexCoord1fv },
+   { "glTexCoord1i", 10, _gloffset_TexCoord1i },
+   { "glTexCoord1iv", 10, _gloffset_TexCoord1iv },
+   { "glTexCoord1s", 10, _gloffset_TexCoord1s },
+   { "glTexCoord1sv", 10, _gloffset_TexCoord1sv },
+   { "glTexCoord2d", 10, _gloffset_TexCoord2d },
+   { "glTexCoord2dv", 10, _gloffset_TexCoord2dv },
+   { "glTexCoord2f", 10, _gloffset_TexCoord2f },
+   { "glTexCoord2fv", 10, _gloffset_TexCoord2fv },
+   { "glTexCoord2i", 10, _gloffset_TexCoord2i },
+   { "glTexCoord2iv", 10, _gloffset_TexCoord2iv },
+   { "glTexCoord2s", 10, _gloffset_TexCoord2s },
+   { "glTexCoord2sv", 10, _gloffset_TexCoord2sv },
+   { "glTexCoord3d", 10, _gloffset_TexCoord3d },
+   { "glTexCoord3dv", 10, _gloffset_TexCoord3dv },
+   { "glTexCoord3f", 10, _gloffset_TexCoord3f },
+   { "glTexCoord3fv", 10, _gloffset_TexCoord3fv },
+   { "glTexCoord3i", 10, _gloffset_TexCoord3i },
+   { "glTexCoord3iv", 10, _gloffset_TexCoord3iv },
+   { "glTexCoord3s", 10, _gloffset_TexCoord3s },
+   { "glTexCoord3sv", 10, _gloffset_TexCoord3sv },
+   { "glTexCoord4d", 10, _gloffset_TexCoord4d },
+   { "glTexCoord4dv", 10, _gloffset_TexCoord4dv },
+   { "glTexCoord4f", 10, _gloffset_TexCoord4f },
+   { "glTexCoord4fv", 10, _gloffset_TexCoord4fv },
+   { "glTexCoord4i", 10, _gloffset_TexCoord4i },
+   { "glTexCoord4iv", 10, _gloffset_TexCoord4iv },
+   { "glTexCoord4s", 10, _gloffset_TexCoord4s },
+   { "glTexCoord4sv", 10, _gloffset_TexCoord4sv },
+   { "glVertex2d", 10, _gloffset_Vertex2d },
+   { "glVertex2dv", 10, _gloffset_Vertex2dv },
+   { "glVertex2f", 10, _gloffset_Vertex2f },
+   { "glVertex2fv", 10, _gloffset_Vertex2fv },
+   { "glVertex2i", 10, _gloffset_Vertex2i },
+   { "glVertex2iv", 10, _gloffset_Vertex2iv },
+   { "glVertex2s", 10, _gloffset_Vertex2s },
+   { "glVertex2sv", 10, _gloffset_Vertex2sv },
+   { "glVertex3d", 10, _gloffset_Vertex3d },
+   { "glVertex3dv", 10, _gloffset_Vertex3dv },
+   { "glVertex3f", 10, _gloffset_Vertex3f },
+   { "glVertex3fv", 10, _gloffset_Vertex3fv },
+   { "glVertex3i", 10, _gloffset_Vertex3i },
+   { "glVertex3iv", 10, _gloffset_Vertex3iv },
+   { "glVertex3s", 10, _gloffset_Vertex3s },
+   { "glVertex3sv", 10, _gloffset_Vertex3sv },
+   { "glVertex4d", 10, _gloffset_Vertex4d },
+   { "glVertex4dv", 10, _gloffset_Vertex4dv },
+   { "glVertex4f", 10, _gloffset_Vertex4f },
+   { "glVertex4fv", 10, _gloffset_Vertex4fv },
+   { "glVertex4i", 10, _gloffset_Vertex4i },
+   { "glVertex4iv", 10, _gloffset_Vertex4iv },
+   { "glVertex4s", 10, _gloffset_Vertex4s },
+   { "glVertex4sv", 10, _gloffset_Vertex4sv },
+   { "glClipPlane", 10, _gloffset_ClipPlane },
+   { "glColorMaterial", 10, _gloffset_ColorMaterial },
+   { "glFogf", 10, _gloffset_Fogf },
+   { "glFogfv", 10, _gloffset_Fogfv },
+   { "glFogi", 10, _gloffset_Fogi },
+   { "glFogiv", 10, _gloffset_Fogiv },
+   { "glLightf", 10, _gloffset_Lightf },
+   { "glLightfv", 10, _gloffset_Lightfv },
+   { "glLighti", 10, _gloffset_Lighti },
+   { "glLightiv", 10, _gloffset_Lightiv },
+   { "glLightModelf", 10, _gloffset_LightModelf },
+   { "glLightModelfv", 10, _gloffset_LightModelfv },
+   { "glLightModeli", 10, _gloffset_LightModeli },
+   { "glLightModeliv", 10, _gloffset_LightModeliv },
+   { "glLineStipple", 10, _gloffset_LineStipple },
+   { "glMaterialf", 10, _gloffset_Materialf },
+   { "glMaterialfv", 10, _gloffset_Materialfv },
+   { "glMateriali", 10, _gloffset_Materiali },
+   { "glMaterialiv", 10, _gloffset_Materialiv },
+   { "glPolygonStipple", 10, _gloffset_PolygonStipple },
+   { "glShadeModel", 10, _gloffset_ShadeModel },
+   { "glTexEnvf", 10, _gloffset_TexEnvf },
+   { "glTexEnvfv", 10, _gloffset_TexEnvfv },
+   { "glTexEnvi", 10, _gloffset_TexEnvi },
+   { "glTexEnviv", 10, _gloffset_TexEnviv },
+   { "glTexGend", 10, _gloffset_TexGend },
+   { "glTexGendv", 10, _gloffset_TexGendv },
+   { "glTexGenf", 10, _gloffset_TexGenf },
+   { "glTexGenfv", 10, _gloffset_TexGenfv },
+   { "glTexGeni", 10, _gloffset_TexGeni },
+   { "glTexGeniv", 10, _gloffset_TexGeniv },
+   { "glFeedbackBuffer", 10, _gloffset_FeedbackBuffer },
+   { "glSelectBuffer", 10, _gloffset_SelectBuffer },
+   { "glRenderMode", 10, _gloffset_RenderMode },
+   { "glInitNames", 10, _gloffset_InitNames },
+   { "glLoadName", 10, _gloffset_LoadName },
+   { "glPassThrough", 10, _gloffset_PassThrough },
+   { "glPopName", 10, _gloffset_PopName },
+   { "glPushName", 10, _gloffset_PushName },
+   { "glClearAccum", 10, _gloffset_ClearAccum },
+   { "glClearIndex", 10, _gloffset_ClearIndex },
+   { "glIndexMask", 10, _gloffset_IndexMask },
+   { "glAccum", 10, _gloffset_Accum },
+   { "glPopAttrib", 10, _gloffset_PopAttrib },
+   { "glPushAttrib", 10, _gloffset_PushAttrib },
+   { "glMap1d", 10, _gloffset_Map1d },
+   { "glMap1f", 10, _gloffset_Map1f },
+   { "glMap2d", 10, _gloffset_Map2d },
+   { "glMap2f", 10, _gloffset_Map2f },
+   { "glMapGrid1d", 10, _gloffset_MapGrid1d },
+   { "glMapGrid1f", 10, _gloffset_MapGrid1f },
+   { "glMapGrid2d", 10, _gloffset_MapGrid2d },
+   { "glMapGrid2f", 10, _gloffset_MapGrid2f },
+   { "glEvalCoord1d", 10, _gloffset_EvalCoord1d },
+   { "glEvalCoord1dv", 10, _gloffset_EvalCoord1dv },
+   { "glEvalCoord1f", 10, _gloffset_EvalCoord1f },
+   { "glEvalCoord1fv", 10, _gloffset_EvalCoord1fv },
+   { "glEvalCoord2d", 10, _gloffset_EvalCoord2d },
+   { "glEvalCoord2dv", 10, _gloffset_EvalCoord2dv },
+   { "glEvalCoord2f", 10, _gloffset_EvalCoord2f },
+   { "glEvalCoord2fv", 10, _gloffset_EvalCoord2fv },
+   { "glEvalMesh1", 10, _gloffset_EvalMesh1 },
+   { "glEvalPoint1", 10, _gloffset_EvalPoint1 },
+   { "glEvalMesh2", 10, _gloffset_EvalMesh2 },
+   { "glEvalPoint2", 10, _gloffset_EvalPoint2 },
+   { "glAlphaFunc", 10, _gloffset_AlphaFunc },
+   { "glPixelZoom", 10, _gloffset_PixelZoom },
+   { "glPixelTransferf", 10, _gloffset_PixelTransferf },
+   { "glPixelTransferi", 10, _gloffset_PixelTransferi },
+   { "glPixelMapfv", 10, _gloffset_PixelMapfv },
+   { "glPixelMapuiv", 10, _gloffset_PixelMapuiv },
+   { "glPixelMapusv", 10, _gloffset_PixelMapusv },
+   { "glCopyPixels", 10, _gloffset_CopyPixels },
+   { "glDrawPixels", 10, _gloffset_DrawPixels },
+   { "glGetClipPlane", 10, _gloffset_GetClipPlane },
+   { "glGetLightfv", 10, _gloffset_GetLightfv },
+   { "glGetLightiv", 10, _gloffset_GetLightiv },
+   { "glGetMapdv", 10, _gloffset_GetMapdv },
+   { "glGetMapfv", 10, _gloffset_GetMapfv },
+   { "glGetMapiv", 10, _gloffset_GetMapiv },
+   { "glGetMaterialfv", 10, _gloffset_GetMaterialfv },
+   { "glGetMaterialiv", 10, _gloffset_GetMaterialiv },
+   { "glGetPixelMapfv", 10, _gloffset_GetPixelMapfv },
+   { "glGetPixelMapuiv", 10, _gloffset_GetPixelMapuiv },
+   { "glGetPixelMapusv", 10, _gloffset_GetPixelMapusv },
+   { "glGetPolygonStipple", 10, _gloffset_GetPolygonStipple },
+   { "glGetTexEnvfv", 10, _gloffset_GetTexEnvfv },
+   { "glGetTexEnviv", 10, _gloffset_GetTexEnviv },
+   { "glGetTexGendv", 10, _gloffset_GetTexGendv },
+   { "glGetTexGenfv", 10, _gloffset_GetTexGenfv },
+   { "glGetTexGeniv", 10, _gloffset_GetTexGeniv },
+   { "glIsList", 10, _gloffset_IsList },
+   { "glFrustum", 10, _gloffset_Frustum },
+   { "glLoadIdentity", 10, _gloffset_LoadIdentity },
+   { "glLoadMatrixf", 10, _gloffset_LoadMatrixf },
+   { "glLoadMatrixd", 10, _gloffset_LoadMatrixd },
+   { "glMatrixMode", 10, _gloffset_MatrixMode },
+   { "glMultMatrixf", 10, _gloffset_MultMatrixf },
+   { "glMultMatrixd", 10, _gloffset_MultMatrixd },
+   { "glOrtho", 10, _gloffset_Ortho },
+   { "glPopMatrix", 10, _gloffset_PopMatrix },
+   { "glPushMatrix", 10, _gloffset_PushMatrix },
+   { "glRotated", 10, _gloffset_Rotated },
+   { "glRotatef", 10, _gloffset_Rotatef },
+   { "glScaled", 10, _gloffset_Scaled },
+   { "glScalef", 10, _gloffset_Scalef },
+   { "glTranslated", 10, _gloffset_Translated },
+   { "glTranslatef", 10, _gloffset_Translatef },
+   { "glArrayElement", 10, _gloffset_ArrayElement },
+   { "glColorPointer", 10, _gloffset_ColorPointer },
+   { "glDisableClientState", 10, _gloffset_DisableClientState },
+   { "glEdgeFlagPointer", 10, _gloffset_EdgeFlagPointer },
+   { "glEnableClientState", 10, _gloffset_EnableClientState },
+   { "glIndexPointer", 10, _gloffset_IndexPointer },
+   { "glInterleavedArrays", 10, _gloffset_InterleavedArrays },
+   { "glNormalPointer", 10, _gloffset_NormalPointer },
+   { "glTexCoordPointer", 10, _gloffset_TexCoordPointer },
+   { "glVertexPointer", 10, _gloffset_VertexPointer },
+   { "glAreTexturesResident", 10, _gloffset_AreTexturesResident },
+   { "glPrioritizeTextures", 10, _gloffset_PrioritizeTextures },
+   { "glIndexub", 10, _gloffset_Indexub },
+   { "glIndexubv", 10, _gloffset_Indexubv },
+   { "glPopClientAttrib", 10, _gloffset_PopClientAttrib },
+   { "glPushClientAttrib", 10, _gloffset_PushClientAttrib },
+   { "glColorTable", 10, _gloffset_ColorTable },
+   { "glColorTableParameterfv", 10, _gloffset_ColorTableParameterfv },
+   { "glColorTableParameteriv", 10, _gloffset_ColorTableParameteriv },
+   { "glCopyColorTable", 10, _gloffset_CopyColorTable },
+   { "glGetColorTable", 10, _gloffset_GetColorTable },
+   { "glGetColorTableParameterfv", 10, _gloffset_GetColorTableParameterfv },
+   { "glGetColorTableParameteriv", 10, _gloffset_GetColorTableParameteriv },
+   { "glColorSubTable", 10, _gloffset_ColorSubTable },
+   { "glCopyColorSubTable", 10, _gloffset_CopyColorSubTable },
+   { "glConvolutionFilter1D", 10, _gloffset_ConvolutionFilter1D },
+   { "glConvolutionFilter2D", 10, _gloffset_ConvolutionFilter2D },
+   { "glConvolutionParameterf", 10, _gloffset_ConvolutionParameterf },
+   { "glConvolutionParameterfv", 10, _gloffset_ConvolutionParameterfv },
+   { "glConvolutionParameteri", 10, _gloffset_ConvolutionParameteri },
+   { "glConvolutionParameteriv", 10, _gloffset_ConvolutionParameteriv },
+   { "glCopyConvolutionFilter1D", 10, _gloffset_CopyConvolutionFilter1D },
+   { "glCopyConvolutionFilter2D", 10, _gloffset_CopyConvolutionFilter2D },
+   { "glGetConvolutionFilter", 10, _gloffset_GetConvolutionFilter },
+   { "glGetConvolutionParameterfv", 10, _gloffset_GetConvolutionParameterfv },
+   { "glGetConvolutionParameteriv", 10, _gloffset_GetConvolutionParameteriv },
+   { "glGetSeparableFilter", 10, _gloffset_GetSeparableFilter },
+   { "glSeparableFilter2D", 10, _gloffset_SeparableFilter2D },
+   { "glGetHistogram", 10, _gloffset_GetHistogram },
+   { "glGetHistogramParameterfv", 10, _gloffset_GetHistogramParameterfv },
+   { "glGetHistogramParameteriv", 10, _gloffset_GetHistogramParameteriv },
+   { "glGetMinmax", 10, _gloffset_GetMinmax },
+   { "glGetMinmaxParameterfv", 10, _gloffset_GetMinmaxParameterfv },
+   { "glGetMinmaxParameteriv", 10, _gloffset_GetMinmaxParameteriv },
+   { "glHistogram", 10, _gloffset_Histogram },
+   { "glMinmax", 10, _gloffset_Minmax },
+   { "glResetHistogram", 10, _gloffset_ResetHistogram },
+   { "glResetMinmax", 10, _gloffset_ResetMinmax },
+   { "glClientActiveTexture", 10, _gloffset_ClientActiveTexture },
+   { "glMultiTexCoord1d", 10, _gloffset_MultiTexCoord1d },
+   { "glMultiTexCoord1dv", 10, _gloffset_MultiTexCoord1dv },
+   { "glMultiTexCoord1f", 10, _gloffset_MultiTexCoord1fARB },
+   { "glMultiTexCoord1fv", 10, _gloffset_MultiTexCoord1fvARB },
+   { "glMultiTexCoord1i", 10, _gloffset_MultiTexCoord1i },
+   { "glMultiTexCoord1iv", 10, _gloffset_MultiTexCoord1iv },
+   { "glMultiTexCoord1s", 10, _gloffset_MultiTexCoord1s },
+   { "glMultiTexCoord1sv", 10, _gloffset_MultiTexCoord1sv },
+   { "glMultiTexCoord2d", 10, _gloffset_MultiTexCoord2d },
+   { "glMultiTexCoord2dv", 10, _gloffset_MultiTexCoord2dv },
+   { "glMultiTexCoord2f", 10, _gloffset_MultiTexCoord2fARB },
+   { "glMultiTexCoord2fv", 10, _gloffset_MultiTexCoord2fvARB },
+   { "glMultiTexCoord2i", 10, _gloffset_MultiTexCoord2i },
+   { "glMultiTexCoord2iv", 10, _gloffset_MultiTexCoord2iv },
+   { "glMultiTexCoord2s", 10, _gloffset_MultiTexCoord2s },
+   { "glMultiTexCoord2sv", 10, _gloffset_MultiTexCoord2sv },
+   { "glMultiTexCoord3d", 10, _gloffset_MultiTexCoord3d },
+   { "glMultiTexCoord3dv", 10, _gloffset_MultiTexCoord3dv },
+   { "glMultiTexCoord3f", 10, _gloffset_MultiTexCoord3fARB },
+   { "glMultiTexCoord3fv", 10, _gloffset_MultiTexCoord3fvARB },
+   { "glMultiTexCoord3i", 10, _gloffset_MultiTexCoord3i },
+   { "glMultiTexCoord3iv", 10, _gloffset_MultiTexCoord3iv },
+   { "glMultiTexCoord3s", 10, _gloffset_MultiTexCoord3s },
+   { "glMultiTexCoord3sv", 10, _gloffset_MultiTexCoord3sv },
+   { "glMultiTexCoord4d", 10, _gloffset_MultiTexCoord4d },
+   { "glMultiTexCoord4dv", 10, _gloffset_MultiTexCoord4dv },
+   { "glMultiTexCoord4f", 10, _gloffset_MultiTexCoord4fARB },
+   { "glMultiTexCoord4fv", 10, _gloffset_MultiTexCoord4fvARB },
+   { "glMultiTexCoord4i", 10, _gloffset_MultiTexCoord4i },
+   { "glMultiTexCoord4iv", 10, _gloffset_MultiTexCoord4iv },
+   { "glMultiTexCoord4s", 10, _gloffset_MultiTexCoord4s },
+   { "glMultiTexCoord4sv", 10, _gloffset_MultiTexCoord4sv },
+   { "glLoadTransposeMatrixf", 10, -1 },
+   { "glLoadTransposeMatrixd", 10, -1 },
+   { "glMultTransposeMatrixf", 10, -1 },
+   { "glMultTransposeMatrixd", 10, -1 },
+   { "glFogCoordf", 10, -1 },
+   { "glFogCoordfv", 10, -1 },
+   { "glFogCoordd", 10, -1 },
+   { "glFogCoorddv", 10, -1 },
+   { "glFogCoordPointer", 10, -1 },
+   { "glSecondaryColor3b", 10, -1 },
+   { "glSecondaryColor3bv", 10, -1 },
+   { "glSecondaryColor3d", 10, -1 },
+   { "glSecondaryColor3dv", 10, -1 },
+   { "glSecondaryColor3f", 10, -1 },
+   { "glSecondaryColor3fv", 10, -1 },
+   { "glSecondaryColor3i", 10, -1 },
+   { "glSecondaryColor3iv", 10, -1 },
+   { "glSecondaryColor3s", 10, -1 },
+   { "glSecondaryColor3sv", 10, -1 },
+   { "glSecondaryColor3ub", 10, -1 },
+   { "glSecondaryColor3ubv", 10, -1 },
+   { "glSecondaryColor3ui", 10, -1 },
+   { "glSecondaryColor3uiv", 10, -1 },
+   { "glSecondaryColor3us", 10, -1 },
+   { "glSecondaryColor3usv", 10, -1 },
+   { "glSecondaryColorPointer", 10, -1 },
+   { "glWindowPos2d", 10, -1 },
+   { "glWindowPos2dv", 10, -1 },
+   { "glWindowPos2f", 10, -1 },
+   { "glWindowPos2fv", 10, -1 },
+   { "glWindowPos2i", 10, -1 },
+   { "glWindowPos2iv", 10, -1 },
+   { "glWindowPos2s", 10, -1 },
+   { "glWindowPos2sv", 10, -1 },
+   { "glWindowPos3d", 10, -1 },
+   { "glWindowPos3dv", 10, -1 },
+   { "glWindowPos3f", 10, -1 },
+   { "glWindowPos3fv", 10, -1 },
+   { "glWindowPos3i", 10, -1 },
+   { "glWindowPos3iv", 10, -1 },
+   { "glWindowPos3s", 10, -1 },
+   { "glWindowPos3sv", 10, -1 },
+   { "glProgramStringARB", 10, -1 },
+   { "glProgramEnvParameter4dARB", 10, -1 },
+   { "glProgramEnvParameter4dvARB", 10, -1 },
+   { "glProgramEnvParameter4fARB", 10, -1 },
+   { "glProgramEnvParameter4fvARB", 10, -1 },
+   { "glProgramLocalParameter4dARB", 10, -1 },
+   { "glProgramLocalParameter4dvARB", 10, -1 },
+   { "glProgramLocalParameter4fARB", 10, -1 },
+   { "glProgramLocalParameter4fvARB", 10, -1 },
+   { "glGetProgramEnvParameterdvARB", 10, -1 },
+   { "glGetProgramEnvParameterfvARB", 10, -1 },
+   { "glGetProgramLocalParameterdvARB", 10, -1 },
+   { "glGetProgramLocalParameterfvARB", 10, -1 },
+   { "glGetProgramivARB", 10, -1 },
+   { "glGetProgramStringARB", 10, -1 },
+   { "glPolygonOffsetEXT", 10, -1 },
+   { "glColorPointerEXT", 10, -1 },
+   { "glEdgeFlagPointerEXT", 10, -1 },
+   { "glIndexPointerEXT", 10, -1 },
+   { "glNormalPointerEXT", 10, -1 },
+   { "glTexCoordPointerEXT", 10, -1 },
+   { "glVertexPointerEXT", 10, -1 },
+   { "glLockArraysEXT", 10, -1 },
+   { "glUnlockArraysEXT", 10, -1 },
+   { "glWindowPos4dMESA", 10, -1 },
+   { "glWindowPos4dvMESA", 10, -1 },
+   { "glWindowPos4fMESA", 10, -1 },
+   { "glWindowPos4fvMESA", 10, -1 },
+   { "glWindowPos4iMESA", 10, -1 },
+   { "glWindowPos4ivMESA", 10, -1 },
+   { "glWindowPos4sMESA", 10, -1 },
+   { "glWindowPos4svMESA", 10, -1 },
+   { "glBindProgramNV", 10, -1 },
+   { "glDeleteProgramsNV", 10, -1 },
+   { "glGenProgramsNV", 10, -1 },
+   { "glIsProgramNV", 10, -1 },
+   { "glVertexAttrib1sNV", 10, -1 },
+   { "glVertexAttrib1svNV", 10, -1 },
+   { "glVertexAttrib2sNV", 10, -1 },
+   { "glVertexAttrib2svNV", 10, -1 },
+   { "glVertexAttrib3sNV", 10, -1 },
+   { "glVertexAttrib3svNV", 10, -1 },
+   { "glVertexAttrib4sNV", 10, -1 },
+   { "glVertexAttrib4svNV", 10, -1 },
+   { "glVertexAttrib1fNV", 10, -1 },
+   { "glVertexAttrib1fvNV", 10, -1 },
+   { "glVertexAttrib2fNV", 10, -1 },
+   { "glVertexAttrib2fvNV", 10, -1 },
+   { "glVertexAttrib3fNV", 10, -1 },
+   { "glVertexAttrib3fvNV", 10, -1 },
+   { "glVertexAttrib4fNV", 10, -1 },
+   { "glVertexAttrib4fvNV", 10, -1 },
+   { "glVertexAttrib1dNV", 10, -1 },
+   { "glVertexAttrib1dvNV", 10, -1 },
+   { "glVertexAttrib2dNV", 10, -1 },
+   { "glVertexAttrib2dvNV", 10, -1 },
+   { "glVertexAttrib3dNV", 10, -1 },
+   { "glVertexAttrib3dvNV", 10, -1 },
+   { "glVertexAttrib4dNV", 10, -1 },
+   { "glVertexAttrib4dvNV", 10, -1 },
+   { "glVertexAttrib4ubNV", 10, -1 },
+   { "glVertexAttrib4ubvNV", 10, -1 },
+   { "glVertexAttribs1svNV", 10, -1 },
+   { "glVertexAttribs2svNV", 10, -1 },
+   { "glVertexAttribs3svNV", 10, -1 },
+   { "glVertexAttribs4svNV", 10, -1 },
+   { "glVertexAttribs1fvNV", 10, -1 },
+   { "glVertexAttribs2fvNV", 10, -1 },
+   { "glVertexAttribs3fvNV", 10, -1 },
+   { "glVertexAttribs4fvNV", 10, -1 },
+   { "glVertexAttribs1dvNV", 10, -1 },
+   { "glVertexAttribs2dvNV", 10, -1 },
+   { "glVertexAttribs3dvNV", 10, -1 },
+   { "glVertexAttribs4dvNV", 10, -1 },
+   { "glVertexAttribs4ubvNV", 10, -1 },
+   { "glGenFragmentShadersATI", 10, -1 },
+   { "glBindFragmentShaderATI", 10, -1 },
+   { "glDeleteFragmentShaderATI", 10, -1 },
+   { "glBeginFragmentShaderATI", 10, -1 },
+   { "glEndFragmentShaderATI", 10, -1 },
+   { "glPassTexCoordATI", 10, -1 },
+   { "glSampleMapATI", 10, -1 },
+   { "glColorFragmentOp1ATI", 10, -1 },
+   { "glColorFragmentOp2ATI", 10, -1 },
+   { "glColorFragmentOp3ATI", 10, -1 },
+   { "glAlphaFragmentOp1ATI", 10, -1 },
+   { "glAlphaFragmentOp2ATI", 10, -1 },
+   { "glAlphaFragmentOp3ATI", 10, -1 },
+   { "glSetFragmentShaderConstantATI", 10, -1 },
+   { "glActiveStencilFaceEXT", 10, -1 },
+   { "glStencilFuncSeparateATI", 10, -1 },
+   { "glProgramEnvParameters4fvEXT", 10, -1 },
+   { "glProgramLocalParameters4fvEXT", 10, -1 },
+   { "glPrimitiveRestartNV", 10, -1 },
+
+   { NULL, 0, -1 }
+};
+
+const struct function gl_core_functions_possible[] = {
+   /* GL 3.1 */
+   { "glTexBuffer", 31, -1 },
+
+   /* GL 3.2 */
+   { "glFramebufferTexture", 32, -1 },
+
+   /* GL 4.3 */
+   { "glIsRenderbuffer", 43, -1 },
+   { "glBindRenderbuffer", 43, -1 },
+   { "glDeleteRenderbuffers", 43, -1 },
+   { "glGenRenderbuffers", 43, -1 },
+   { "glRenderbufferStorage", 43, -1 },
+   { "glGetRenderbufferParameteriv", 43, -1 },
+   { "glIsFramebuffer", 43, -1 },
+   { "glBindFramebuffer", 43, -1 },
+   { "glDeleteFramebuffers", 43, -1 },
+   { "glGenFramebuffers", 43, -1 },
+   { "glCheckFramebufferStatus", 43, -1 },
+   { "glFramebufferTexture1D", 43, -1 },
+   { "glFramebufferTexture2D", 43, -1 },
+   { "glFramebufferTexture3D", 43, -1 },
+   { "glFramebufferRenderbuffer", 43, -1 },
+   { "glGetFramebufferAttachmentParameteriv", 43, -1 },
+   { "glGenerateMipmap", 43, -1 },
+   { "glBlitFramebuffer", 43, -1 },
+   { "glRenderbufferStorageMultisample", 43, -1 },
+   { "glFramebufferTextureLayer", 43, -1 },
+   { "glMapBufferRange", 43, -1 },
+   { "glFlushMappedBufferRange", 43, -1 },
+   { "glBindVertexArray", 43, -1 },
+   { "glDeleteVertexArrays", 43, -1 },
+   { "glGenVertexArrays", 43, -1 },
+   { "glIsVertexArray", 43, -1 },
+   { "glGetUniformIndices", 43, -1 },
+   { "glGetActiveUniformsiv", 43, -1 },
+   { "glGetActiveUniformName", 43, -1 },
+   { "glGetUniformBlockIndex", 43, -1 },
+   { "glGetActiveUniformBlockiv", 43, -1 },
+   { "glGetActiveUniformBlockName", 43, -1 },
+   { "glUniformBlockBinding", 43, -1 },
+   { "glCopyBufferSubData", 43, -1 },
+   { "glDrawElementsBaseVertex", 43, -1 },
+   { "glDrawRangeElementsBaseVertex", 43, -1 },
+   { "glDrawElementsInstancedBaseVertex", 43, -1 },
+   { "glMultiDrawElementsBaseVertex", 43, -1 },
+   { "glProvokingVertex", 43, -1 },
+   { "glFenceSync", 43, -1 },
+   { "glIsSync", 43, -1 },
+   { "glDeleteSync", 43, -1 },
+   { "glClientWaitSync", 43, -1 },
+   { "glWaitSync", 43, -1 },
+   { "glGetInteger64v", 43, -1 },
+   { "glGetSynciv", 43, -1 },
+   { "glTexImage2DMultisample", 43, -1 },
+   { "glTexImage3DMultisample", 43, -1 },
+   { "glGetMultisamplefv", 43, -1 },
+   { "glSampleMaski", 43, -1 },
+   { "glBlendEquationiARB", 43, -1 },
+   { "glBlendEquationSeparateiARB", 43, -1 },
+   { "glBlendFunciARB", 43, -1 },
+   { "glBlendFuncSeparateiARB", 43, -1 },
+   { "glMinSampleShadingARB", 43, -1 },                 // XXX: Add to xml
+// { "glNamedStringARB", 43, -1 },                      // XXX: Add to xml
+// { "glDeleteNamedStringARB", 43, -1 },                // XXX: Add to xml
+// { "glCompileShaderIncludeARB", 43, -1 },             // XXX: Add to xml
+// { "glIsNamedStringARB", 43, -1 },                    // XXX: Add to xml
+// { "glGetNamedStringARB", 43, -1 },                   // XXX: Add to xml
+// { "glGetNamedStringivARB", 43, -1 },                 // XXX: Add to xml
+   { "glBindFragDataLocationIndexed", 43, -1 },
+   { "glGetFragDataIndex", 43, -1 },
+   { "glGenSamplers", 43, -1 },
+   { "glDeleteSamplers", 43, -1 },
+   { "glIsSampler", 43, -1 },
+   { "glBindSampler", 43, -1 },
+   { "glSamplerParameteri", 43, -1 },
+   { "glSamplerParameteriv", 43, -1 },
+   { "glSamplerParameterf", 43, -1 },
+   { "glSamplerParameterfv", 43, -1 },
+   { "glSamplerParameterIiv", 43, -1 },
+   { "glSamplerParameterIuiv", 43, -1 },
+   { "glGetSamplerParameteriv", 43, -1 },
+   { "glGetSamplerParameterIiv", 43, -1 },
+   { "glGetSamplerParameterfv", 43, -1 },
+   { "glGetSamplerParameterIuiv", 43, -1 },
+   { "glQueryCounter", 43, -1 },
+   { "glGetQueryObjecti64v", 43, -1 },
+   { "glGetQueryObjectui64v", 43, -1 },
+   { "glVertexP2ui", 43, -1 },
+   { "glVertexP2uiv", 43, -1 },
+   { "glVertexP3ui", 43, -1 },
+   { "glVertexP3uiv", 43, -1 },
+   { "glVertexP4ui", 43, -1 },
+   { "glVertexP4uiv", 43, -1 },
+   { "glTexCoordP1ui", 43, -1 },
+   { "glTexCoordP1uiv", 43, -1 },
+   { "glTexCoordP2ui", 43, -1 },
+   { "glTexCoordP2uiv", 43, -1 },
+   { "glTexCoordP3ui", 43, -1 },
+   { "glTexCoordP3uiv", 43, -1 },
+   { "glTexCoordP4ui", 43, -1 },
+   { "glTexCoordP4uiv", 43, -1 },
+   { "glMultiTexCoordP1ui", 43, -1 },
+   { "glMultiTexCoordP1uiv", 43, -1 },
+   { "glMultiTexCoordP2ui", 43, -1 },
+   { "glMultiTexCoordP2uiv", 43, -1 },
+   { "glMultiTexCoordP3ui", 43, -1 },
+   { "glMultiTexCoordP3uiv", 43, -1 },
+   { "glMultiTexCoordP4ui", 43, -1 },
+   { "glMultiTexCoordP4uiv", 43, -1 },
+   { "glNormalP3ui", 43, -1 },
+   { "glNormalP3uiv", 43, -1 },
+   { "glColorP3ui", 43, -1 },
+   { "glColorP3uiv", 43, -1 },
+   { "glColorP4ui", 43, -1 },
+   { "glColorP4uiv", 43, -1 },
+   { "glVertexAttribP1ui", 43, -1 },
+   { "glVertexAttribP1uiv", 43, -1 },
+   { "glVertexAttribP2ui", 43, -1 },
+   { "glVertexAttribP2uiv", 43, -1 },
+   { "glVertexAttribP3ui", 43, -1 },
+   { "glVertexAttribP3uiv", 43, -1 },
+   { "glVertexAttribP4ui", 43, -1 },
+   { "glVertexAttribP4uiv", 43, -1 },
+   { "glDrawArraysIndirect", 43, -1 },
+   { "glDrawElementsIndirect", 43, -1 },
+
+   { "glUniform1d", 40, -1 },
+   { "glUniform2d", 40, -1 },
+   { "glUniform3d", 40, -1 },
+   { "glUniform4d", 40, -1 },
+   { "glUniform1dv", 40, -1 },
+   { "glUniform2dv", 40, -1 },
+   { "glUniform3dv", 40, -1 },
+   { "glUniform4dv", 40, -1 },
+   { "glUniformMatrix2dv", 40, -1 },
+   { "glUniformMatrix3dv", 40, -1 },
+   { "glUniformMatrix4dv", 40, -1 },
+   { "glUniformMatrix2x3dv", 40, -1 },
+   { "glUniformMatrix2x4dv", 40, -1 },
+   { "glUniformMatrix3x2dv", 40, -1 },
+   { "glUniformMatrix3x4dv", 40, -1 },
+   { "glUniformMatrix4x2dv", 40, -1 },
+   { "glUniformMatrix4x3dv", 40, -1 },
+   { "glGetUniformdv", 43, -1 },
+// { "glGetSubroutineUniformLocation", 43, -1 },        // XXX: Add to xml
+// { "glGetSubroutineIndex", 43, -1 },                  // XXX: Add to xml
+// { "glGetActiveSubroutineUniformiv", 43, -1 },        // XXX: Add to xml
+// { "glGetActiveSubroutineUniformName", 43, -1 },      // XXX: Add to xml
+// { "glGetActiveSubroutineName", 43, -1 },             // XXX: Add to xml
+// { "glUniformSubroutinesuiv", 43, -1 },               // XXX: Add to xml
+// { "glGetUniformSubroutineuiv", 43, -1 },             // XXX: Add to xml
+// { "glGetProgramStageiv", 43, -1 },                   // XXX: Add to xml
+// { "glPatchParameteri", 43, -1 },                     // XXX: Add to xml
+// { "glPatchParameterfv", 43, -1 },                    // XXX: Add to xml
+
+   { "glBindTransformFeedback", 43, -1 },
+   { "glDeleteTransformFeedbacks", 43, -1 },
+   { "glGenTransformFeedbacks", 43, -1 },
+   { "glIsTransformFeedback", 43, -1 },
+   { "glPauseTransformFeedback", 43, -1 },
+   { "glResumeTransformFeedback", 43, -1 },
+   { "glDrawTransformFeedback", 43, -1 },
+   { "glDrawTransformFeedbackStream", 43, -1 },
+   { "glBeginQueryIndexed", 43, -1 },
+   { "glEndQueryIndexed", 43, -1 },
+   { "glGetQueryIndexediv", 43, -1 },
+   { "glReleaseShaderCompiler", 43, -1 },
+   { "glShaderBinary", 43, -1 },
+   { "glGetShaderPrecisionFormat", 43, -1 },
+   { "glDepthRangef", 43, -1 },
+   { "glClearDepthf", 43, -1 },
+   { "glGetProgramBinary", 43, -1 },
+   { "glProgramBinary", 43, -1 },
+   { "glProgramParameteri", 43, -1 },
+   { "glUseProgramStages", 43, -1 },
+   { "glActiveShaderProgram", 43, -1 },
+   { "glCreateShaderProgramv", 43, -1 },
+   { "glBindProgramPipeline", 43, -1 },
+   { "glDeleteProgramPipelines", 43, -1 },
+   { "glGenProgramPipelines", 43, -1 },
+   { "glIsProgramPipeline", 43, -1 },
+   { "glGetProgramPipelineiv", 43, -1 },
+   { "glProgramUniform1i", 43, -1 },
+   { "glProgramUniform1iv", 43, -1 },
+   { "glProgramUniform1f", 43, -1 },
+   { "glProgramUniform1fv", 43, -1 },
+   { "glProgramUniform1d", 40, -1 },
+   { "glProgramUniform1dv", 40, -1 },
+   { "glProgramUniform1ui", 43, -1 },
+   { "glProgramUniform1uiv", 43, -1 },
+   { "glProgramUniform2i", 43, -1 },
+   { "glProgramUniform2iv", 43, -1 },
+   { "glProgramUniform2f", 43, -1 },
+   { "glProgramUniform2fv", 43, -1 },
+   { "glProgramUniform2d", 40, -1 },
+   { "glProgramUniform2dv", 40, -1 },
+   { "glProgramUniform2ui", 43, -1 },
+   { "glProgramUniform2uiv", 43, -1 },
+   { "glProgramUniform3i", 43, -1 },
+   { "glProgramUniform3iv", 43, -1 },
+   { "glProgramUniform3f", 43, -1 },
+   { "glProgramUniform3fv", 43, -1 },
+   { "glProgramUniform3d", 40, -1 },
+   { "glProgramUniform3dv", 40, -1 },
+   { "glProgramUniform3ui", 43, -1 },
+   { "glProgramUniform3uiv", 43, -1 },
+   { "glProgramUniform4i", 43, -1 },
+   { "glProgramUniform4iv", 43, -1 },
+   { "glProgramUniform4f", 43, -1 },
+   { "glProgramUniform4fv", 43, -1 },
+   { "glProgramUniform4d", 40, -1 },
+   { "glProgramUniform4dv", 40, -1 },
+   { "glProgramUniform4ui", 43, -1 },
+   { "glProgramUniform4uiv", 43, -1 },
+   { "glProgramUniformMatrix2fv", 43, -1 },
+   { "glProgramUniformMatrix3fv", 43, -1 },
+   { "glProgramUniformMatrix4fv", 43, -1 },
+   { "glProgramUniformMatrix2dv", 40, -1 },
+   { "glProgramUniformMatrix3dv", 40, -1 },
+   { "glProgramUniformMatrix4dv", 40, -1 },
+   { "glProgramUniformMatrix2x3fv", 43, -1 },
+   { "glProgramUniformMatrix3x2fv", 43, -1 },
+   { "glProgramUniformMatrix2x4fv", 43, -1 },
+   { "glProgramUniformMatrix4x2fv", 43, -1 },
+   { "glProgramUniformMatrix3x4fv", 43, -1 },
+   { "glProgramUniformMatrix4x3fv", 43, -1 },
+   { "glProgramUniformMatrix2x3dv", 40, -1 },
+   { "glProgramUniformMatrix3x2dv", 40, -1 },
+   { "glProgramUniformMatrix2x4dv", 40, -1 },
+   { "glProgramUniformMatrix4x2dv", 40, -1 },
+   { "glProgramUniformMatrix3x4dv", 40, -1 },
+   { "glProgramUniformMatrix4x3dv", 40, -1 },
+   { "glValidateProgramPipeline", 43, -1 },
+   { "glGetProgramPipelineInfoLog", 43, -1 },
+
+   { "glVertexAttribL1d", 41, -1 },
+   { "glVertexAttribL2d", 41, -1 },
+   { "glVertexAttribL3d", 41, -1 },
+   { "glVertexAttribL4d", 41, -1 },
+   { "glVertexAttribL1dv", 41, -1 },
+   { "glVertexAttribL2dv", 41, -1 },
+   { "glVertexAttribL3dv", 41, -1 },
+   { "glVertexAttribL4dv", 41, -1 },
+   { "glVertexAttribLPointer", 41, -1 },
+   { "glGetVertexAttribLdv", 41, -1 },
+   { "glViewportArrayv", 43, -1 },
+   { "glViewportIndexedf", 43, -1 },
+   { "glViewportIndexedfv", 43, -1 },
+   { "glScissorArrayv", 43, -1 },
+   { "glScissorIndexed", 43, -1 },
+   { "glScissorIndexedv", 43, -1 },
+   { "glDepthRangeArrayv", 43, -1 },
+   { "glDepthRangeIndexed", 43, -1 },
+
+// { "glCreateSyncFromCLeventARB", 43, -1 },            // XXX: Add to xml
+
+   { "glDrawArraysInstancedBaseInstance", 43, -1 },
+   { "glDrawElementsInstancedBaseInstance", 43, -1 },
+   { "glDrawElementsInstancedBaseVertexBaseInstance", 43, -1 },
+   { "glDrawTransformFeedbackInstanced", 43, -1 },
+   { "glDrawTransformFeedbackStreamInstanced", 43, -1 },
+   { "glGetActiveAtomicCounterBufferiv", 43, -1 },
+   { "glBindImageTexture", 43, -1 },
+   { "glMemoryBarrier", 43, -1 },
+   { "glTexStorage1D", 43, -1 },
+   { "glTexStorage2D", 43, -1 },
+   { "glTexStorage3D", 43, -1 },
+   { "glTextureStorage1DEXT", 43, -1 },
+   { "glTextureStorage2DEXT", 43, -1 },
+   { "glTextureStorage3DEXT", 43, -1 },
+   { "glClearBufferData", 43, -1 },
+   { "glClearBufferSubData", 43, -1 },
+// { "glClearNamedBufferDataEXT", 43, -1 },             // XXX: Add to xml
+// { "glClearNamedBufferSubDataEXT", 43, -1 },          // XXX: Add to xml
+   { "glCopyImageSubData", 43, -1 },
+   { "glTextureView", 43, -1 },
+   { "glBindVertexBuffer", 43, -1 },
+   { "glVertexAttribFormat", 43, -1 },
+   { "glVertexAttribIFormat", 43, -1 },
+   { "glVertexAttribBinding", 43, -1 },
+   { "glVertexBindingDivisor", 43, -1 },
+// { "glVertexArrayBindVertexBufferEXT", 43, -1 },      // XXX: Add to xml
+// { "glVertexArrayVertexAttribFormatEXT", 43, -1 },    // XXX: Add to xml
+// { "glVertexArrayVertexAttribIFormatEXT", 43, -1 },   // XXX: Add to xml
+// { "glVertexArrayVertexAttribLFormatEXT", 43, -1 },   // XXX: Add to xml
+// { "glVertexArrayVertexAttribBindingEXT", 43, -1 },   // XXX: Add to xml
+// { "glVertexArrayVertexBindingDivisorEXT", 43, -1 },  // XXX: Add to xml
+// { "glFramebufferParameteri", 43, -1 },               // XXX: Add to xml
+// { "glGetFramebufferParameteriv", 43, -1 },           // XXX: Add to xml
+// { "glNamedFramebufferParameteriEXT", 43, -1 },       // XXX: Add to xml
+// { "glGetNamedFramebufferParameterivEXT", 43, -1 },   // XXX: Add to xml
+// { "glGetInternalformati64v", 43, -1 },               // XXX: Add to xml
+   { "glInvalidateTexSubImage", 43, -1 },
+   { "glInvalidateTexImage", 43, -1 },
+   { "glInvalidateBufferSubData", 43, -1 },
+   { "glInvalidateBufferData", 43, -1 },
+   { "glInvalidateFramebuffer", 43, -1 },
+   { "glInvalidateSubFramebuffer", 43, -1 },
+   { "glMultiDrawArraysIndirect", 43, -1 },
+   { "glMultiDrawElementsIndirect", 43, -1 },
+   { "glGetProgramInterfaceiv", 43, -1 },
+   { "glGetProgramResourceIndex", 43, -1 },
+   { "glGetProgramResourceName", 43, -1 },
+   { "glGetProgramResourceiv", 43, -1 },
+   { "glGetProgramResourceLocation", 43, -1 },
+   { "glGetProgramResourceLocationIndex", 43, -1 },
+// { "glShaderStorageBlockBinding", 43, -1 },           // XXX: Add to xml
+   { "glTexBufferRange", 43, -1 },
+// { "glTextureBufferRangeEXT", 43, -1 },               // XXX: Add to xml
+   { "glTexStorage2DMultisample", 43, -1 },
+   { "glTexStorage3DMultisample", 43, -1 },
+// { "glTextureStorage2DMultisampleEXT", 43, -1 },      // XXX: Add to xml
+// { "glTextureStorage3DMultisampleEXT", 43, -1 },      // XXX: Add to xml
+
    /* GL_ARB_direct_state_access */
    { "glCreateTransformFeedbacks", 45, -1 },
    { "glTransformFeedbackBufferBase", 45, -1 },
@@ -980,6 +1754,24 @@ const struct function gl_core_functions_possible[] = {
    { "glGetNamedBufferParameteri64v", 45, -1 },
    { "glGetNamedBufferPointerv", 45, -1 },
    { "glGetNamedBufferSubData", 45, -1 },
+   { "glCreateFramebuffers", 45, -1 },
+   { "glNamedFramebufferRenderbuffer", 45, -1 },
+   { "glNamedFramebufferParameteri", 45, -1 },
+   { "glNamedFramebufferTexture", 45, -1 },
+   { "glNamedFramebufferTextureLayer", 45, -1 },
+   { "glNamedFramebufferDrawBuffer", 45, -1 },
+   { "glNamedFramebufferDrawBuffers", 45, -1 },
+   { "glNamedFramebufferReadBuffer", 45, -1 },
+   { "glInvalidateNamedFramebufferSubData", 45, -1 },
+   { "glInvalidateNamedFramebufferData", 45, -1 },
+   { "glClearNamedFramebufferiv", 45, -1 },
+   { "glClearNamedFramebufferuiv", 45, -1 },
+   { "glClearNamedFramebufferfv", 45, -1 },
+   { "glClearNamedFramebufferfi", 45, -1 },
+   { "glBlitNamedFramebuffer", 45, -1 },
+   { "glCheckNamedFramebufferStatus", 45, -1 },
+   { "glGetNamedFramebufferParameteriv", 45, -1 },
+   { "glGetNamedFramebufferAttachmentParameteriv", 45, -1 },
    { "glCreateRenderbuffers", 45, -1 },
    { "glNamedRenderbufferStorage", 45, -1 },
    { "glNamedRenderbufferStorageMultisample", 45, -1 },
@@ -1039,9 +1831,6 @@ const struct function gl_core_functions_possible[] = {
    { "glGetQueryBufferObjecti64v", 45, -1 },
    { "glGetQueryBufferObjectui64v", 45, -1 },
 
-   /* GL_EXT_polygon_offset_clamp */
-   { "glPolygonOffsetClampEXT", 11, -1 },
-
    { NULL, 0, -1 }
 };
 
@@ -1596,3 +2385,88 @@ const struct function gles3_functions_possible[] = {
 
    { NULL, 0, -1 }
 };
+
+const struct function gles31_functions_possible[] = {
+   { "glDispatchCompute", 31, -1 },
+   { "glDispatchComputeIndirect", 31, -1 },
+   { "glDrawArraysIndirect", 31, -1 },
+   { "glDrawElementsIndirect", 31, -1 },
+
+   // FINISHME: These two functions have not been implemented yet.  They come
+   // FINISHME: from the ARB_framebuffer_no_attachments extension.
+   // { "glFramebufferParameteri", 31, -1 },
+   // { "glGetFramebufferParameteriv", 31, -1 },
+
+   { "glGetProgramInterfaceiv", 31, -1 },
+   { "glGetProgramResourceIndex", 31, -1 },
+   { "glGetProgramResourceName", 31, -1 },
+   { "glGetProgramResourceiv", 31, -1 },
+   { "glGetProgramResourceLocation", 31, -1 },
+
+   // We check for the aliased EXT versions in GLES 2
+   // { "glUseProgramStages", 31, -1 },
+   // { "glActiveShaderProgram", 31, -1 },
+   // { "glCreateShaderProgramv", 31, -1 },
+   // { "glBindProgramPipeline", 31, -1 },
+   // { "glDeleteProgramPipelines", 31, -1 },
+   // { "glGenProgramPipelines", 31, -1 },
+   // { "glIsProgramPipeline", 31, -1 },
+   // { "glGetProgramPipelineiv", 31, -1 },
+   // { "glProgramUniform1i", 31, -1 },
+   // { "glProgramUniform2i", 31, -1 },
+   // { "glProgramUniform3i", 31, -1 },
+   // { "glProgramUniform4i", 31, -1 },
+   // { "glProgramUniform1f", 31, -1 },
+   // { "glProgramUniform2f", 31, -1 },
+   // { "glProgramUniform3f", 31, -1 },
+   // { "glProgramUniform4f", 31, -1 },
+   // { "glProgramUniform1iv", 31, -1 },
+   // { "glProgramUniform2iv", 31, -1 },
+   // { "glProgramUniform3iv", 31, -1 },
+   // { "glProgramUniform4iv", 31, -1 },
+   // { "glProgramUniform1fv", 31, -1 },
+   // { "glProgramUniform2fv", 31, -1 },
+   // { "glProgramUniform3fv", 31, -1 },
+   // { "glProgramUniform4fv", 31, -1 },
+   // { "glProgramUniformMatrix2fv", 31, -1 },
+   // { "glProgramUniformMatrix3fv", 31, -1 },
+   // { "glProgramUniformMatrix4fv", 31, -1 },
+   // { "glProgramUniformMatrix2x3fv", 31, -1 },
+   // { "glProgramUniformMatrix3x2fv", 31, -1 },
+   // { "glProgramUniformMatrix2x4fv", 31, -1 },
+   // { "glProgramUniformMatrix4x2fv", 31, -1 },
+   // { "glProgramUniformMatrix3x4fv", 31, -1 },
+   // { "glProgramUniformMatrix4x3fv", 31, -1 },
+   // { "glValidateProgramPipeline", 31, -1 },
+   // { "glGetProgramPipelineInfoLog", 31, -1 },
+
+   // We check for the aliased EXT versions in GLES 3
+   // { "glProgramUniform1ui", 31, -1 },
+   // { "glProgramUniform2ui", 31, -1 },
+   // { "glProgramUniform3ui", 31, -1 },
+   // { "glProgramUniform4ui", 31, -1 },
+   // { "glProgramUniform1uiv", 31, -1 },
+   // { "glProgramUniform2uiv", 31, -1 },
+   // { "glProgramUniform3uiv", 31, -1 },
+   // { "glProgramUniform4uiv", 31, -1 },
+
+   { "glBindImageTexture", 31, -1 },
+   { "glGetBooleani_v", 31, -1 },
+   { "glMemoryBarrier", 31, -1 },
+
+   // FINISHME: This function has not been implemented yet.
+   // { "glMemoryBarrierByRegion", 31, -1 },
+
+   { "glTexStorage2DMultisample", 31, -1 },
+   { "glGetMultisamplefv", 31, -1 },
+   { "glSampleMaski", 31, -1 },
+   { "glGetTexLevelParameteriv", 31, -1 },
+   { "glGetTexLevelParameterfv", 31, -1 },
+   { "glBindVertexBuffer", 31, -1 },
+   { "glVertexAttribFormat", 31, -1 },
+   { "glVertexAttribIFormat", 31, -1 },
+   { "glVertexAttribBinding", 31, -1 },
+   { "glVertexBindingDivisor", 31, -1 },
+
+   { NULL, 0, -1 },
+ };
diff --git a/src/mesa/main/texenv.c b/src/mesa/main/texenv.c
index ec521e6c6e5..3edafc0f776 100644
--- a/src/mesa/main/texenv.c
+++ b/src/mesa/main/texenv.c
@@ -646,7 +646,7 @@ _mesa_GetTexEnvfv( GLenum target, GLenum pname, GLfloat *params )
       if (pname == GL_TEXTURE_ENV_COLOR) {
          if(ctx->NewState & (_NEW_BUFFERS | _NEW_FRAG_CLAMP))
             _mesa_update_state(ctx);
-         if (_mesa_get_clamp_fragment_color(ctx))
+         if (_mesa_get_clamp_fragment_color(ctx, ctx->DrawBuffer))
             COPY_4FV( params, texUnit->EnvColor );
          else
             COPY_4FV( params, texUnit->EnvColorUnclamped );
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 7bc1da7f805..3d85615fa45 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -222,7 +222,7 @@ _mesa_base_tex_format( struct gl_context *ctx, GLint internalFormat )
       }
    }
 
-   if (ctx->Extensions.ARB_stencil_texturing) {
+   if (ctx->Extensions.ARB_texture_stencil8) {
       switch (internalFormat) {
       case GL_STENCIL_INDEX:
       case GL_STENCIL_INDEX1:
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index b5d42d3047f..d74134f41b1 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -1709,7 +1709,7 @@ get_tex_parameterfv(struct gl_context *ctx,
 
          if (ctx->NewState & (_NEW_BUFFERS | _NEW_FRAG_CLAMP))
             _mesa_update_state_locked(ctx);
-         if (_mesa_get_clamp_fragment_color(ctx)) {
+         if (_mesa_get_clamp_fragment_color(ctx, ctx->DrawBuffer)) {
             params[0] = CLAMP(obj->Sampler.BorderColor.f[0], 0.0F, 1.0F);
             params[1] = CLAMP(obj->Sampler.BorderColor.f[1], 0.0F, 1.0F);
             params[2] = CLAMP(obj->Sampler.BorderColor.f[2], 0.0F, 1.0F);
diff --git a/src/mesa/main/textureview.c b/src/mesa/main/textureview.c
index cd87a27d2db..6b0aed4ea1a 100644
--- a/src/mesa/main/textureview.c
+++ b/src/mesa/main/textureview.c
@@ -167,7 +167,7 @@ static const struct internal_format_class_info s3tc_compatible_internal_formats[
  * \return VIEW_CLASS if internalformat found in table, false otherwise.
  */
 static GLenum
-lookup_view_class(struct gl_context *ctx, GLenum internalformat)
+lookup_view_class(const struct gl_context *ctx, GLenum internalformat)
 {
    GLuint i;
 
@@ -176,9 +176,11 @@ lookup_view_class(struct gl_context *ctx, GLenum internalformat)
          return compatible_internal_formats[i].view_class;
    }
 
-   if (ctx->Extensions.EXT_texture_compression_s3tc && ctx->Extensions.EXT_texture_sRGB) {
+   if (ctx->Extensions.EXT_texture_compression_s3tc &&
+       ctx->Extensions.EXT_texture_sRGB) {
       for (i = 0; i < ARRAY_SIZE(s3tc_compatible_internal_formats); i++) {
-         if (s3tc_compatible_internal_formats[i].internal_format == internalformat)
+         if (s3tc_compatible_internal_formats[i].internal_format
+             == internalformat)
             return s3tc_compatible_internal_formats[i].view_class;
       }
    }
@@ -226,7 +228,8 @@ initialize_texture_fields(struct gl_context *ctx,
                                     0, internalFormat, texFormat);
       }
 
-      _mesa_next_mipmap_level_size(target, 0, levelWidth, levelHeight, levelDepth,
+      _mesa_next_mipmap_level_size(target, 0,
+                                   levelWidth, levelHeight, levelDepth,
                                    &levelWidth, &levelHeight, &levelDepth);
    }
 
@@ -320,8 +323,8 @@ target_valid(struct gl_context *ctx, GLenum origTarget, GLenum newTarget)
  * If an error is found, record it with _mesa_error()
  * \return false if any error, true otherwise.
  */
-GLboolean
-_mesa_texture_view_compatible_format(struct gl_context *ctx,
+bool
+_mesa_texture_view_compatible_format(const struct gl_context *ctx,
                                      GLenum origInternalFormat,
                                      GLenum newInternalFormat)
 {
@@ -334,15 +337,16 @@ _mesa_texture_view_compatible_format(struct gl_context *ctx,
     * or an INVALID_OPERATION error is generated.
     */
    if (origInternalFormat == newInternalFormat)
-      return GL_TRUE;
+      return true;
 
    origViewClass = lookup_view_class(ctx, origInternalFormat);
    newViewClass = lookup_view_class(ctx, newInternalFormat);
    if ((origViewClass == newViewClass) && origViewClass != false)
-      return GL_TRUE;
+      return true;
 
-   return GL_FALSE;
+   return false;
 }
+
 /**
  * Helper function for TexStorage and teximagemultisample to set immutable
  * texture state needed by ARB_texture_view.
@@ -357,17 +361,19 @@ _mesa_set_texture_view_state(struct gl_context *ctx,
    /* Get a reference to what will become this View's base level */
    texImage = _mesa_select_tex_image(texObj, target, 0);
 
-   /* When an immutable texture is created via glTexStorage or glTexImageMultisample,
+   /* When an immutable texture is created via glTexStorage or
+    * glTexImageMultisample,
     * TEXTURE_IMMUTABLE_FORMAT becomes TRUE.
     * TEXTURE_IMMUTABLE_LEVELS and TEXTURE_VIEW_NUM_LEVELS become levels.
     * If the texture target is TEXTURE_1D_ARRAY then
     * TEXTURE_VIEW_NUM_LAYERS becomes height.
     * If the texture target is TEXTURE_2D_ARRAY, TEXTURE_CUBE_MAP_ARRAY,
-    * or TEXTURE_2D_MULTISAMPLE_ARRAY then TEXTURE_VIEW_NUM_LAYERS becomes depth.
+    * or TEXTURE_2D_MULTISAMPLE_ARRAY then TEXTURE_VIEW_NUM_LAYERS becomes
+    * depth.
     * If the texture target is TEXTURE_CUBE_MAP, then
     * TEXTURE_VIEW_NUM_LAYERS becomes 6.
     * For any other texture target, TEXTURE_VIEW_NUM_LAYERS becomes 1.
-    * 
+    *
     * ARB_texture_multisample: Multisample textures do
     * not have multiple image levels.
     */
@@ -401,7 +407,6 @@ _mesa_set_texture_view_state(struct gl_context *ctx,
    case GL_TEXTURE_CUBE_MAP:
       texObj->NumLayers = 6;
       break;
-
    }
 }
 
@@ -435,16 +440,20 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
                   minlevel, numlevels, minlayer, numlayers);
 
    if (origtexture == 0) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glTextureView(origtexture = %u)", origtexture);
+      _mesa_error(ctx, GL_INVALID_VALUE, "glTextureView(origtexture = %u)",
+                  origtexture);
       return;
    }
 
    /* Need original texture information to validate arguments */
    origTexObj = _mesa_lookup_texture(ctx, origtexture);
 
-   /* If <origtexture> is not the name of a texture, INVALID_VALUE is generated. */
+   /* If <origtexture> is not the name of a texture, INVALID_VALUE
+    * is generated.
+    */
    if (!origTexObj) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glTextureView(origtexture = %u)", origtexture);
+      _mesa_error(ctx, GL_INVALID_VALUE, "glTextureView(origtexture = %u)",
+                  origtexture);
       return;
    }
 
@@ -452,7 +461,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
     * INVALID_OPERATION is generated.
     */
    if (!origTexObj->Immutable) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureView(origtexture not immutable)");
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glTextureView(origtexture not immutable)");
       return;
    }
 
@@ -467,7 +477,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
     */
    texObj = _mesa_lookup_texture(ctx, texture);
    if (texObj == NULL) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureView(texture = %u non-gen name)", texture);
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glTextureView(texture = %u non-gen name)", texture);
       return;
    }
 
@@ -475,7 +486,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
     * the error INVALID_OPERATION is generated.
     */
    if (texObj->Target) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureView(texture = %u already bound)", texture);
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glTextureView(texture = %u already bound)", texture);
       return;
    }
 
@@ -484,33 +496,35 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
       return; /* error was recorded */
    }
 
-   /* minlevel and minlayer are relative to the view of origtexture
+   /* minlevel and minlayer are relative to the view of origtexture.
     * If minlevel or minlayer is greater than level or layer, respectively,
-    * of origtexture return INVALID_VALUE.
+    * return INVALID_VALUE.
     */
    newViewMinLevel = origTexObj->MinLevel + minlevel;
    newViewMinLayer = origTexObj->MinLayer + minlayer;
    if (newViewMinLevel >= (origTexObj->MinLevel + origTexObj->NumLevels)) {
       _mesa_error(ctx, GL_INVALID_VALUE,
-                  "glTextureView(new minlevel (%d) > orig minlevel (%d) + orig numlevels (%d))",
+                  "glTextureView(new minlevel (%d) > orig minlevel (%d)"
+                  " + orig numlevels (%d))",
                   newViewMinLevel, origTexObj->MinLevel, origTexObj->NumLevels);
       return;
    }
 
    if (newViewMinLayer >= (origTexObj->MinLayer + origTexObj->NumLayers)) {
       _mesa_error(ctx, GL_INVALID_VALUE,
-                  "glTextureView(new minlayer (%d) > orig minlayer (%d) + orig numlayers (%d))",
+                  "glTextureView(new minlayer (%d) > orig minlayer (%d)"
+                  " + orig numlayers (%d))",
                   newViewMinLayer, origTexObj->MinLayer, origTexObj->NumLayers);
       return;
    }
 
    if (!_mesa_texture_view_compatible_format(ctx,
-                                             origTexObj->Image[0][0]->InternalFormat,
-                                             internalformat)) {
+                                   origTexObj->Image[0][0]->InternalFormat,
+                                   internalformat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glTextureView(internalformat %s not compatible with origtexture %s)",
-                  _mesa_lookup_enum_by_nr(internalformat),
-                  _mesa_lookup_enum_by_nr(origTexObj->Image[0][0]->InternalFormat));
+          "glTextureView(internalformat %s not compatible with origtexture %s)",
+          _mesa_lookup_enum_by_nr(internalformat),
+          _mesa_lookup_enum_by_nr(origTexObj->Image[0][0]->InternalFormat));
       return;
    }
 
@@ -569,14 +583,16 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
    dimensionsOK = _mesa_legal_texture_dimensions(ctx, target, 0,
                                                  width, height, depth, 0);
    if (!dimensionsOK) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureView(invalid width or height or depth)");
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glTextureView(invalid width or height or depth)");
       return;
    }
 
    sizeOK = ctx->Driver.TestProxyTexImage(ctx, target, 0, texFormat,
                                           width, height, depth, 0);
    if (!sizeOK) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureView(invalid texture size)");
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glTextureView(invalid texture size)");
       return;
    }
 
@@ -591,17 +607,19 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
    case GL_TEXTURE_RECTANGLE:
    case GL_TEXTURE_2D_MULTISAMPLE:
       if (numlayers != 1) {
-         _mesa_error(ctx, GL_INVALID_VALUE, "glTextureView(numlayers %d != 1)", numlayers);
+         _mesa_error(ctx, GL_INVALID_VALUE, "glTextureView(numlayers %d != 1)",
+                     numlayers);
          return;
       }
       break;
 
    case GL_TEXTURE_CUBE_MAP:
-      /* If the new texture's target is TEXTURE_CUBE_MAP, the clamped <numlayers>
-       * must be equal to 6.
+      /* If the new texture's target is TEXTURE_CUBE_MAP, the clamped
+       * <numlayers> must be equal to 6.
        */
       if (newViewNumLayers != 6) {
-         _mesa_error(ctx, GL_INVALID_VALUE, "glTextureView(clamped numlayers %d != 6)",
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "glTextureView(clamped numlayers %d != 6)",
                      newViewNumLayers);
          return;
       }
@@ -615,7 +633,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
        */
       if ((newViewNumLayers % 6) != 0) {
          _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glTextureView(clamped numlayers %d is not a multiple of 6)",
+                     "glTextureView(clamped numlayers %d is not"
+                     " a multiple of 6)",
                      newViewNumLayers);
          return;
       }
@@ -628,7 +647,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
     */
    if ((target == GL_TEXTURE_CUBE_MAP || target == GL_TEXTURE_CUBE_MAP_ARRAY) &&
        (origTexImage->Width != origTexImage->Height)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureView(origtexture width (%d) != height (%d))",
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glTextureView(origtexture width (%d) != height (%d))",
                   origTexImage->Width, origTexImage->Height);
       return;
    }
@@ -662,7 +682,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
    texObj->ImmutableLevels = origTexObj->ImmutableLevels;
    texObj->Target = target;
 
-   if (ctx->Driver.TextureView != NULL && !ctx->Driver.TextureView(ctx, texObj, origTexObj)) {
+   if (ctx->Driver.TextureView != NULL &&
+       !ctx->Driver.TextureView(ctx, texObj, origTexObj)) {
       return; /* driver recorded error */
    }
 }
diff --git a/src/mesa/main/textureview.h b/src/mesa/main/textureview.h
index 549a13cd809..59e24b68dd0 100644
--- a/src/mesa/main/textureview.h
+++ b/src/mesa/main/textureview.h
@@ -29,8 +29,8 @@
 #ifndef TEXTUREVIEW_H
 #define TEXTUREVIEW_H
 
-GLboolean
-_mesa_texture_view_compatible_format(struct gl_context *ctx,
+bool
+_mesa_texture_view_compatible_format(const struct gl_context *ctx,
                                      GLenum origInternalFormat,
                                      GLenum newInternalFormat);
 
@@ -41,7 +41,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
                   GLuint minlayer, GLuint numlayers);
 
 extern void
-_mesa_set_texture_view_state(struct gl_context *ctx, struct gl_texture_object *texObj,
-                       GLenum target, GLuint levels);
+_mesa_set_texture_view_state(struct gl_context *ctx,
+                             struct gl_texture_object *texObj,
+                             GLenum target, GLuint levels);
 
 #endif /* TEXTUREVIEW_H */
diff --git a/src/mesa/main/uniform_query.cpp b/src/mesa/main/uniform_query.cpp
index 728bd1bac10..cab5083e81b 100644
--- a/src/mesa/main/uniform_query.cpp
+++ b/src/mesa/main/uniform_query.cpp
@@ -237,6 +237,13 @@ validate_uniform_parameters(struct gl_context *ctx,
 
    struct gl_uniform_storage *const uni = shProg->UniformRemapTable[location];
 
+   /* Even though no location is assigned to a built-in uniform and this
+    * function should already have returned NULL, this test makes it explicit
+    * that we are not allowing to update the value of a built-in.
+    */
+   if (uni->builtin)
+      return NULL;
+
    if (uni->array_elements == 0) {
       if (count > 1) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
@@ -1028,6 +1035,10 @@ _mesa_get_uniform_location(struct gl_shader_program *shProg,
    if (!found)
       return GL_INVALID_INDEX;
 
+   /* If the uniform is built-in, fail. */
+   if (shProg->UniformStorage[location].builtin)
+      return GL_INVALID_INDEX;
+
    /* If the uniform is an array, fail if the index is out of bounds.
     * (A negative index is caught above.)  This also fails if the uniform
     * is not an array, but the user is trying to index it, because
@@ -1047,7 +1058,7 @@ _mesa_sampler_uniforms_are_valid(const struct gl_shader_program *shProg,
 				 char *errMsg, size_t errMsgLength)
 {
    /* Shader does not have samplers. */
-   if (shProg->NumUserUniformStorage == 0)
+   if (shProg->NumUniformStorage == 0)
       return true;
 
    if (!shProg->SamplersValidated) {
@@ -1087,7 +1098,7 @@ _mesa_sampler_uniforms_pipeline_are_valid(struct gl_pipeline_object *pipeline)
       if (!shProg[idx])
          continue;
 
-      for (unsigned i = 0; i < shProg[idx]->NumUserUniformStorage; i++) {
+      for (unsigned i = 0; i < shProg[idx]->NumUniformStorage; i++) {
          const struct gl_uniform_storage *const storage =
             &shProg[idx]->UniformStorage[i];
          const glsl_type *const t = (storage->type->is_array())
diff --git a/src/mesa/main/uniforms.h b/src/mesa/main/uniforms.h
index 55fa2357e38..bd7b05e207a 100644
--- a/src/mesa/main/uniforms.h
+++ b/src/mesa/main/uniforms.h
@@ -343,10 +343,6 @@ void GLAPIENTRY
 _mesa_ProgramUniformMatrix4x3dv(GLuint program, GLint location, GLsizei count,
                                 GLboolean transpose, const GLdouble *value);
 
-long
-_mesa_parse_program_resource_name(const GLchar *name,
-                                  const GLchar **out_base_name_end);
-
 unsigned
 _mesa_get_uniform_location(struct gl_shader_program *shProg,
 			   const GLchar *name, unsigned *offset);
diff --git a/src/mesa/main/varray.c b/src/mesa/main/varray.c
index 7389037ae85..ebdd9eaf02e 100644
--- a/src/mesa/main/varray.c
+++ b/src/mesa/main/varray.c
@@ -2309,10 +2309,10 @@ print_array(const char *name, GLint index, const struct gl_client_array *array)
       fprintf(stderr, "  %s[%d]: ", name, index);
    else
       fprintf(stderr, "  %s: ", name);
-   fprintf(stderr, "Ptr=%p, Type=0x%x, Size=%d, ElemSize=%u, Stride=%d, Buffer=%u(Size %lu)\n",
-	  array->Ptr, array->Type, array->Size,
-	  array->_ElementSize, array->StrideB,
-	  array->BufferObj->Name, (unsigned long) array->BufferObj->Size);
+   fprintf(stderr, "Ptr=%p, Type=%s, Size=%d, ElemSize=%u, Stride=%d, Buffer=%u(Size %lu)\n",
+           array->Ptr, _mesa_lookup_enum_by_nr(array->Type), array->Size,
+           array->_ElementSize, array->StrideB, array->BufferObj->Name,
+           (unsigned long) array->BufferObj->Size);
 }
 
 
diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c
index 699a0de46c2..8bc00ace5c4 100644
--- a/src/mesa/main/version.c
+++ b/src/mesa/main/version.c
@@ -51,31 +51,51 @@ check_for_ending(const char *string, const char *ending)
  * fwd_context is only valid if version > 0
  */
 static void
-get_gl_override(int *version, bool *fwd_context, bool *compat_context)
+get_gl_override(gl_api api, int *version, bool *fwd_context,
+                bool *compat_context)
 {
-   const char *env_var = "MESA_GL_VERSION_OVERRIDE";
+   const char *env_var = (api == API_OPENGL_CORE || api == API_OPENGL_COMPAT)
+      ? "MESA_GL_VERSION_OVERRIDE" : "MESA_GLES_VERSION_OVERRIDE";
    const char *version_str;
    int major, minor, n;
-   static int override_version = -1;
-   static bool fc_suffix = false;
-   static bool compat_suffix = false;
+   static struct override_info {
+      int version;
+      bool fc_suffix;
+      bool compat_suffix;
+   } override[] = {
+      { -1, false, false},
+      { -1, false, false},
+      { -1, false, false},
+      { -1, false, false},
+   };
 
-   if (override_version < 0) {
-      override_version = 0;
+   STATIC_ASSERT(ARRAY_SIZE(override) == API_OPENGL_LAST + 1);
+
+   if (api == API_OPENGLES)
+      goto exit;
+
+   if (override[api].version < 0) {
+      override[api].version = 0;
 
       version_str = getenv(env_var);
       if (version_str) {
-         fc_suffix = check_for_ending(version_str, "FC");
-         compat_suffix = check_for_ending(version_str, "COMPAT");
+         override[api].fc_suffix = check_for_ending(version_str, "FC");
+         override[api].compat_suffix = check_for_ending(version_str, "COMPAT");
 
          n = sscanf(version_str, "%u.%u", &major, &minor);
          if (n != 2) {
             fprintf(stderr, "error: invalid value for %s: %s\n",
                     env_var, version_str);
-            override_version = 0;
+            override[api].version = 0;
          } else {
-            override_version = major * 10 + minor;
-            if (override_version < 30 && fc_suffix) {
+            override[api].version = major * 10 + minor;
+
+            /* There is no such thing as compatibility or forward-compatible for
+             * OpenGL ES 2.0 or 3.x APIs.
+             */
+            if ((override[api].version < 30 && override[api].fc_suffix) ||
+                (api == API_OPENGLES2 && (override[api].fc_suffix ||
+                                          override[api].compat_suffix))) {
                fprintf(stderr, "error: invalid value for %s: %s\n",
                        env_var, version_str);
             }
@@ -83,9 +103,10 @@ get_gl_override(int *version, bool *fwd_context, bool *compat_context)
       }
    }
 
-   *version = override_version;
-   *fwd_context = fc_suffix;
-   *compat_context = compat_suffix;
+exit:
+   *version = override[api].version;
+   *fwd_context = override[api].fc_suffix;
+   *compat_context = override[api].compat_suffix;
 }
 
 /**
@@ -130,18 +151,26 @@ _mesa_override_gl_version_contextless(struct gl_constants *consts,
    int version;
    bool fwd_context, compat_context;
 
-   get_gl_override(&version, &fwd_context, &compat_context);
+   get_gl_override(*apiOut, &version, &fwd_context, &compat_context);
 
    if (version > 0) {
       *versionOut = version;
-      if (version >= 30 && fwd_context) {
-         *apiOut = API_OPENGL_CORE;
-         consts->ContextFlags |= GL_CONTEXT_FLAG_FORWARD_COMPATIBLE_BIT;
-      } else if (version >= 31 && !compat_context) {
-         *apiOut = API_OPENGL_CORE;
-      } else {
-         *apiOut = API_OPENGL_COMPAT;
+
+      /* If the API is a desktop API, adjust the context flags.  We may also
+       * need to modify the API depending on the version.  For example, Mesa
+       * does not support a GL 3.3 compatibility profile.
+       */
+      if (*apiOut == API_OPENGL_CORE || *apiOut == API_OPENGL_COMPAT) {
+         if (version >= 30 && fwd_context) {
+            *apiOut = API_OPENGL_CORE;
+            consts->ContextFlags |= GL_CONTEXT_FLAG_FORWARD_COMPATIBLE_BIT;
+         } else if (version >= 31 && !compat_context) {
+            *apiOut = API_OPENGL_CORE;
+         } else {
+            *apiOut = API_OPENGL_COMPAT;
+         }
       }
+
       return true;
    }
    return false;
@@ -157,22 +186,6 @@ _mesa_override_gl_version(struct gl_context *ctx)
 }
 
 /**
- * Returns the gl override value
- *
- * version > 0 indicates there is an override requested
- */
-int
-_mesa_get_gl_version_override(void)
-{
-   int version;
-   bool fwd_context, compat_context;
-
-   get_gl_override(&version, &fwd_context, &compat_context);
-
-   return version;
-}
-
-/**
  * Override the context's GLSL version if the environment variable
  * MESA_GLSL_VERSION_OVERRIDE is set. Valid values for
  * MESA_GLSL_VERSION_OVERRIDE are integers, such as "130".
@@ -433,7 +446,23 @@ compute_version_es2(const struct gl_extensions *extensions)
                          extensions->EXT_texture_snorm &&
                          extensions->NV_primitive_restart &&
                          extensions->OES_depth_texture_cube_map);
-   if (ver_3_0) {
+   const bool ver_3_1 = (ver_3_0 &&
+                         extensions->ARB_arrays_of_arrays &&
+                         extensions->ARB_compute_shader &&
+                         extensions->ARB_draw_indirect &&
+                         false /*extensions->ARB_framebuffer_no_attachments*/ &&
+                         extensions->ARB_shader_atomic_counters &&
+                         extensions->ARB_shader_image_load_store &&
+                         false /*extensions->ARB_shader_image_size*/ &&
+                         false /*extensions->ARB_shader_storage_buffer_object*/ &&
+                         extensions->ARB_shading_language_packing &&
+                         extensions->ARB_stencil_texturing &&
+                         extensions->ARB_gpu_shader5 &&
+                         extensions->EXT_shader_integer_mix);
+
+   if (ver_3_1) {
+      return 31;
+   } else if (ver_3_0) {
       return 30;
    } else if (ver_2_0) {
       return 20;
diff --git a/src/mesa/main/version.h b/src/mesa/main/version.h
index 450a0e31d3d..ee7cb7501eb 100644
--- a/src/mesa/main/version.h
+++ b/src/mesa/main/version.h
@@ -47,7 +47,4 @@ _mesa_override_gl_version(struct gl_context *ctx);
 extern void
 _mesa_override_glsl_version(struct gl_constants *consts);
 
-extern int
-_mesa_get_gl_version_override(void);
-
 #endif /* VERSION_H */
diff --git a/src/mesa/main/vtxfmt.c b/src/mesa/main/vtxfmt.c
index d7ef7e278cd..81bf4c589ea 100644
--- a/src/mesa/main/vtxfmt.c
+++ b/src/mesa/main/vtxfmt.c
@@ -207,7 +207,7 @@ install_vtxfmt(struct gl_context *ctx, struct _glapi_table *tab,
       SET_VertexAttribP4uiv(tab, vfmt->VertexAttribP4uiv);
    }
 
-   if (_mesa_is_desktop_gl(ctx)) {
+   if (ctx->API == API_OPENGL_CORE) {
       SET_VertexAttribL1d(tab, vfmt->VertexAttribL1d);
       SET_VertexAttribL2d(tab, vfmt->VertexAttribL2d);
       SET_VertexAttribL3d(tab, vfmt->VertexAttribL3d);
diff --git a/src/mesa/program/dummy_errors.c b/src/mesa/program/dummy_errors.c
new file mode 100644
index 00000000000..d69f54d1d05
--- /dev/null
+++ b/src/mesa/program/dummy_errors.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <stdio.h>
+#include "main/errors.h"
+
+void
+_mesa_error_no_memory(const char *caller)
+{
+   fprintf(stderr, "Mesa error: out of memory in %s", caller);
+}
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index fceed712bdb..3bffe90ff1f 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -262,6 +262,7 @@ public:
    virtual void visit(ir_if *);
    virtual void visit(ir_emit_vertex *);
    virtual void visit(ir_end_primitive *);
+   virtual void visit(ir_barrier *);
    /*@}*/
 
    src_reg result;
@@ -405,7 +406,7 @@ ir_to_mesa_visitor::emit_dp(ir_instruction *ir,
 			    dst_reg dst, src_reg src0, src_reg src1,
 			    unsigned elements)
 {
-   static const gl_inst_opcode dot_opcodes[] = {
+   static const enum prog_opcode dot_opcodes[] = {
       OPCODE_DP2, OPCODE_DP3, OPCODE_DP4
    };
 
@@ -2118,6 +2119,12 @@ ir_to_mesa_visitor::visit(ir_end_primitive *)
    assert(!"Geometry shaders not supported.");
 }
 
+void
+ir_to_mesa_visitor::visit(ir_barrier *)
+{
+   unreachable("GLSL barrier() not supported.");
+}
+
 ir_to_mesa_visitor::ir_to_mesa_visitor()
 {
    result.file = PROGRAM_UNDEFINED;
@@ -2407,9 +2414,14 @@ _mesa_associate_uniform_storage(struct gl_context *ctx,
       if (!found)
 	 continue;
 
+      struct gl_uniform_storage *storage =
+         &shader_program->UniformStorage[location];
+
+      /* Do not associate any uniform storage to built-in uniforms */
+      if (storage->builtin)
+         continue;
+
       if (location != last_location) {
-	 struct gl_uniform_storage *storage =
-	    &shader_program->UniformStorage[location];
 	 enum gl_uniform_driver_format format = uniform_native;
 
 	 unsigned columns = 0;
@@ -2722,7 +2734,7 @@ get_mesa_program(struct gl_context *ctx,
       mesa_inst->Opcode = inst->op;
       mesa_inst->CondUpdate = inst->cond_update;
       if (inst->saturate)
-	 mesa_inst->SaturateMode = SATURATE_ZERO_ONE;
+	 mesa_inst->Saturate = GL_TRUE;
       mesa_inst->DstReg.File = inst->dst.file;
       mesa_inst->DstReg.Index = inst->dst.index;
       mesa_inst->DstReg.CondMask = inst->dst.cond_mask;
diff --git a/src/mesa/program/prog_execute.c b/src/mesa/program/prog_execute.c
index 16e8e340d8d..46260b54882 100644
--- a/src/mesa/program/prog_execute.c
+++ b/src/mesa/program/prog_execute.c
@@ -397,7 +397,7 @@ store_vector4(const struct prog_instruction *inst,
               struct gl_program_machine *machine, const GLfloat value[4])
 {
    const struct prog_dst_register *dstReg = &(inst->DstReg);
-   const GLboolean clamp = inst->SaturateMode == SATURATE_ZERO_ONE;
+   const GLboolean clamp = inst->Saturate;
    GLuint writeMask = dstReg->WriteMask;
    GLfloat clampedValue[4];
    GLfloat *dst = get_dst_register_pointer(dstReg, machine);
diff --git a/src/mesa/program/prog_instruction.c b/src/mesa/program/prog_instruction.c
index f9ebe4e8fd2..21ef35337f6 100644
--- a/src/mesa/program/prog_instruction.c
+++ b/src/mesa/program/prog_instruction.c
@@ -55,7 +55,7 @@ _mesa_init_instructions(struct prog_instruction *inst, GLuint count)
       inst[i].DstReg.CondMask = COND_TR;
       inst[i].DstReg.CondSwizzle = SWIZZLE_NOOP;
 
-      inst[i].SaturateMode = SATURATE_OFF;
+      inst[i].Saturate = GL_FALSE;
       inst[i].Precision = FLOAT32;
    }
 }
@@ -114,7 +114,7 @@ _mesa_free_instructions(struct prog_instruction *inst, GLuint count)
  */
 struct instruction_info
 {
-   gl_inst_opcode Opcode;
+   enum prog_opcode Opcode;
    const char *Name;
    GLuint NumSrcRegs;
    GLuint NumDstRegs;
@@ -198,7 +198,7 @@ static const struct instruction_info InstInfo[MAX_OPCODE] = {
  * Return the number of src registers for the given instruction/opcode.
  */
 GLuint
-_mesa_num_inst_src_regs(gl_inst_opcode opcode)
+_mesa_num_inst_src_regs(enum prog_opcode opcode)
 {
    assert(opcode < MAX_OPCODE);
    assert(opcode == InstInfo[opcode].Opcode);
@@ -211,7 +211,7 @@ _mesa_num_inst_src_regs(gl_inst_opcode opcode)
  * Return the number of dst registers for the given instruction/opcode.
  */
 GLuint
-_mesa_num_inst_dst_regs(gl_inst_opcode opcode)
+_mesa_num_inst_dst_regs(enum prog_opcode opcode)
 {
    assert(opcode < MAX_OPCODE);
    assert(opcode == InstInfo[opcode].Opcode);
@@ -221,7 +221,7 @@ _mesa_num_inst_dst_regs(gl_inst_opcode opcode)
 
 
 GLboolean
-_mesa_is_tex_instruction(gl_inst_opcode opcode)
+_mesa_is_tex_instruction(enum prog_opcode opcode)
 {
    return (opcode == OPCODE_TEX ||
            opcode == OPCODE_TXB ||
@@ -285,7 +285,7 @@ _mesa_check_soa_dependencies(const struct prog_instruction *inst)
  * Return string name for given program opcode.
  */
 const char *
-_mesa_opcode_string(gl_inst_opcode opcode)
+_mesa_opcode_string(enum prog_opcode opcode)
 {
    if (opcode < MAX_OPCODE)
       return InstInfo[opcode].Name;
diff --git a/src/mesa/program/prog_instruction.h b/src/mesa/program/prog_instruction.h
index 96da198f86d..d56f96cfaa1 100644
--- a/src/mesa/program/prog_instruction.h
+++ b/src/mesa/program/prog_instruction.h
@@ -118,15 +118,6 @@
 
 
 /**
- * Saturation modes when storing values.
- */
-/*@{*/
-#define SATURATE_OFF            0
-#define SATURATE_ZERO_ONE       1
-/*@}*/
-
-
-/**
  * Per-component negation masks
  */
 /*@{*/
@@ -143,7 +134,7 @@
 /**
  * Program instruction opcodes for vertex, fragment and geometry programs.
  */
-typedef enum prog_opcode {
+enum prog_opcode {
                      /* ARB_vp   ARB_fp   NV_vp   NV_fp     GLSL */
                      /*------------------------------------------*/
    OPCODE_NOP = 0,   /*                                      X   */
@@ -213,7 +204,7 @@ typedef enum prog_opcode {
    OPCODE_TRUNC,     /*                                      X   */
    OPCODE_XPD,       /*   X        X                             */
    MAX_OPCODE
-} gl_inst_opcode;
+};
 
 
 /**
@@ -300,7 +291,7 @@ struct prog_dst_register
  */
 struct prog_instruction
 {
-   gl_inst_opcode Opcode;
+   enum prog_opcode Opcode;
    struct prog_src_register SrcReg[3];
    struct prog_dst_register DstReg;
 
@@ -327,15 +318,12 @@ struct prog_instruction
    GLuint CondDst:1;
 
    /**
-    * Saturate each value of the vectored result to the range [0,1] or the
-    * range [-1,1].  \c SSAT mode (i.e., saturation to the range [-1,1]) is
-    * only available in NV_fragment_program2 mode.
-    * Value is one of the SATURATE_* tokens.
+    * Saturate each value of the vectored result to the range [0,1].
     *
     * \since
     * NV_fragment_program_option, NV_vertex_program3.
     */
-   GLuint SaturateMode:2;
+   GLuint Saturate:1;
 
    /**
     * Per-instruction selectable precision: FLOAT32, FLOAT16, FIXED12.
@@ -368,9 +356,6 @@ struct prog_instruction
     */
    GLint BranchTarget;
 
-   /** for driver use (try to remove someday) */
-   GLint Aux;
-
    /** for debugging purposes */
    const char *Comment;
 };
@@ -394,19 +379,19 @@ extern void
 _mesa_free_instructions(struct prog_instruction *inst, GLuint count);
 
 extern GLuint
-_mesa_num_inst_src_regs(gl_inst_opcode opcode);
+_mesa_num_inst_src_regs(enum prog_opcode opcode);
 
 extern GLuint
-_mesa_num_inst_dst_regs(gl_inst_opcode opcode);
+_mesa_num_inst_dst_regs(enum prog_opcode opcode);
 
 extern GLboolean
-_mesa_is_tex_instruction(gl_inst_opcode opcode);
+_mesa_is_tex_instruction(enum prog_opcode opcode);
 
 extern GLboolean
 _mesa_check_soa_dependencies(const struct prog_instruction *inst);
 
 extern const char *
-_mesa_opcode_string(gl_inst_opcode opcode);
+_mesa_opcode_string(enum prog_opcode opcode);
 
 
 #ifdef __cplusplus
diff --git a/src/mesa/program/prog_optimize.c b/src/mesa/program/prog_optimize.c
index 6d4485acb65..f9e9035fc3e 100644
--- a/src/mesa/program/prog_optimize.c
+++ b/src/mesa/program/prog_optimize.c
@@ -478,7 +478,7 @@ can_upward_mov_be_modifed(const struct prog_instruction *mov)
    return
       can_downward_mov_be_modifed(mov) &&
       mov->DstReg.File == PROGRAM_TEMPORARY &&
-      mov->SaturateMode == SATURATE_OFF;
+      !mov->Saturate;
 }
 
 
@@ -653,7 +653,7 @@ _mesa_merge_mov_into_inst(struct prog_instruction *inst,
    if (mask != (inst->DstReg.WriteMask & mask))
       return GL_FALSE;
 
-   inst->SaturateMode |= mov->SaturateMode;
+   inst->Saturate |= mov->Saturate;
 
    /* Depending on the instruction, we may need to recompute the swizzles.
     * Also, some other instructions (like TEX) are not linear. We will only
diff --git a/src/mesa/program/prog_print.c b/src/mesa/program/prog_print.c
index d588d07ffe4..e4faa63c06f 100644
--- a/src/mesa/program/prog_print.c
+++ b/src/mesa/program/prog_print.c
@@ -600,7 +600,7 @@ _mesa_fprint_alu_instruction(FILE *f,
       fprintf(f, ".C");
 
    /* frag prog only */
-   if (inst->SaturateMode == SATURATE_ZERO_ONE)
+   if (inst->Saturate)
       fprintf(f, "_SAT");
 
    fprintf(f, " ");
@@ -658,7 +658,7 @@ _mesa_fprint_instruction_opt(FILE *f,
    switch (inst->Opcode) {
    case OPCODE_SWZ:
       fprintf(f, "SWZ");
-      if (inst->SaturateMode == SATURATE_ZERO_ONE)
+      if (inst->Saturate)
          fprintf(f, "_SAT");
       fprintf(f, " ");
       fprint_dst_reg(f, &inst->DstReg, mode, prog);
@@ -675,7 +675,7 @@ _mesa_fprint_instruction_opt(FILE *f,
    case OPCODE_TXB:
    case OPCODE_TXD:
       fprintf(f, "%s", _mesa_opcode_string(inst->Opcode));
-      if (inst->SaturateMode == SATURATE_ZERO_ONE)
+      if (inst->Saturate)
          fprintf(f, "_SAT");
       fprintf(f, " ");
       fprint_dst_reg(f, &inst->DstReg, mode, prog);
@@ -864,7 +864,7 @@ _mesa_fprint_program_opt(FILE *f,
       else
          fprintf(f, "# Fragment Program/Shader %u\n", prog->Id);
       break;
-   case MESA_GEOMETRY_PROGRAM:
+   case GL_GEOMETRY_PROGRAM_NV:
       fprintf(f, "# Geometry Shader\n");
    }
 
diff --git a/src/mesa/program/prog_statevars.c b/src/mesa/program/prog_statevars.c
index 0c0c87faa28..bdb335e4ba3 100644
--- a/src/mesa/program/prog_statevars.c
+++ b/src/mesa/program/prog_statevars.c
@@ -244,14 +244,14 @@ _mesa_fetch_state(struct gl_context *ctx, const gl_state_index state[],
       {
          /* state[1] is the texture unit */
          const GLuint unit = (GLuint) state[1];
-         if (_mesa_get_clamp_fragment_color(ctx))
+         if (_mesa_get_clamp_fragment_color(ctx, ctx->DrawBuffer))
             COPY_4V(value, ctx->Texture.Unit[unit].EnvColor);
          else
             COPY_4V(value, ctx->Texture.Unit[unit].EnvColorUnclamped);
       }
       return;
    case STATE_FOG_COLOR:
-      if (_mesa_get_clamp_fragment_color(ctx))
+      if (_mesa_get_clamp_fragment_color(ctx, ctx->DrawBuffer))
          COPY_4V(value, ctx->Fog.Color);
       else
          COPY_4V(value, ctx->Fog.ColorUnclamped);
diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c
index 6c5fa51ec61..d54f934247d 100644
--- a/src/mesa/program/prog_to_nir.c
+++ b/src/mesa/program/prog_to_nir.c
@@ -47,6 +47,7 @@ struct ptn_compile {
    nir_builder build;
    bool error;
 
+   nir_variable *parameters;
    nir_variable *input_vars[VARYING_SLOT_MAX];
    nir_variable *output_vars[VARYING_SLOT_MAX];
    nir_register **output_regs;
@@ -112,21 +113,6 @@ ptn_get_dest(struct ptn_compile *c, const struct prog_dst_register *prog_dst)
    return dest;
 }
 
-/**
- * Multiply the contents of the ADDR register by 4 to convert from the number
- * of vec4s to the number of floating point components.
- */
-static nir_ssa_def *
-ptn_addr_reg_value(struct ptn_compile *c)
-{
-   nir_builder *b = &c->build;
-   nir_alu_src src;
-   memset(&src, 0, sizeof(src));
-   src.src = nir_src_for_reg(c->addr_reg);
-
-   return nir_imul(b, nir_fmov_alu(b, src, 1), nir_imm_int(b, 4));
-}
-
 static nir_ssa_def *
 ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src)
 {
@@ -180,27 +166,38 @@ ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src)
          }
          /* FALLTHROUGH */
       case PROGRAM_STATE_VAR: {
-         nir_intrinsic_op load_op =
-            prog_src->RelAddr ? nir_intrinsic_load_uniform_indirect :
-                                nir_intrinsic_load_uniform;
-         nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, load_op);
+         nir_intrinsic_instr *load =
+            nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_var);
          nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
          load->num_components = 4;
 
-         /* Multiply src->Index by 4 to scale from # of vec4s to components. */
-         load->const_index[0] = 4 * prog_src->Index;
-         load->const_index[1] = 1;
+         load->variables[0] = nir_deref_var_create(load, c->parameters);
+         nir_deref_array *deref_arr =
+            nir_deref_array_create(load->variables[0]);
+         deref_arr->deref.type = glsl_vec4_type();
+         load->variables[0]->deref.child = &deref_arr->deref;
 
          if (prog_src->RelAddr) {
-            nir_ssa_def *reladdr = ptn_addr_reg_value(c);
+            deref_arr->deref_array_type = nir_deref_array_type_indirect;
+
+            nir_alu_src addr_src = { NIR_SRC_INIT };
+            addr_src.src = nir_src_for_reg(c->addr_reg);
+            nir_ssa_def *reladdr = nir_imov_alu(b, addr_src, 1);
+
             if (prog_src->Index < 0) {
                /* This is a negative offset which should be added to the address
                 * register's value.
                 */
-               reladdr = nir_iadd(b, reladdr, nir_imm_int(b, load->const_index[0]));
-               load->const_index[0] = 0;
+               reladdr = nir_iadd(b, reladdr, nir_imm_int(b, prog_src->Index));
+
+               deref_arr->base_offset = 0;
+            } else {
+               deref_arr->base_offset = prog_src->Index;
             }
-            load->src[0] = nir_src_for_ssa(reladdr);
+            deref_arr->indirect = nir_src_for_ssa(reladdr);
+         } else {
+            deref_arr->deref_array_type = nir_deref_array_type_direct;
+            deref_arr->base_offset = prog_src->Index;
          }
 
          nir_instr_insert_after_cf_list(b->cf_node_list, &load->instr);
@@ -700,7 +697,7 @@ static const nir_op op_trans[MAX_OPCODE] = {
    [OPCODE_ADD] = nir_op_fadd,
    [OPCODE_ARL] = 0,
    [OPCODE_CMP] = 0,
-   [OPCODE_COS] = nir_op_fcos,
+   [OPCODE_COS] = 0,
    [OPCODE_DDX] = nir_op_fddx,
    [OPCODE_DDY] = nir_op_fddy,
    [OPCODE_DP2] = 0,
@@ -709,11 +706,11 @@ static const nir_op op_trans[MAX_OPCODE] = {
    [OPCODE_DPH] = 0,
    [OPCODE_DST] = 0,
    [OPCODE_END] = 0,
-   [OPCODE_EX2] = nir_op_fexp2,
+   [OPCODE_EX2] = 0,
    [OPCODE_EXP] = 0,
    [OPCODE_FLR] = nir_op_ffloor,
    [OPCODE_FRC] = nir_op_ffract,
-   [OPCODE_LG2] = nir_op_flog2,
+   [OPCODE_LG2] = 0,
    [OPCODE_LIT] = 0,
    [OPCODE_LOG] = 0,
    [OPCODE_LRP] = 0,
@@ -722,15 +719,15 @@ static const nir_op op_trans[MAX_OPCODE] = {
    [OPCODE_MIN] = nir_op_fmin,
    [OPCODE_MOV] = nir_op_fmov,
    [OPCODE_MUL] = nir_op_fmul,
-   [OPCODE_POW] = nir_op_fpow,
-   [OPCODE_RCP] = nir_op_frcp,
+   [OPCODE_POW] = 0,
+   [OPCODE_RCP] = 0,
 
-   [OPCODE_RSQ] = nir_op_frsq,
+   [OPCODE_RSQ] = 0,
    [OPCODE_SCS] = 0,
    [OPCODE_SEQ] = 0,
    [OPCODE_SGE] = 0,
    [OPCODE_SGT] = 0,
-   [OPCODE_SIN] = nir_op_fsin,
+   [OPCODE_SIN] = 0,
    [OPCODE_SLE] = 0,
    [OPCODE_SLT] = 0,
    [OPCODE_SNE] = 0,
@@ -767,7 +764,8 @@ ptn_emit_instruction(struct ptn_compile *c, struct prog_instruction *prog_inst)
 
    switch (op) {
    case OPCODE_RSQ:
-      ptn_move_dest(b, dest, nir_frsq(b, ptn_channel(b, src[0], X)));
+      ptn_move_dest(b, dest,
+                    nir_frsq(b, nir_fabs(b, ptn_channel(b, src[0], X))));
       break;
 
    case OPCODE_RCP:
@@ -894,7 +892,7 @@ ptn_emit_instruction(struct ptn_compile *c, struct prog_instruction *prog_inst)
       break;
 
    default:
-      if (op_trans[op] != 0 || op == OPCODE_MOV) {
+      if (op_trans[op] != 0) {
          ptn_alu(b, op_trans[op], dest, src);
       } else {
          fprintf(stderr, "unknown opcode: %s\n", _mesa_opcode_string(op));
@@ -903,8 +901,8 @@ ptn_emit_instruction(struct ptn_compile *c, struct prog_instruction *prog_inst)
       break;
    }
 
-   if (prog_inst->SaturateMode) {
-      assert(prog_inst->SaturateMode == SATURATE_ZERO_ONE);
+   if (prog_inst->Saturate) {
+      assert(prog_inst->Saturate);
       assert(!dest.dest.is_ssa);
       ptn_move_dest(b, dest, nir_fsat(b, ptn_src_for_dest(c, &dest)));
    }
@@ -926,10 +924,23 @@ ptn_add_output_stores(struct ptn_compile *c)
    foreach_list_typed(nir_variable, var, node, &b->shader->outputs) {
       nir_intrinsic_instr *store =
          nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_var);
-      store->num_components = 4;
+      store->num_components = glsl_get_vector_elements(var->type);
       store->variables[0] =
          nir_deref_var_create(store, c->output_vars[var->data.location]);
-      store->src[0].reg.reg = c->output_regs[var->data.location];
+
+      if (c->prog->Target == GL_FRAGMENT_PROGRAM_ARB &&
+          var->data.location == FRAG_RESULT_DEPTH) {
+         /* result.depth has this strange convention of being the .z component of
+          * a vec4 with undefined .xyw components.  We resolve it to a scalar, to
+          * match GLSL's gl_FragDepth and the expectations of most backends.
+          */
+         nir_alu_src alu_src = { NIR_SRC_INIT };
+         alu_src.src = nir_src_for_reg(c->output_regs[FRAG_RESULT_DEPTH]);
+         alu_src.swizzle[0] = SWIZZLE_Z;
+         store->src[0] = nir_src_for_ssa(nir_fmov_alu(b, alu_src, 1));
+      } else {
+         store->src[0].reg.reg = c->output_regs[var->data.location];
+      }
       nir_instr_insert_after_cf_list(c->build.cf_node_list, &store->instr);
    }
 }
@@ -1022,7 +1033,10 @@ setup_registers_and_variables(struct ptn_compile *c)
       reg->num_components = 4;
 
       nir_variable *var = rzalloc(shader, nir_variable);
-      var->type = glsl_vec4_type();
+      if (c->prog->Target == GL_FRAGMENT_PROGRAM_ARB && i == FRAG_RESULT_DEPTH)
+         var->type = glsl_float_type();
+      else
+         var->type = glsl_vec4_type();
       var->data.mode = nir_var_shader_out;
       var->name = ralloc_asprintf(var, "out_%d", i);
 
@@ -1057,13 +1071,11 @@ setup_registers_and_variables(struct ptn_compile *c)
    }
    reg->num_components = 1;
    c->addr_reg = reg;
-
-   /* Set the number of uniforms */
-   shader->num_uniforms = 4 * c->prog->Parameters->NumParameters;
 }
 
 struct nir_shader *
-prog_to_nir(const struct gl_program *prog, const nir_shader_compiler_options *options)
+prog_to_nir(const struct gl_program *prog,
+            const nir_shader_compiler_options *options)
 {
    struct ptn_compile *c;
    struct nir_shader *s;
@@ -1076,6 +1088,14 @@ prog_to_nir(const struct gl_program *prog, const nir_shader_compiler_options *op
       goto fail;
    c->prog = prog;
 
+   c->parameters = rzalloc(s, nir_variable);
+   c->parameters->type = glsl_array_type(glsl_vec4_type(),
+                                            prog->Parameters->NumParameters);
+   c->parameters->name = "parameters";
+   c->parameters->data.read_only = true;
+   c->parameters->data.mode = nir_var_uniform;
+   exec_list_push_tail(&s->uniforms, &c->parameters->node);
+
    nir_function *func = nir_function_create(s, "main");
    nir_function_overload *overload = nir_function_overload_create(func);
    nir_function_impl *impl = nir_function_impl_create(overload);
diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c
index fb61f4d360d..c13e61b1630 100644
--- a/src/mesa/program/program.c
+++ b/src/mesa/program/program.c
@@ -97,13 +97,6 @@ _mesa_init_program(struct gl_context *ctx)
    assert(ctx->FragmentProgram.Current);
    ctx->FragmentProgram.Cache = _mesa_new_program_cache();
 
-   ctx->GeometryProgram.Enabled = GL_FALSE;
-   /* right now by default we don't have a geometry program */
-   _mesa_reference_geomprog(ctx, &ctx->GeometryProgram.Current,
-                            NULL);
-
-   _mesa_reference_compprog(ctx, &ctx->ComputeProgram.Current, NULL);
-
    /* XXX probably move this stuff */
    ctx->ATIFragmentShader.Enabled = GL_FALSE;
    ctx->ATIFragmentShader.Current = ctx->Shared->DefaultFragmentShader;
@@ -122,8 +115,6 @@ _mesa_free_program_data(struct gl_context *ctx)
    _mesa_delete_program_cache(ctx, ctx->VertexProgram.Cache);
    _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current, NULL);
    _mesa_delete_shader_cache(ctx, ctx->FragmentProgram.Cache);
-   _mesa_reference_geomprog(ctx, &ctx->GeometryProgram.Current, NULL);
-   _mesa_reference_compprog(ctx, &ctx->ComputeProgram.Current, NULL);
 
    /* XXX probably move this stuff */
    if (ctx->ATIFragmentShader.Current) {
@@ -153,9 +144,6 @@ _mesa_update_default_objects_program(struct gl_context *ctx)
                             ctx->Shared->DefaultFragmentProgram);
    assert(ctx->FragmentProgram.Current);
 
-   _mesa_reference_geomprog(ctx, &ctx->GeometryProgram.Current,
-                      ctx->Shared->DefaultGeometryProgram);
-
    /* XXX probably move this stuff */
    if (ctx->ATIFragmentShader.Current) {
       ctx->ATIFragmentShader.Current->RefCount--;
@@ -340,7 +328,7 @@ _mesa_new_program(struct gl_context *ctx, GLenum target, GLuint id)
                                          CALLOC_STRUCT(gl_fragment_program),
                                          target, id );
       break;
-   case MESA_GEOMETRY_PROGRAM:
+   case GL_GEOMETRY_PROGRAM_NV:
       prog = _mesa_init_geometry_program(ctx,
                                          CALLOC_STRUCT(gl_geometry_program),
                                          target, id);
@@ -426,8 +414,8 @@ _mesa_reference_program_(struct gl_context *ctx,
       else if ((*ptr)->Target == GL_FRAGMENT_PROGRAM_ARB)
          assert(prog->Target == GL_FRAGMENT_PROGRAM_ARB ||
                 prog->Target == GL_FRAGMENT_PROGRAM_NV);
-      else if ((*ptr)->Target == MESA_GEOMETRY_PROGRAM)
-         assert(prog->Target == MESA_GEOMETRY_PROGRAM);
+      else if ((*ptr)->Target == GL_GEOMETRY_PROGRAM_NV)
+         assert(prog->Target == GL_GEOMETRY_PROGRAM_NV);
    }
 #endif
 
@@ -439,7 +427,7 @@ _mesa_reference_program_(struct gl_context *ctx,
       printf("Program %p ID=%u Target=%s  Refcount-- to %d\n",
              *ptr, (*ptr)->Id,
              ((*ptr)->Target == GL_VERTEX_PROGRAM_ARB ? "VP" :
-              ((*ptr)->Target == MESA_GEOMETRY_PROGRAM ? "GP" : "FP")),
+              ((*ptr)->Target == GL_GEOMETRY_PROGRAM_NV ? "GP" : "FP")),
              (*ptr)->RefCount - 1);
 #endif
       assert((*ptr)->RefCount > 0);
@@ -464,7 +452,7 @@ _mesa_reference_program_(struct gl_context *ctx,
       printf("Program %p ID=%u Target=%s  Refcount++ to %d\n",
              prog, prog->Id,
              (prog->Target == GL_VERTEX_PROGRAM_ARB ? "VP" :
-              (prog->Target == MESA_GEOMETRY_PROGRAM ? "GP" : "FP")),
+              (prog->Target == GL_GEOMETRY_PROGRAM_NV ? "GP" : "FP")),
              prog->RefCount);
 #endif
       /*mtx_unlock(&prog->Mutex);*/
@@ -554,7 +542,7 @@ _mesa_clone_program(struct gl_context *ctx, const struct gl_program *prog)
          fpc->PixelCenterInteger = fp->PixelCenterInteger;
       }
       break;
-   case MESA_GEOMETRY_PROGRAM:
+   case GL_GEOMETRY_PROGRAM_NV:
       {
          const struct gl_geometry_program *gp = gl_geometry_program_const(prog);
          struct gl_geometry_program *gpc = gl_geometry_program(clone);
diff --git a/src/mesa/program/program_parse.y b/src/mesa/program/program_parse.y
index 716b83d2d07..635f5d09d60 100644
--- a/src/mesa/program/program_parse.y
+++ b/src/mesa/program/program_parse.y
@@ -84,7 +84,7 @@ static void asm_instruction_set_operands(struct asm_instruction *inst,
     const struct prog_dst_register *dst, const struct asm_src_register *src0,
     const struct asm_src_register *src1, const struct asm_src_register *src2);
 
-static struct asm_instruction *asm_instruction_ctor(gl_inst_opcode op,
+static struct asm_instruction *asm_instruction_ctor(enum prog_opcode op,
     const struct prog_dst_register *dst, const struct asm_src_register *src0,
     const struct asm_src_register *src1, const struct asm_src_register *src2);
 
@@ -139,7 +139,7 @@ static struct asm_instruction *asm_instruction_copy_ctor(
    gl_state_index state[STATE_LENGTH];
    int negate;
    struct asm_vector vector;
-   gl_inst_opcode opcode;
+   enum prog_opcode opcode;
 
    struct {
       unsigned swz;
@@ -2275,7 +2275,7 @@ asm_instruction_set_operands(struct asm_instruction *inst,
 
 
 struct asm_instruction *
-asm_instruction_ctor(gl_inst_opcode op,
+asm_instruction_ctor(enum prog_opcode op,
 		     const struct prog_dst_register *dst,
 		     const struct asm_src_register *src0,
 		     const struct asm_src_register *src1,
@@ -2308,7 +2308,7 @@ asm_instruction_copy_ctor(const struct prog_instruction *base,
       inst->Base.Opcode = base->Opcode;
       inst->Base.CondUpdate = base->CondUpdate;
       inst->Base.CondDst = base->CondDst;
-      inst->Base.SaturateMode = base->SaturateMode;
+      inst->Base.Saturate = base->Saturate;
       inst->Base.Precision = base->Precision;
 
       asm_instruction_set_operands(inst, dst, src0, src1, src2);
diff --git a/src/mesa/program/program_parse_extra.c b/src/mesa/program/program_parse_extra.c
index a9e36404580..32b54afc57b 100644
--- a/src/mesa/program/program_parse_extra.c
+++ b/src/mesa/program/program_parse_extra.c
@@ -40,7 +40,7 @@ _mesa_parse_instruction_suffix(const struct asm_parser_state *state,
 {
    inst->CondUpdate = 0;
    inst->CondDst = 0;
-   inst->SaturateMode = SATURATE_OFF;
+   inst->Saturate = GL_FALSE;
    inst->Precision = FLOAT32;
 
 
@@ -82,7 +82,7 @@ _mesa_parse_instruction_suffix(const struct asm_parser_state *state,
     */
    if (state->mode == ARB_fragment) {
       if (strcmp(suffix, "_SAT") == 0) {
-	 inst->SaturateMode = SATURATE_ZERO_ONE;
+	 inst->Saturate = GL_TRUE;
 	 suffix += 4;
       }
    }
diff --git a/src/mesa/program/programopt.c b/src/mesa/program/programopt.c
index e82c68a5305..af78150d594 100644
--- a/src/mesa/program/programopt.c
+++ b/src/mesa/program/programopt.c
@@ -305,7 +305,7 @@ _mesa_append_fog_code(struct gl_context *ctx,
          /* change the instruction to write to colorTemp w/ clamping */
          inst->DstReg.File = PROGRAM_TEMPORARY;
          inst->DstReg.Index = colorTemp;
-         inst->SaturateMode = saturate;
+         inst->Saturate = saturate;
          /* don't break (may be several writes to result.color) */
       }
       inst++;
@@ -331,7 +331,7 @@ _mesa_append_fog_code(struct gl_context *ctx,
       inst->SrcReg[2].File = PROGRAM_STATE_VAR;
       inst->SrcReg[2].Index = fogPRefOpt;
       inst->SrcReg[2].Swizzle = SWIZZLE_YYYY;
-      inst->SaturateMode = SATURATE_ZERO_ONE;
+      inst->Saturate = GL_TRUE;
       inst++;
    }
    else {
@@ -374,7 +374,7 @@ _mesa_append_fog_code(struct gl_context *ctx,
       inst->SrcReg[0].Index = fogFactorTemp;
       inst->SrcReg[0].Negate = NEGATE_XYZW;
       inst->SrcReg[0].Swizzle = SWIZZLE_XXXX;
-      inst->SaturateMode = SATURATE_ZERO_ONE;
+      inst->Saturate = GL_TRUE;
       inst++;
    }
    /* LRP result.color.xyz, fogFactorTemp.xxxx, colorTemp, fogColorRef; */
diff --git a/src/mesa/state_tracker/st_atom_framebuffer.c b/src/mesa/state_tracker/st_atom_framebuffer.c
index b195c55b347..ae883a2535e 100644
--- a/src/mesa/state_tracker/st_atom_framebuffer.c
+++ b/src/mesa/state_tracker/st_atom_framebuffer.c
@@ -134,7 +134,10 @@ update_framebuffer_state( struct st_context *st )
    else {
       strb = st_renderbuffer(fb->Attachment[BUFFER_STENCIL].Renderbuffer);
       if (strb) {
-         assert(strb->surface);
+         if (strb->is_rtt) {
+            /* rendering to a GL texture, may have to update surface */
+            st_update_renderbuffer_surface(st, strb);
+         }
          pipe_surface_reference(&framebuffer->zsbuf, strb->surface);
          update_framebuffer_size(framebuffer, strb->surface);
       }
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index 629f54f25de..ad8d2624fc9 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -189,7 +189,7 @@ update_gp( struct st_context *st )
    }
 
    stgp = st_geometry_program(st->ctx->GeometryProgram._Current);
-   assert(stgp->Base.Base.Target == MESA_GEOMETRY_PROGRAM);
+   assert(stgp->Base.Base.Target == GL_GEOMETRY_PROGRAM_NV);
 
    memset(&key, 0, sizeof(key));
    key.st = st;
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index 2107ab16739..c881e194f70 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -452,6 +452,8 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    cso_save_fragment_shader(cso);
    cso_save_stream_outputs(cso);
    cso_save_vertex_shader(cso);
+   cso_save_tessctrl_shader(cso);
+   cso_save_tesseval_shader(cso);
    cso_save_geometry_shader(cso);
    cso_save_vertex_elements(cso);
    cso_save_aux_vertex_buffer_slot(cso);
@@ -466,7 +468,9 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    /* vertex shader state: position + texcoord pass-through */
    cso_set_vertex_shader_handle(cso, st->bitmap.vs);
 
-   /* geometry shader state: disabled */
+   /* disable other shaders */
+   cso_set_tessctrl_shader_handle(cso, NULL);
+   cso_set_tesseval_shader_handle(cso, NULL);
    cso_set_geometry_shader_handle(cso, NULL);
 
    /* user samplers, plus our bitmap sampler */
@@ -536,6 +540,8 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    cso_restore_viewport(cso);
    cso_restore_fragment_shader(cso);
    cso_restore_vertex_shader(cso);
+   cso_restore_tessctrl_shader(cso);
+   cso_restore_tesseval_shader(cso);
    cso_restore_geometry_shader(cso);
    cso_restore_vertex_elements(cso);
    cso_restore_aux_vertex_buffer_slot(cso);
diff --git a/src/mesa/state_tracker/st_cb_blit.c b/src/mesa/state_tracker/st_cb_blit.c
index bbaedd108f6..6d9371852c5 100644
--- a/src/mesa/state_tracker/st_cb_blit.c
+++ b/src/mesa/state_tracker/st_cb_blit.c
@@ -36,6 +36,7 @@
 
 #include "st_context.h"
 #include "st_texture.h"
+#include "st_cb_bitmap.h"
 #include "st_cb_blit.h"
 #include "st_cb_fbo.h"
 #include "st_atom.h"
@@ -93,6 +94,9 @@ st_BlitFramebuffer(struct gl_context *ctx,
 
    st_validate_state(st);
 
+   /* Make sure bitmap rendering has landed in the framebuffers */
+   st_flush_bitmap_cache(st);
+
    clip.srcX0 = srcX0;
    clip.srcY0 = srcY0;
    clip.srcX1 = srcX1;
diff --git a/src/mesa/state_tracker/st_cb_clear.c b/src/mesa/state_tracker/st_cb_clear.c
index f10e9063ac7..137fac8a9a9 100644
--- a/src/mesa/state_tracker/st_cb_clear.c
+++ b/src/mesa/state_tracker/st_cb_clear.c
@@ -265,6 +265,8 @@ clear_with_quad(struct gl_context *ctx, unsigned clear_buffers)
    cso_save_fragment_shader(st->cso_context);
    cso_save_stream_outputs(st->cso_context);
    cso_save_vertex_shader(st->cso_context);
+   cso_save_tessctrl_shader(st->cso_context);
+   cso_save_tesseval_shader(st->cso_context);
    cso_save_geometry_shader(st->cso_context);
    cso_save_vertex_elements(st->cso_context);
    cso_save_aux_vertex_buffer_slot(st->cso_context);
@@ -347,6 +349,8 @@ clear_with_quad(struct gl_context *ctx, unsigned clear_buffers)
    }
 
    set_fragment_shader(st);
+   cso_set_tessctrl_shader_handle(st->cso_context, NULL);
+   cso_set_tesseval_shader_handle(st->cso_context, NULL);
 
    if (num_layers > 1)
       set_vertex_shader_layered(st);
@@ -371,6 +375,8 @@ clear_with_quad(struct gl_context *ctx, unsigned clear_buffers)
    cso_restore_viewport(st->cso_context);
    cso_restore_fragment_shader(st->cso_context);
    cso_restore_vertex_shader(st->cso_context);
+   cso_restore_tessctrl_shader(st->cso_context);
+   cso_restore_tesseval_shader(st->cso_context);
    cso_restore_geometry_shader(st->cso_context);
    cso_restore_vertex_elements(st->cso_context);
    cso_restore_aux_vertex_buffer_slot(st->cso_context);
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 3edf31bad52..a6a98c83aa6 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -693,6 +693,8 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    cso_save_fragment_shader(cso);
    cso_save_stream_outputs(cso);
    cso_save_vertex_shader(cso);
+   cso_save_tessctrl_shader(cso);
+   cso_save_tesseval_shader(cso);
    cso_save_geometry_shader(cso);
    cso_save_vertex_elements(cso);
    cso_save_aux_vertex_buffer_slot(cso);
@@ -746,7 +748,9 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    /* vertex shader state: position + texcoord pass-through */
    cso_set_vertex_shader_handle(cso, driver_vp);
 
-   /* geometry shader state: disabled */
+   /* disable other shaders */
+   cso_set_tessctrl_shader_handle(cso, NULL);
+   cso_set_tesseval_shader_handle(cso, NULL);
    cso_set_geometry_shader_handle(cso, NULL);
 
    /* texture sampling state: */
@@ -816,6 +820,8 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    cso_restore_sampler_views(cso, PIPE_SHADER_FRAGMENT);
    cso_restore_fragment_shader(cso);
    cso_restore_vertex_shader(cso);
+   cso_restore_tessctrl_shader(cso);
+   cso_restore_tesseval_shader(cso);
    cso_restore_geometry_shader(cso);
    cso_restore_vertex_elements(cso);
    cso_restore_aux_vertex_buffer_slot(cso);
diff --git a/src/mesa/state_tracker/st_cb_drawtex.c b/src/mesa/state_tracker/st_cb_drawtex.c
index 1420b96e55a..2af4f6d4cf6 100644
--- a/src/mesa/state_tracker/st_cb_drawtex.c
+++ b/src/mesa/state_tracker/st_cb_drawtex.c
@@ -229,6 +229,8 @@ st_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z,
    cso_save_viewport(cso);
    cso_save_stream_outputs(cso);
    cso_save_vertex_shader(cso);
+   cso_save_tessctrl_shader(cso);
+   cso_save_tesseval_shader(cso);
    cso_save_geometry_shader(cso);
    cso_save_vertex_elements(cso);
    cso_save_aux_vertex_buffer_slot(cso);
@@ -238,6 +240,8 @@ st_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z,
                                semantic_names, semantic_indexes);
       cso_set_vertex_shader_handle(cso, vs);
    }
+   cso_set_tessctrl_shader_handle(cso, NULL);
+   cso_set_tesseval_shader_handle(cso, NULL);
    cso_set_geometry_shader_handle(cso, NULL);
 
    for (i = 0; i < numAttribs; i++) {
@@ -279,6 +283,8 @@ st_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z,
    /* restore state */
    cso_restore_viewport(cso);
    cso_restore_vertex_shader(cso);
+   cso_restore_tessctrl_shader(cso);
+   cso_restore_tesseval_shader(cso);
    cso_restore_geometry_shader(cso);
    cso_restore_vertex_elements(cso);
    cso_restore_aux_vertex_buffer_slot(cso);
diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c
index 296ea1e0d29..0399eef7204 100644
--- a/src/mesa/state_tracker/st_cb_fbo.c
+++ b/src/mesa/state_tracker/st_cb_fbo.c
@@ -842,7 +842,7 @@ void st_init_fbo_functions(struct dd_function_table *functions)
    functions->NewFramebuffer = st_new_framebuffer;
    functions->NewRenderbuffer = st_new_renderbuffer;
    functions->BindFramebuffer = st_bind_framebuffer;
-   functions->FramebufferRenderbuffer = _mesa_framebuffer_renderbuffer;
+   functions->FramebufferRenderbuffer = _mesa_FramebufferRenderbuffer_sw;
    functions->RenderTexture = st_render_texture;
    functions->FinishRenderTexture = st_finish_render_texture;
    functions->ValidateFramebuffer = st_validate_framebuffer;
diff --git a/src/mesa/state_tracker/st_cb_flush.c b/src/mesa/state_tracker/st_cb_flush.c
index ca51eeee366..82affd2de3e 100644
--- a/src/mesa/state_tracker/st_cb_flush.c
+++ b/src/mesa/state_tracker/st_cb_flush.c
@@ -141,11 +141,44 @@ static void st_glFinish(struct gl_context *ctx)
 }
 
 
-void st_init_flush_functions(struct dd_function_table *functions)
+/**
+ * Query information about GPU resets observed by this context
+ *
+ * Called via \c dd_function_table::GetGraphicsResetStatus.
+ */
+static GLenum
+st_get_graphics_reset_status(struct gl_context *ctx)
+{
+   struct st_context *st = st_context(ctx);
+   enum pipe_reset_status status;
+
+   status = st->pipe->get_device_reset_status(st->pipe);
+
+   switch (status) {
+   case PIPE_NO_RESET:
+      return GL_NO_ERROR;
+   case PIPE_GUILTY_CONTEXT_RESET:
+      return GL_GUILTY_CONTEXT_RESET_ARB;
+   case PIPE_INNOCENT_CONTEXT_RESET:
+      return GL_INNOCENT_CONTEXT_RESET_ARB;
+   case PIPE_UNKNOWN_CONTEXT_RESET:
+      return GL_UNKNOWN_CONTEXT_RESET_ARB;
+   default:
+      assert(0);
+      return GL_NO_ERROR;
+   }
+}
+
+
+void st_init_flush_functions(struct pipe_screen *screen,
+                             struct dd_function_table *functions)
 {
    functions->Flush = st_glFlush;
    functions->Finish = st_glFinish;
 
+   if (screen->get_param(screen, PIPE_CAP_DEVICE_RESET_STATUS_QUERY))
+      functions->GetGraphicsResetStatus = st_get_graphics_reset_status;
+
    /* Windows opengl32.dll calls glFinish prior to every swapbuffers.
     * This is unnecessary and degrades performance.  Luckily we have some
     * scope to work around this, as the externally-visible behaviour of
diff --git a/src/mesa/state_tracker/st_cb_flush.h b/src/mesa/state_tracker/st_cb_flush.h
index 84ffc63ae13..f92dcd56b64 100644
--- a/src/mesa/state_tracker/st_cb_flush.h
+++ b/src/mesa/state_tracker/st_cb_flush.h
@@ -37,7 +37,8 @@ struct pipe_fence_handle;
 struct st_context;
 
 extern void
-st_init_flush_functions(struct dd_function_table *functions);
+st_init_flush_functions(struct pipe_screen *screen,
+                        struct dd_function_table *functions);
 
 extern void
 st_flush(struct st_context *st,
diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c
index c382d7d2ca3..6aa7d5796d9 100644
--- a/src/mesa/state_tracker/st_cb_program.c
+++ b/src/mesa/state_tracker/st_cb_program.c
@@ -65,7 +65,7 @@ st_bind_program(struct gl_context *ctx, GLenum target, struct gl_program *prog)
    case GL_FRAGMENT_PROGRAM_ARB:
       st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM;
       break;
-   case MESA_GEOMETRY_PROGRAM:
+   case GL_GEOMETRY_PROGRAM_NV:
       st->dirty.st |= ST_NEW_GEOMETRY_PROGRAM;
       break;
    }
@@ -105,7 +105,7 @@ st_new_program(struct gl_context *ctx, GLenum target, GLuint id)
       return _mesa_init_fragment_program(ctx, &prog->Base, target, id);
    }
 
-   case MESA_GEOMETRY_PROGRAM: {
+   case GL_GEOMETRY_PROGRAM_NV: {
       struct st_geometry_program *prog = ST_CALLOC_STRUCT(st_geometry_program);
       return _mesa_init_geometry_program(ctx, &prog->Base, target, id);
    }
@@ -135,7 +135,7 @@ st_delete_program(struct gl_context *ctx, struct gl_program *prog)
             free_glsl_to_tgsi_visitor(stvp->glsl_to_tgsi);
       }
       break;
-   case MESA_GEOMETRY_PROGRAM:
+   case GL_GEOMETRY_PROGRAM_NV:
       {
          struct st_geometry_program *stgp =
             (struct st_geometry_program *) prog;
@@ -198,7 +198,7 @@ st_program_string_notify( struct gl_context *ctx,
       if (st->fp == stfp)
 	 st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM;
    }
-   else if (target == MESA_GEOMETRY_PROGRAM) {
+   else if (target == GL_GEOMETRY_PROGRAM_NV) {
       struct st_geometry_program *stgp = (struct st_geometry_program *) prog;
 
       st_release_gp_variants(st, stgp);
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index bfb9c8406bd..ed9ed0f1b6c 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -321,7 +321,7 @@ struct st_context *st_create_context(gl_api api, struct pipe_context *pipe,
    struct st_context *st;
 
    memset(&funcs, 0, sizeof(funcs));
-   st_init_driver_functions(&funcs);
+   st_init_driver_functions(pipe->screen, &funcs);
 
    ctx = _mesa_create_context(api, visual, shareCtx, &funcs);
    if (!ctx) {
@@ -376,12 +376,6 @@ void st_destroy_context( struct st_context *st )
    }
    pipe_surface_reference(&st->state.framebuffer.zsbuf, NULL);
 
-   pipe->set_index_buffer(pipe, NULL);
-
-   for (i = 0; i < PIPE_SHADER_TYPES; i++) {
-      pipe->set_constant_buffer(pipe, i, 0, NULL);
-   }
-
    _mesa_delete_program_cache(st->ctx, st->pixel_xfer.cache);
 
    _vbo_DestroyContext(st->ctx);
@@ -401,7 +395,8 @@ void st_destroy_context( struct st_context *st )
 }
 
 
-void st_init_driver_functions(struct dd_function_table *functions)
+void st_init_driver_functions(struct pipe_screen *screen,
+                              struct dd_function_table *functions)
 {
    _mesa_init_shader_object_functions(functions);
    _mesa_init_sampler_object_functions(functions);
@@ -429,7 +424,7 @@ void st_init_driver_functions(struct dd_function_table *functions)
    st_init_readpixels_functions(functions);
    st_init_texture_functions(functions);
    st_init_texture_barrier_functions(functions);
-   st_init_flush_functions(functions);
+   st_init_flush_functions(screen, functions);
    st_init_string_functions(functions);
    st_init_viewport_functions(functions);
 
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 8a9504bb7c1..dac5a4b9006 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -237,7 +237,8 @@ struct st_framebuffer
 };
 
 
-extern void st_init_driver_functions(struct dd_function_table *functions);
+extern void st_init_driver_functions(struct pipe_screen *screen,
+                                     struct dd_function_table *functions);
 
 void st_invalidate_state(struct gl_context * ctx, GLuint new_state);
 
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index 488f6ead201..8b43582c14b 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -141,7 +141,7 @@ check_uniforms(struct gl_context *ctx)
       if (shProg[j] == NULL || !shProg[j]->LinkStatus)
 	 continue;
 
-      for (i = 0; i < shProg[j]->NumUserUniformStorage; i++) {
+      for (i = 0; i < shProg[j]->NumUniformStorage; i++) {
          const struct gl_uniform_storage *u = &shProg[j]->UniformStorage[i];
          if (!u->initialized) {
             _mesa_warning(ctx,
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 1fea8600a75..25e30c7deb2 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -57,11 +57,6 @@
                            (1 << PROGRAM_CONSTANT) |     \
                            (1 << PROGRAM_UNIFORM))
 
-/**
- * Maximum number of arrays
- */
-#define MAX_ARRAYS        256
-
 #define MAX_GLSL_TEXTURE_OFFSET 4
 
 class st_src_reg;
@@ -89,6 +84,7 @@ public:
       this->reladdr2 = NULL;
       this->has_index2 = false;
       this->double_reg2 = false;
+      this->array_id = 0;
    }
 
    st_src_reg(gl_register_file file, int index, int type)
@@ -103,6 +99,7 @@ public:
       this->reladdr2 = NULL;
       this->has_index2 = false;
       this->double_reg2 = false;
+      this->array_id = 0;
    }
 
    st_src_reg(gl_register_file file, int index, int type, int index2D)
@@ -117,6 +114,7 @@ public:
       this->reladdr2 = NULL;
       this->has_index2 = false;
       this->double_reg2 = false;
+      this->array_id = 0;
    }
 
    st_src_reg()
@@ -131,6 +129,7 @@ public:
       this->reladdr2 = NULL;
       this->has_index2 = false;
       this->double_reg2 = false;
+      this->array_id = 0;
    }
 
    explicit st_src_reg(st_dst_reg reg);
@@ -150,6 +149,7 @@ public:
     * currently used for input mapping only.
     */
    bool double_reg2;
+   unsigned array_id;
 };
 
 class st_dst_reg {
@@ -162,6 +162,7 @@ public:
       this->cond_mask = COND_TR;
       this->reladdr = NULL;
       this->type = type;
+      this->array_id = 0;
    }
 
    st_dst_reg(gl_register_file file, int writemask, int type)
@@ -172,6 +173,7 @@ public:
       this->cond_mask = COND_TR;
       this->reladdr = NULL;
       this->type = type;
+      this->array_id = 0;
    }
 
    st_dst_reg()
@@ -182,6 +184,7 @@ public:
       this->writemask = 0;
       this->cond_mask = COND_TR;
       this->reladdr = NULL;
+      this->array_id = 0;
    }
 
    explicit st_dst_reg(st_src_reg reg);
@@ -193,6 +196,7 @@ public:
    int type; /** GLSL_TYPE_* from GLSL IR (enum glsl_base_type) */
    /** Register index should be offset by the integer in this reg. */
    st_src_reg *reladdr;
+   unsigned array_id;
 };
 
 st_src_reg::st_src_reg(st_dst_reg reg)
@@ -207,6 +211,7 @@ st_src_reg::st_src_reg(st_dst_reg reg)
    this->reladdr2 = NULL;
    this->has_index2 = false;
    this->double_reg2 = false;
+   this->array_id = reg.array_id;
 }
 
 st_dst_reg::st_dst_reg(st_src_reg reg)
@@ -217,6 +222,7 @@ st_dst_reg::st_dst_reg(st_src_reg reg)
    this->writemask = WRITEMASK_XYZW;
    this->cond_mask = COND_TR;
    this->reladdr = reg.reladdr;
+   this->array_id = reg.array_id;
 }
 
 class glsl_to_tgsi_instruction : public exec_node {
@@ -233,6 +239,7 @@ public:
    st_src_reg sampler; /**< sampler register */
    int sampler_array_size; /**< 1-based size of sampler array, 1 if not array */
    int tex_target; /**< One of TEXTURE_*_INDEX */
+   glsl_base_type tex_type;
    GLboolean tex_shadow;
 
    st_src_reg tex_offsets[MAX_GLSL_TEXTURE_OFFSET];
@@ -244,8 +251,9 @@ public:
 
 class variable_storage : public exec_node {
 public:
-   variable_storage(ir_variable *var, gl_register_file file, int index)
-      : file(file), index(index), var(var)
+   variable_storage(ir_variable *var, gl_register_file file, int index,
+                    unsigned array_id = 0)
+      : file(file), index(index), var(var), array_id(array_id)
    {
       /* empty */
    }
@@ -253,6 +261,7 @@ public:
    gl_register_file file;
    int index;
    ir_variable *var; /* variable that maps to this, if any */
+   unsigned array_id;
 };
 
 class immediate_storage : public exec_node {
@@ -302,6 +311,15 @@ public:
    st_src_reg return_reg;
 };
 
+static st_src_reg undef_src = st_src_reg(PROGRAM_UNDEFINED, 0, GLSL_TYPE_ERROR);
+static st_dst_reg undef_dst = st_dst_reg(PROGRAM_UNDEFINED, SWIZZLE_NOOP, GLSL_TYPE_ERROR);
+
+struct array_decl {
+   unsigned mesa_index;
+   unsigned array_id;
+   unsigned array_size;
+};
+
 struct glsl_to_tgsi_visitor : public ir_visitor {
 public:
    glsl_to_tgsi_visitor();
@@ -317,11 +335,19 @@ public:
 
    int next_temp;
 
-   unsigned array_sizes[MAX_ARRAYS];
+   unsigned *array_sizes;
+   unsigned max_num_arrays;
    unsigned next_array;
 
+   struct array_decl input_arrays[PIPE_MAX_SHADER_INPUTS];
+   unsigned num_input_arrays;
+   struct array_decl output_arrays[PIPE_MAX_SHADER_OUTPUTS];
+   unsigned num_output_arrays;
+
    int num_address_regs;
    int samplers_used;
+   glsl_base_type sampler_types[PIPE_MAX_SAMPLERS];
+   int sampler_targets[PIPE_MAX_SAMPLERS];   /**< One of TGSI_TEXTURE_* */
    bool indirect_addr_consts;
    int wpos_transform_const;
 
@@ -372,6 +398,7 @@ public:
    virtual void visit(ir_if *);
    virtual void visit(ir_emit_vertex *);
    virtual void visit(ir_end_primitive *);
+   virtual void visit(ir_barrier *);
    /*@}*/
 
    st_src_reg result;
@@ -390,31 +417,19 @@ public:
    /** List of glsl_to_tgsi_instruction */
    exec_list instructions;
 
-   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op);
-
-   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
-                                  st_dst_reg dst, st_src_reg src0);
-
-   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
-                                  st_dst_reg dst, st_dst_reg dst1,
-                                  st_src_reg src0);
+   glsl_to_tgsi_instruction *emit_asm(ir_instruction *ir, unsigned op,
+                                      st_dst_reg dst = undef_dst,
+                                      st_src_reg src0 = undef_src,
+                                      st_src_reg src1 = undef_src,
+                                      st_src_reg src2 = undef_src,
+                                      st_src_reg src3 = undef_src);
 
-   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
-                                  st_dst_reg dst, st_src_reg src0, st_src_reg src1);
-
-   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
-                                  st_dst_reg dst,
-                                  st_src_reg src0, st_src_reg src1, st_src_reg src2);
-
-   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
-                                  st_dst_reg dst,
-                                  st_src_reg src0, st_src_reg src1,
-                                  st_src_reg src2, st_src_reg src3);
-
-   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
-                                  st_dst_reg dst, st_dst_reg dst1,
-                                  st_src_reg src0, st_src_reg src1,
-                                  st_src_reg src2, st_src_reg src3);
+   glsl_to_tgsi_instruction *emit_asm(ir_instruction *ir, unsigned op,
+                                      st_dst_reg dst, st_dst_reg dst1,
+                                      st_src_reg src0 = undef_src,
+                                      st_src_reg src1 = undef_src,
+                                      st_src_reg src2 = undef_src,
+                                      st_src_reg src3 = undef_src);
 
    unsigned get_opcode(ir_instruction *ir, unsigned op,
                     st_dst_reg dst,
@@ -468,10 +483,6 @@ public:
    void *mem_ctx;
 };
 
-static st_src_reg undef_src = st_src_reg(PROGRAM_UNDEFINED, 0, GLSL_TYPE_ERROR);
-
-static st_dst_reg undef_dst = st_dst_reg(PROGRAM_UNDEFINED, SWIZZLE_NOOP, GLSL_TYPE_ERROR);
-
 static st_dst_reg address_reg = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X, GLSL_TYPE_FLOAT, 0);
 static st_dst_reg address_reg2 = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X, GLSL_TYPE_FLOAT, 1);
 static st_dst_reg sampler_reladdr = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X, GLSL_TYPE_FLOAT, 2);
@@ -526,10 +537,10 @@ num_inst_src_regs(unsigned opcode)
 }
 
 glsl_to_tgsi_instruction *
-glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
-                           st_dst_reg dst, st_dst_reg dst1,
-                           st_src_reg src0, st_src_reg src1,
-                           st_src_reg src2, st_src_reg src3)
+glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
+                               st_dst_reg dst, st_dst_reg dst1,
+                               st_src_reg src0, st_src_reg src1,
+                               st_src_reg src2, st_src_reg src3)
 {
    glsl_to_tgsi_instruction *inst = new(mem_ctx) glsl_to_tgsi_instruction();
    int num_reladdr = 0, i, j;
@@ -571,6 +582,10 @@ glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
    inst->src[3] = src3;
    inst->ir = ir;
    inst->dead_mask = 0;
+   /* default to float, for paths where this is not initialized
+    * (since 0==UINT which is likely wrong):
+    */
+   inst->tex_type = GLSL_TYPE_FLOAT;
 
    inst->function = NULL;
 
@@ -716,48 +731,12 @@ glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
 }
 
 glsl_to_tgsi_instruction *
-glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
-                           st_dst_reg dst,
-                           st_src_reg src0, st_src_reg src1,
-                           st_src_reg src2, st_src_reg src3)
+glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
+                               st_dst_reg dst,
+                               st_src_reg src0, st_src_reg src1,
+                               st_src_reg src2, st_src_reg src3)
 {
-   return emit(ir, op, dst, undef_dst, src0, src1, src2, src3);
-}
-
-glsl_to_tgsi_instruction *
-glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
-                           st_dst_reg dst, st_src_reg src0,
-                           st_src_reg src1, st_src_reg src2)
-{
-   return emit(ir, op, dst, undef_dst, src0, src1, src2, undef_src);
-}
-
-glsl_to_tgsi_instruction *
-glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
-                           st_dst_reg dst, st_src_reg src0, st_src_reg src1)
-{
-   return emit(ir, op, dst, undef_dst, src0, src1, undef_src, undef_src);
-}
-
-glsl_to_tgsi_instruction *
-glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
-                           st_dst_reg dst, st_src_reg src0)
-{
-   assert(dst.writemask != 0);
-   return emit(ir, op, dst, undef_dst, src0, undef_src, undef_src, undef_src);
-}
-
-glsl_to_tgsi_instruction *
-glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
-                           st_dst_reg dst, st_dst_reg dst1, st_src_reg src0)
-{
-   return emit(ir, op, dst, dst1, src0, undef_src, undef_src, undef_src);
-}
-
-glsl_to_tgsi_instruction *
-glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op)
-{
-   return emit(ir, op, undef_dst, undef_dst, undef_src, undef_src, undef_src, undef_src);
+   return emit_asm(ir, op, dst, undef_dst, src0, src1, src2, src3);
 }
 
 /**
@@ -879,7 +858,7 @@ glsl_to_tgsi_visitor::emit_dp(ir_instruction *ir,
       TGSI_OPCODE_DP2, TGSI_OPCODE_DP3, TGSI_OPCODE_DP4
    };
 
-   return emit(ir, dot_opcodes[elements - 2], dst, src0, src1);
+   return emit_asm(ir, dot_opcodes[elements - 2], dst, src0, src1);
 }
 
 /**
@@ -929,7 +908,7 @@ glsl_to_tgsi_visitor::emit_scalar(ir_instruction *ir, unsigned op,
                                    src1_swiz, src1_swiz);
 
       dst.writemask = this_mask;
-      emit(ir, op, dst, src0, src1);
+      emit_asm(ir, op, dst, src0, src1);
       done_mask |= this_mask;
    }
 }
@@ -958,7 +937,7 @@ glsl_to_tgsi_visitor::emit_arl(ir_instruction *ir,
    if (dst.index >= this->num_address_regs)
       this->num_address_regs = dst.index + 1;
 
-   emit(NULL, op, dst, src0);
+   emit_asm(NULL, op, dst, src0);
 }
 
 int
@@ -1142,6 +1121,12 @@ glsl_to_tgsi_visitor::get_temp(const glsl_type *type)
    if (!options->EmitNoIndirectTemp &&
        (type->is_array() || type->is_matrix())) {
 
+      if (next_array >= max_num_arrays) {
+         max_num_arrays += 32;
+         array_sizes = (unsigned*)
+            realloc(array_sizes, sizeof(array_sizes[0]) * max_num_arrays);
+      }
+
       src.file = PROGRAM_ARRAY;
       src.index = next_array << 16 | 0x8000;
       array_sizes[next_array] = type_size(type);
@@ -1242,7 +1227,7 @@ glsl_to_tgsi_visitor::visit(ir_variable *ir)
              */
             st_src_reg src(PROGRAM_STATE_VAR, index, GLSL_TYPE_FLOAT);
             src.swizzle = slots[i].swizzle;
-            emit(ir, TGSI_OPCODE_MOV, dst, src);
+            emit_asm(ir, TGSI_OPCODE_MOV, dst, src);
             /* even a float takes up a whole vec4 reg in a struct/array. */
             dst.index++;
          }
@@ -1261,11 +1246,11 @@ glsl_to_tgsi_visitor::visit(ir_variable *ir)
 void
 glsl_to_tgsi_visitor::visit(ir_loop *ir)
 {
-   emit(NULL, TGSI_OPCODE_BGNLOOP);
+   emit_asm(NULL, TGSI_OPCODE_BGNLOOP);
 
    visit_exec_list(&ir->body_instructions, this);
 
-   emit(NULL, TGSI_OPCODE_ENDLOOP);
+   emit_asm(NULL, TGSI_OPCODE_ENDLOOP);
 }
 
 void
@@ -1273,10 +1258,10 @@ glsl_to_tgsi_visitor::visit(ir_loop_jump *ir)
 {
    switch (ir->mode) {
    case ir_loop_jump::jump_break:
-      emit(NULL, TGSI_OPCODE_BRK);
+      emit_asm(NULL, TGSI_OPCODE_BRK);
       break;
    case ir_loop_jump::jump_continue:
-      emit(NULL, TGSI_OPCODE_CONT);
+      emit_asm(NULL, TGSI_OPCODE_CONT);
       break;
    }
 }
@@ -1330,7 +1315,7 @@ glsl_to_tgsi_visitor::try_emit_mad(ir_expression *ir, int mul_operand)
    this->result = get_temp(ir->type);
    result_dst = st_dst_reg(this->result);
    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
-   emit(ir, TGSI_OPCODE_MAD, result_dst, a, b, c);
+   emit_asm(ir, TGSI_OPCODE_MAD, result_dst, a, b, c);
 
    return true;
 }
@@ -1370,7 +1355,7 @@ glsl_to_tgsi_visitor::try_emit_mad_for_and_not(ir_expression *ir, int try_operan
    b.negate = ~b.negate;
 
    this->result = get_temp(ir->type);
-   emit(ir, TGSI_OPCODE_MAD, st_dst_reg(this->result), a, b, a);
+   emit_asm(ir, TGSI_OPCODE_MAD, st_dst_reg(this->result), a, b, a);
 
    return true;
 }
@@ -1388,7 +1373,7 @@ glsl_to_tgsi_visitor::reladdr_to_temp(ir_instruction *ir,
    if (*num_reladdr != 1) {
       st_src_reg temp = get_temp(glsl_type::vec4_type);
 
-      emit(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), *reg);
+      emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), *reg);
       *reg = temp;
    }
 
@@ -1464,7 +1449,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
    switch (ir->operation) {
    case ir_unop_logic_not:
       if (result_dst.type != GLSL_TYPE_FLOAT)
-         emit(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
       else {
          /* Previously 'SEQ dst, src, 0.0' was used for this.  However, many
           * older GPUs implement SEQ using multiple instructions (i915 uses two
@@ -1472,24 +1457,24 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
           * 0.0 and 1.0, 1-x also implements !x.
           */
          op[0].negate = ~op[0].negate;
-         emit(ir, TGSI_OPCODE_ADD, result_dst, op[0], st_src_reg_for_float(1.0));
+         emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], st_src_reg_for_float(1.0));
       }
       break;
    case ir_unop_neg:
       if (result_dst.type == GLSL_TYPE_INT || result_dst.type == GLSL_TYPE_UINT)
-         emit(ir, TGSI_OPCODE_INEG, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_INEG, result_dst, op[0]);
       else if (result_dst.type == GLSL_TYPE_DOUBLE)
-         emit(ir, TGSI_OPCODE_DNEG, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_DNEG, result_dst, op[0]);
       else {
          op[0].negate = ~op[0].negate;
          result_src = op[0];
       }
       break;
    case ir_unop_abs:
-      emit(ir, TGSI_OPCODE_ABS, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_ABS, result_dst, op[0]);
       break;
    case ir_unop_sign:
-      emit(ir, TGSI_OPCODE_SSG, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_SSG, result_dst, op[0]);
       break;
    case ir_unop_rcp:
       emit_scalar(ir, TGSI_OPCODE_RCP, result_dst, op[0]);
@@ -1513,17 +1498,17 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       break;
    case ir_unop_saturate: {
       glsl_to_tgsi_instruction *inst;
-      inst = emit(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
+      inst = emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
       inst->saturate = true;
       break;
    }
 
    case ir_unop_dFdx:
    case ir_unop_dFdx_coarse:
-      emit(ir, TGSI_OPCODE_DDX, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_DDX, result_dst, op[0]);
       break;
    case ir_unop_dFdx_fine:
-      emit(ir, TGSI_OPCODE_DDX_FINE, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_DDX_FINE, result_dst, op[0]);
       break;
    case ir_unop_dFdy:
    case ir_unop_dFdy_coarse:
@@ -1547,18 +1532,18 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
 
       st_src_reg temp = get_temp(glsl_type::vec4_type);
 
-      emit(ir, TGSI_OPCODE_MUL, st_dst_reg(temp), transform_y, op[0]);
-      emit(ir, ir->operation == ir_unop_dFdy_fine ?
+      emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(temp), transform_y, op[0]);
+      emit_asm(ir, ir->operation == ir_unop_dFdy_fine ?
            TGSI_OPCODE_DDY_FINE : TGSI_OPCODE_DDY, result_dst, temp);
       break;
    }
 
    case ir_unop_frexp_sig:
-      emit(ir, TGSI_OPCODE_DFRACEXP, result_dst, undef_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_DFRACEXP, result_dst, undef_dst, op[0]);
       break;
 
    case ir_unop_frexp_exp:
-      emit(ir, TGSI_OPCODE_DFRACEXP, undef_dst, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_DFRACEXP, undef_dst, result_dst, op[0]);
       break;
 
    case ir_unop_noise: {
@@ -1568,50 +1553,50 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
        * place to do this is in the GL state tracker, not the poor
        * driver.
        */
-      emit(ir, TGSI_OPCODE_MOV, result_dst, st_src_reg_for_float(0.5));
+      emit_asm(ir, TGSI_OPCODE_MOV, result_dst, st_src_reg_for_float(0.5));
       break;
    }
 
    case ir_binop_add:
-      emit(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
       break;
    case ir_binop_sub:
-      emit(ir, TGSI_OPCODE_SUB, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_SUB, result_dst, op[0], op[1]);
       break;
 
    case ir_binop_mul:
-      emit(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
       break;
    case ir_binop_div:
       if (result_dst.type == GLSL_TYPE_FLOAT || result_dst.type == GLSL_TYPE_DOUBLE)
          assert(!"not reached: should be handled by ir_div_to_mul_rcp");
       else
-         emit(ir, TGSI_OPCODE_DIV, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_DIV, result_dst, op[0], op[1]);
       break;
    case ir_binop_mod:
       if (result_dst.type == GLSL_TYPE_FLOAT)
          assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
       else
-         emit(ir, TGSI_OPCODE_MOD, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_MOD, result_dst, op[0], op[1]);
       break;
 
    case ir_binop_less:
-      emit(ir, TGSI_OPCODE_SLT, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_SLT, result_dst, op[0], op[1]);
       break;
    case ir_binop_greater:
-      emit(ir, TGSI_OPCODE_SLT, result_dst, op[1], op[0]);
+      emit_asm(ir, TGSI_OPCODE_SLT, result_dst, op[1], op[0]);
       break;
    case ir_binop_lequal:
-      emit(ir, TGSI_OPCODE_SGE, result_dst, op[1], op[0]);
+      emit_asm(ir, TGSI_OPCODE_SGE, result_dst, op[1], op[0]);
       break;
    case ir_binop_gequal:
-      emit(ir, TGSI_OPCODE_SGE, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_SGE, result_dst, op[0], op[1]);
       break;
    case ir_binop_equal:
-      emit(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
       break;
    case ir_binop_nequal:
-      emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
       break;
    case ir_binop_all_equal:
       /* "==" operator producing a scalar boolean. */
@@ -1625,7 +1610,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
             st_dst_reg temp_dst = st_dst_reg(temp);
             st_src_reg temp1 = st_src_reg(temp), temp2 = st_src_reg(temp);
 
-            emit(ir, TGSI_OPCODE_SEQ, st_dst_reg(temp), op[0], op[1]);
+            emit_asm(ir, TGSI_OPCODE_SEQ, st_dst_reg(temp), op[0], op[1]);
 
             /* Emit 1-3 AND operations to combine the SEQ results. */
             switch (ir->operands[0]->type->vector_elements) {
@@ -1635,24 +1620,24 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
                temp_dst.writemask = WRITEMASK_Y;
                temp1.swizzle = SWIZZLE_YYYY;
                temp2.swizzle = SWIZZLE_ZZZZ;
-               emit(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
+               emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
                break;
             case 4:
                temp_dst.writemask = WRITEMASK_X;
                temp1.swizzle = SWIZZLE_XXXX;
                temp2.swizzle = SWIZZLE_YYYY;
-               emit(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
+               emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
                temp_dst.writemask = WRITEMASK_Y;
                temp1.swizzle = SWIZZLE_ZZZZ;
                temp2.swizzle = SWIZZLE_WWWW;
-               emit(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
+               emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
             }
 
             temp1.swizzle = SWIZZLE_XXXX;
             temp2.swizzle = SWIZZLE_YYYY;
-            emit(ir, TGSI_OPCODE_AND, result_dst, temp1, temp2);
+            emit_asm(ir, TGSI_OPCODE_AND, result_dst, temp1, temp2);
          } else {
-            emit(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
+            emit_asm(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
 
             /* After the dot-product, the value will be an integer on the
              * range [0,4].  Zero becomes 1.0, and positive values become zero.
@@ -1665,10 +1650,10 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
              */
             st_src_reg sge_src = result_src;
             sge_src.negate = ~sge_src.negate;
-            emit(ir, TGSI_OPCODE_SGE, result_dst, sge_src, st_src_reg_for_float(0.0));
+            emit_asm(ir, TGSI_OPCODE_SGE, result_dst, sge_src, st_src_reg_for_float(0.0));
          }
       } else {
-         emit(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
       }
       break;
    case ir_binop_any_nequal:
@@ -1678,7 +1663,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
          st_src_reg temp = get_temp(native_integers ?
                                     glsl_type::uvec4_type :
                                     glsl_type::vec4_type);
-         emit(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
 
          if (native_integers) {
             st_dst_reg temp_dst = st_dst_reg(temp);
@@ -1692,22 +1677,22 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
                temp_dst.writemask = WRITEMASK_Y;
                temp1.swizzle = SWIZZLE_YYYY;
                temp2.swizzle = SWIZZLE_ZZZZ;
-               emit(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
+               emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
                break;
             case 4:
                temp_dst.writemask = WRITEMASK_X;
                temp1.swizzle = SWIZZLE_XXXX;
                temp2.swizzle = SWIZZLE_YYYY;
-               emit(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
+               emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
                temp_dst.writemask = WRITEMASK_Y;
                temp1.swizzle = SWIZZLE_ZZZZ;
                temp2.swizzle = SWIZZLE_WWWW;
-               emit(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
+               emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
             }
 
             temp1.swizzle = SWIZZLE_XXXX;
             temp2.swizzle = SWIZZLE_YYYY;
-            emit(ir, TGSI_OPCODE_OR, result_dst, temp1, temp2);
+            emit_asm(ir, TGSI_OPCODE_OR, result_dst, temp1, temp2);
          } else {
             /* After the dot-product, the value will be an integer on the
              * range [0,4].  Zero stays zero, and positive values become 1.0.
@@ -1726,11 +1711,11 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
                 */
                st_src_reg slt_src = result_src;
                slt_src.negate = ~slt_src.negate;
-               emit(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
+               emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
             }
          }
       } else {
-         emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
       }
       break;
 
@@ -1763,7 +1748,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
                                           GET_SWZ(op0_swizzle, 3),
                                           GET_SWZ(op0_swizzle, 3),
                                           GET_SWZ(op0_swizzle, 3));
-            emit(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
+            emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
             accum = st_src_reg(result_dst);
             accum.swizzle = dst_swizzle;
             /* fallthrough */
@@ -1772,7 +1757,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
                                           GET_SWZ(op0_swizzle, 2),
                                           GET_SWZ(op0_swizzle, 2),
                                           GET_SWZ(op0_swizzle, 2));
-            emit(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
+            emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
             accum = st_src_reg(result_dst);
             accum.swizzle = dst_swizzle;
             /* fallthrough */
@@ -1781,7 +1766,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
                                           GET_SWZ(op0_swizzle, 1),
                                           GET_SWZ(op0_swizzle, 1),
                                           GET_SWZ(op0_swizzle, 1));
-            emit(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
+            emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
             break;
          default:
             assert(!"Unexpected vector size");
@@ -1807,11 +1792,11 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
              */
             st_src_reg slt_src = result_src;
             slt_src.negate = ~slt_src.negate;
-            emit(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
+            emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
          }
          else {
             /* Use SNE 0 if integers are being used as boolean values. */
-            emit(ir, TGSI_OPCODE_SNE, result_dst, result_src, st_src_reg_for_int(0));
+            emit_asm(ir, TGSI_OPCODE_SNE, result_dst, result_src, st_src_reg_for_int(0));
          }
       }
       break;
@@ -1819,9 +1804,9 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
 
    case ir_binop_logic_xor:
       if (native_integers)
-         emit(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
       else
-         emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
       break;
 
    case ir_binop_logic_or: {
@@ -1830,13 +1815,13 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
           * instruction.
           */
          assert(native_integers);
-         emit(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
       } else {
          /* After the addition, the value will be an integer on the
           * range [0,2].  Zero stays zero, and positive values become 1.0.
           */
          glsl_to_tgsi_instruction *add =
-            emit(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
+            emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
          if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
             /* The clamping to [0,1] can be done for free in the fragment
              * shader with a saturate if floats are being used as boolean values.
@@ -1849,7 +1834,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
              */
             st_src_reg slt_src = result_src;
             slt_src.negate = ~slt_src.negate;
-            emit(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
+            emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
          }
       }
       break;
@@ -1861,9 +1846,9 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
        * actual AND opcode.
        */
       if (native_integers)
-         emit(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
       else
-         emit(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
       break;
 
    case ir_binop_dot:
@@ -1879,10 +1864,10 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       } else {
          /* sqrt(x) = x * rsq(x). */
          emit_scalar(ir, TGSI_OPCODE_RSQ, result_dst, op[0]);
-         emit(ir, TGSI_OPCODE_MUL, result_dst, result_src, op[0]);
+         emit_asm(ir, TGSI_OPCODE_MUL, result_dst, result_src, op[0]);
          /* For incoming channels <= 0, set the result to 0. */
          op[0].negate = ~op[0].negate;
-         emit(ir, TGSI_OPCODE_CMP, result_dst,
+         emit_asm(ir, TGSI_OPCODE_CMP, result_dst,
               op[0], result_src, st_src_reg_for_float(0.0));
       }
       break;
@@ -1891,13 +1876,13 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       break;
    case ir_unop_i2f:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_I2F, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_I2F, result_dst, op[0]);
          break;
       }
       /* fallthrough to next case otherwise */
    case ir_unop_b2f:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_AND, result_dst, op[0], st_src_reg_for_float(1.0));
+         emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], st_src_reg_for_float(1.0));
          break;
       }
       /* fallthrough to next case otherwise */
@@ -1912,7 +1897,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
           * GLSL requires that int(bool) return 1 for true and 0 for false.
           * This conversion is done with AND, but it could be done with NEG.
           */
-         emit(ir, TGSI_OPCODE_AND, result_dst, op[0], st_src_reg_for_int(1));
+         emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], st_src_reg_for_int(1));
       } else {
          /* Booleans and integers are both stored as floats when native
           * integers are disabled.
@@ -1922,15 +1907,15 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       break;
    case ir_unop_f2i:
       if (native_integers)
-         emit(ir, TGSI_OPCODE_F2I, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_F2I, result_dst, op[0]);
       else
-         emit(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
       break;
    case ir_unop_f2u:
       if (native_integers)
-         emit(ir, TGSI_OPCODE_F2U, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_F2U, result_dst, op[0]);
       else
-         emit(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
       break;
    case ir_unop_bitcast_f2i:
       result_src = op[0];
@@ -1946,38 +1931,38 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       result_src.type = GLSL_TYPE_FLOAT;
       break;
    case ir_unop_f2b:
-      emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
+      emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
       break;
    case ir_unop_d2b:
-      emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_double(0.0));
+      emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_double(0.0));
       break;
    case ir_unop_i2b:
       if (native_integers)
-         emit(ir, TGSI_OPCODE_INEG, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_USNE, result_dst, op[0], st_src_reg_for_int(0));
       else
-         emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
+         emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
       break;
    case ir_unop_trunc:
-      emit(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
       break;
    case ir_unop_ceil:
-      emit(ir, TGSI_OPCODE_CEIL, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_CEIL, result_dst, op[0]);
       break;
    case ir_unop_floor:
-      emit(ir, TGSI_OPCODE_FLR, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_FLR, result_dst, op[0]);
       break;
    case ir_unop_round_even:
-      emit(ir, TGSI_OPCODE_ROUND, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_ROUND, result_dst, op[0]);
       break;
    case ir_unop_fract:
-      emit(ir, TGSI_OPCODE_FRC, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_FRC, result_dst, op[0]);
       break;
 
    case ir_binop_min:
-      emit(ir, TGSI_OPCODE_MIN, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_MIN, result_dst, op[0], op[1]);
       break;
    case ir_binop_max:
-      emit(ir, TGSI_OPCODE_MAX, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_MAX, result_dst, op[0], op[1]);
       break;
    case ir_binop_pow:
       emit_scalar(ir, TGSI_OPCODE_POW, result_dst, op[0], op[1]);
@@ -1985,37 +1970,37 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
 
    case ir_unop_bit_not:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
          break;
       }
    case ir_unop_u2f:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_U2F, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_U2F, result_dst, op[0]);
          break;
       }
    case ir_binop_lshift:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_SHL, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_SHL, result_dst, op[0], op[1]);
          break;
       }
    case ir_binop_rshift:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_ISHR, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_ISHR, result_dst, op[0], op[1]);
          break;
       }
    case ir_binop_bit_and:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
          break;
       }
    case ir_binop_bit_xor:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
          break;
       }
    case ir_binop_bit_or:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
          break;
       }
 
@@ -2045,7 +2030,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       }
       else {
          /* Relative/variable index into constant buffer */
-         emit(ir, TGSI_OPCODE_USHR, st_dst_reg(index_reg), op[1],
+         emit_asm(ir, TGSI_OPCODE_USHR, st_dst_reg(index_reg), op[1],
               st_src_reg_for_int(4));
          cbuf.reladdr = ralloc(mem_ctx, st_src_reg);
          memcpy(cbuf.reladdr, &index_reg, sizeof(index_reg));
@@ -2078,88 +2063,88 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
                                        const_offset % 16 / 4);
 
       if (ir->type->base_type == GLSL_TYPE_BOOL) {
-         emit(ir, TGSI_OPCODE_USNE, result_dst, cbuf, st_src_reg_for_int(0));
+         emit_asm(ir, TGSI_OPCODE_USNE, result_dst, cbuf, st_src_reg_for_int(0));
       } else {
-         emit(ir, TGSI_OPCODE_MOV, result_dst, cbuf);
+         emit_asm(ir, TGSI_OPCODE_MOV, result_dst, cbuf);
       }
       break;
    }
    case ir_triop_lrp:
       /* note: we have to reorder the three args here */
-      emit(ir, TGSI_OPCODE_LRP, result_dst, op[2], op[1], op[0]);
+      emit_asm(ir, TGSI_OPCODE_LRP, result_dst, op[2], op[1], op[0]);
       break;
    case ir_triop_csel:
       if (this->ctx->Const.NativeIntegers)
-         emit(ir, TGSI_OPCODE_UCMP, result_dst, op[0], op[1], op[2]);
+         emit_asm(ir, TGSI_OPCODE_UCMP, result_dst, op[0], op[1], op[2]);
       else {
          op[0].negate = ~op[0].negate;
-         emit(ir, TGSI_OPCODE_CMP, result_dst, op[0], op[1], op[2]);
+         emit_asm(ir, TGSI_OPCODE_CMP, result_dst, op[0], op[1], op[2]);
       }
       break;
    case ir_triop_bitfield_extract:
-      emit(ir, TGSI_OPCODE_IBFE, result_dst, op[0], op[1], op[2]);
+      emit_asm(ir, TGSI_OPCODE_IBFE, result_dst, op[0], op[1], op[2]);
       break;
    case ir_quadop_bitfield_insert:
-      emit(ir, TGSI_OPCODE_BFI, result_dst, op[0], op[1], op[2], op[3]);
+      emit_asm(ir, TGSI_OPCODE_BFI, result_dst, op[0], op[1], op[2], op[3]);
       break;
    case ir_unop_bitfield_reverse:
-      emit(ir, TGSI_OPCODE_BREV, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_BREV, result_dst, op[0]);
       break;
    case ir_unop_bit_count:
-      emit(ir, TGSI_OPCODE_POPC, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_POPC, result_dst, op[0]);
       break;
    case ir_unop_find_msb:
-      emit(ir, TGSI_OPCODE_IMSB, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_IMSB, result_dst, op[0]);
       break;
    case ir_unop_find_lsb:
-      emit(ir, TGSI_OPCODE_LSB, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_LSB, result_dst, op[0]);
       break;
    case ir_binop_imul_high:
-      emit(ir, TGSI_OPCODE_IMUL_HI, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_IMUL_HI, result_dst, op[0], op[1]);
       break;
    case ir_triop_fma:
       /* In theory, MAD is incorrect here. */
       if (have_fma)
-         emit(ir, TGSI_OPCODE_FMA, result_dst, op[0], op[1], op[2]);
+         emit_asm(ir, TGSI_OPCODE_FMA, result_dst, op[0], op[1], op[2]);
       else
-         emit(ir, TGSI_OPCODE_MAD, result_dst, op[0], op[1], op[2]);
+         emit_asm(ir, TGSI_OPCODE_MAD, result_dst, op[0], op[1], op[2]);
       break;
    case ir_unop_interpolate_at_centroid:
-      emit(ir, TGSI_OPCODE_INTERP_CENTROID, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_INTERP_CENTROID, result_dst, op[0]);
       break;
    case ir_binop_interpolate_at_offset:
-      emit(ir, TGSI_OPCODE_INTERP_OFFSET, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_INTERP_OFFSET, result_dst, op[0], op[1]);
       break;
    case ir_binop_interpolate_at_sample:
-      emit(ir, TGSI_OPCODE_INTERP_SAMPLE, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_INTERP_SAMPLE, result_dst, op[0], op[1]);
       break;
 
    case ir_unop_d2f:
-      emit(ir, TGSI_OPCODE_D2F, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_D2F, result_dst, op[0]);
       break;
    case ir_unop_f2d:
-      emit(ir, TGSI_OPCODE_F2D, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_F2D, result_dst, op[0]);
       break;
    case ir_unop_d2i:
-      emit(ir, TGSI_OPCODE_D2I, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_D2I, result_dst, op[0]);
       break;
    case ir_unop_i2d:
-      emit(ir, TGSI_OPCODE_I2D, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_I2D, result_dst, op[0]);
       break;
    case ir_unop_d2u:
-      emit(ir, TGSI_OPCODE_D2U, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_D2U, result_dst, op[0]);
       break;
    case ir_unop_u2d:
-      emit(ir, TGSI_OPCODE_U2D, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_U2D, result_dst, op[0]);
       break;
    case ir_unop_unpack_double_2x32:
    case ir_unop_pack_double_2x32:
-      emit(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
       break;
 
    case ir_binop_ldexp:
       if (ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE) {
-         emit(ir, TGSI_OPCODE_DLDEXP, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_DLDEXP, result_dst, op[0], op[1]);
       } else {
          assert(!"Invalid ldexp for non-double opcode in glsl_to_tgsi_visitor::visit()");
       }
@@ -2243,11 +2228,38 @@ glsl_to_tgsi_visitor::visit(ir_swizzle *ir)
    this->result = src;
 }
 
+/* Test if the variable is an array. Note that geometry and
+ * tessellation shader inputs are outputs are always arrays (except
+ * for patch inputs), so only the array element type is considered.
+ */
+static bool
+is_inout_array(unsigned stage, ir_variable *var, bool *is_2d)
+{
+   const glsl_type *type = var->type;
+
+   if ((stage == MESA_SHADER_VERTEX && var->data.mode == ir_var_shader_in) ||
+       (stage == MESA_SHADER_FRAGMENT && var->data.mode == ir_var_shader_out))
+      return false;
+
+   *is_2d = false;
+
+   if (stage == MESA_SHADER_GEOMETRY && var->data.mode == ir_var_shader_in) {
+      if (!var->type->is_array())
+         return false; /* a system value probably */
+
+      type = var->type->fields.array;
+      *is_2d = true;
+   }
+
+   return type->is_array() || type->is_matrix();
+}
+
 void
 glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 {
    variable_storage *entry = find_variable_storage(ir->var);
    ir_variable *var = ir->var;
+   bool is_2d;
 
    if (!entry) {
       switch (var->data.mode) {
@@ -2263,16 +2275,56 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
           * user-defined varyings.
           */
          assert(var->data.location != -1);
-         entry = new(mem_ctx) variable_storage(var,
-                                               PROGRAM_INPUT,
-                                               var->data.location);
+
+         if (is_inout_array(shader->Stage, var, &is_2d)) {
+            struct array_decl *decl = &input_arrays[num_input_arrays];
+
+            decl->mesa_index = var->data.location;
+            decl->array_id = num_input_arrays + 1;
+            if (is_2d)
+               decl->array_size = type_size(var->type->fields.array);
+            else
+               decl->array_size = type_size(var->type);
+            num_input_arrays++;
+
+            entry = new(mem_ctx) variable_storage(var,
+                                                  PROGRAM_INPUT,
+                                                  var->data.location,
+                                                  decl->array_id);
+         }
+         else {
+            entry = new(mem_ctx) variable_storage(var,
+                                                  PROGRAM_INPUT,
+                                                  var->data.location);
+         }
+         this->variables.push_tail(entry);
          break;
       case ir_var_shader_out:
          assert(var->data.location != -1);
-         entry = new(mem_ctx) variable_storage(var,
-                                               PROGRAM_OUTPUT,
-                                               var->data.location
-                                               + var->data.index);
+
+         if (is_inout_array(shader->Stage, var, &is_2d)) {
+            struct array_decl *decl = &output_arrays[num_output_arrays];
+
+            decl->mesa_index = var->data.location;
+            decl->array_id = num_output_arrays + 1;
+            if (is_2d)
+               decl->array_size = type_size(var->type->fields.array);
+            else
+               decl->array_size = type_size(var->type);
+            num_output_arrays++;
+
+            entry = new(mem_ctx) variable_storage(var,
+                                                  PROGRAM_OUTPUT,
+                                                  var->data.location,
+                                                  decl->array_id);
+         }
+         else {
+            entry = new(mem_ctx) variable_storage(var,
+                                                  PROGRAM_OUTPUT,
+                                                  var->data.location
+                                                  + var->data.index);
+         }
+         this->variables.push_tail(entry);
          break;
       case ir_var_system_value:
          entry = new(mem_ctx) variable_storage(var,
@@ -2296,10 +2348,43 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
    }
 
    this->result = st_src_reg(entry->file, entry->index, var->type);
+   this->result.array_id = entry->array_id;
    if (!native_integers)
       this->result.type = GLSL_TYPE_FLOAT;
 }
 
+static void
+shrink_array_declarations(struct array_decl *arrays, unsigned count,
+                          GLbitfield64 usage_mask)
+{
+   unsigned i, j;
+
+   /* Fix array declarations by removing unused array elements at both ends
+    * of the arrays. For example, mat4[3] where only mat[1] is used.
+    */
+   for (i = 0; i < count; i++) {
+      struct array_decl *decl = &arrays[i];
+
+      /* Shrink the beginning. */
+      for (j = 0; j < decl->array_size; j++) {
+         if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
+            break;
+
+         decl->mesa_index++;
+         decl->array_size--;
+         j--;
+      }
+
+      /* Shrink the end. */
+      for (j = decl->array_size-1; j >= 0; j--) {
+         if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
+            break;
+
+         decl->array_size--;
+      }
+   }
+}
+
 void
 glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
 {
@@ -2341,7 +2426,7 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
          index_reg = get_temp(native_integers ?
                               glsl_type::int_type : glsl_type::float_type);
 
-         emit(ir, TGSI_OPCODE_MUL, st_dst_reg(index_reg),
+         emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(index_reg),
               this->result, st_src_reg_for_type(index_reg.type, element_size));
       }
 
@@ -2352,7 +2437,7 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
          st_src_reg accum_reg = get_temp(native_integers ?
                                 glsl_type::int_type : glsl_type::float_type);
 
-         emit(ir, TGSI_OPCODE_ADD, st_dst_reg(accum_reg),
+         emit_asm(ir, TGSI_OPCODE_ADD, st_dst_reg(accum_reg),
               index_reg, *src.reladdr);
 
          index_reg = accum_reg;
@@ -2589,16 +2674,16 @@ glsl_to_tgsi_visitor::emit_block_mov(ir_assignment *ir, const struct glsl_type *
       l_src.swizzle = swizzle_for_size(type->vector_elements);
 
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_UCMP, *l, *cond,
+         emit_asm(ir, TGSI_OPCODE_UCMP, *l, *cond,
               cond_swap ? l_src : *r,
               cond_swap ? *r : l_src);
       } else {
-         emit(ir, TGSI_OPCODE_CMP, *l, *cond,
+         emit_asm(ir, TGSI_OPCODE_CMP, *l, *cond,
               cond_swap ? l_src : *r,
               cond_swap ? *r : l_src);
       }
    } else {
-      emit(ir, TGSI_OPCODE_MOV, *l, *r);
+      emit_asm(ir, TGSI_OPCODE_MOV, *l, *r);
    }
    l->index++;
    r->index++;
@@ -2679,7 +2764,7 @@ glsl_to_tgsi_visitor::visit(ir_assignment *ir)
        */
       glsl_to_tgsi_instruction *inst, *new_inst;
       inst = (glsl_to_tgsi_instruction *)this->instructions.get_tail();
-      new_inst = emit(ir, inst->op, l, inst->src[0], inst->src[1], inst->src[2]);
+      new_inst = emit_asm(ir, inst->op, l, inst->src[0], inst->src[1], inst->src[2]);
       new_inst->saturate = inst->saturate;
       inst->dead_mask = inst->dst[0].writemask;
    } else {
@@ -2717,7 +2802,7 @@ glsl_to_tgsi_visitor::visit(ir_constant *ir)
          src = this->result;
 
          for (i = 0; i < (unsigned int)size; i++) {
-            emit(ir, TGSI_OPCODE_MOV, temp, src);
+            emit_asm(ir, TGSI_OPCODE_MOV, temp, src);
 
             src.index++;
             temp.index++;
@@ -2739,7 +2824,7 @@ glsl_to_tgsi_visitor::visit(ir_constant *ir)
          ir->array_elements[i]->accept(this);
          src = this->result;
          for (int j = 0; j < size; j++) {
-            emit(ir, TGSI_OPCODE_MOV, temp, src);
+            emit_asm(ir, TGSI_OPCODE_MOV, temp, src);
 
             src.index++;
             temp.index++;
@@ -2764,7 +2849,7 @@ glsl_to_tgsi_visitor::visit(ir_constant *ir)
                                   ir->type->vector_elements,
                                   GL_FLOAT,
                                   &src.swizzle);
-         emit(ir, TGSI_OPCODE_MOV, mat_column, src);
+         emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
 
          mat_column.index++;
       }
@@ -2889,7 +2974,7 @@ glsl_to_tgsi_visitor::visit(ir_call *ir)
          l.cond_mask = COND_TR;
 
          for (i = 0; i < type_size(param->type); i++) {
-            emit(ir, TGSI_OPCODE_MOV, l, r);
+            emit_asm(ir, TGSI_OPCODE_MOV, l, r);
             l.index++;
             r.index++;
          }
@@ -2897,7 +2982,7 @@ glsl_to_tgsi_visitor::visit(ir_call *ir)
    }
 
    /* Emit call instruction */
-   call_inst = emit(ir, TGSI_OPCODE_CAL);
+   call_inst = emit_asm(ir, TGSI_OPCODE_CAL);
    call_inst->function = entry;
 
    /* Process out parameters. */
@@ -2922,7 +3007,7 @@ glsl_to_tgsi_visitor::visit(ir_call *ir)
          st_dst_reg l = st_dst_reg(this->result);
 
          for (i = 0; i < type_size(param->type); i++) {
-            emit(ir, TGSI_OPCODE_MOV, l, r);
+            emit_asm(ir, TGSI_OPCODE_MOV, l, r);
             l.index++;
             r.index++;
          }
@@ -2965,7 +3050,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
       coord = get_temp(glsl_type::vec4_type);
       coord_dst = st_dst_reg(coord);
       coord_dst.writemask = (1 << ir->coordinate->type->vector_elements) - 1;
-      emit(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
+      emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
    }
 
    if (ir->projector) {
@@ -3074,7 +3159,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
       if (opcode == TGSI_OPCODE_TEX) {
          /* Slot the projector in as the last component of the coord. */
          coord_dst.writemask = WRITEMASK_W;
-         emit(ir, TGSI_OPCODE_MOV, coord_dst, projector);
+         emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, projector);
          coord_dst.writemask = WRITEMASK_XYZW;
          opcode = TGSI_OPCODE_TXP;
       } else {
@@ -3086,7 +3171,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
           * projective divide now.
           */
          coord_dst.writemask = WRITEMASK_W;
-         emit(ir, TGSI_OPCODE_RCP, coord_dst, projector);
+         emit_asm(ir, TGSI_OPCODE_RCP, coord_dst, projector);
 
          /* In the case where we have to project the coordinates "by hand,"
           * the shadow comparator value must also be projected.
@@ -3105,14 +3190,14 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
             assert(!sampler_type->sampler_array);
 
             tmp_dst.writemask = WRITEMASK_Z;
-            emit(ir, TGSI_OPCODE_MOV, tmp_dst, this->result);
+            emit_asm(ir, TGSI_OPCODE_MOV, tmp_dst, this->result);
 
             tmp_dst.writemask = WRITEMASK_XY;
-            emit(ir, TGSI_OPCODE_MOV, tmp_dst, coord);
+            emit_asm(ir, TGSI_OPCODE_MOV, tmp_dst, coord);
          }
 
          coord_dst.writemask = WRITEMASK_XYZ;
-         emit(ir, TGSI_OPCODE_MUL, coord_dst, tmp_src, coord_w);
+         emit_asm(ir, TGSI_OPCODE_MUL, coord_dst, tmp_src, coord_w);
 
          coord_dst.writemask = WRITEMASK_XYZW;
          coord.swizzle = SWIZZLE_XYZW;
@@ -3133,7 +3218,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
          cube_sc = get_temp(glsl_type::float_type);
          cube_sc_dst = st_dst_reg(cube_sc);
          cube_sc_dst.writemask = WRITEMASK_X;
-         emit(ir, TGSI_OPCODE_MOV, cube_sc_dst, this->result);
+         emit_asm(ir, TGSI_OPCODE_MOV, cube_sc_dst, this->result);
          cube_sc_dst.writemask = WRITEMASK_X;
       }
       else {
@@ -3144,20 +3229,20 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
          } else {
             coord_dst.writemask = WRITEMASK_Z;
          }
-         emit(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
+         emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
          coord_dst.writemask = WRITEMASK_XYZW;
       }
    }
 
    if (ir->op == ir_txf_ms) {
       coord_dst.writemask = WRITEMASK_W;
-      emit(ir, TGSI_OPCODE_MOV, coord_dst, sample_index);
+      emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, sample_index);
       coord_dst.writemask = WRITEMASK_XYZW;
    } else if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXB ||
        opcode == TGSI_OPCODE_TXF) {
       /* TGSI stores LOD or LOD bias in the last channel of the coords. */
       coord_dst.writemask = WRITEMASK_W;
-      emit(ir, TGSI_OPCODE_MOV, coord_dst, lod_info);
+      emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, lod_info);
       coord_dst.writemask = WRITEMASK_XYZW;
    }
 
@@ -3167,30 +3252,30 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
    }
 
    if (opcode == TGSI_OPCODE_TXD)
-      inst = emit(ir, opcode, result_dst, coord, dx, dy);
+      inst = emit_asm(ir, opcode, result_dst, coord, dx, dy);
    else if (opcode == TGSI_OPCODE_TXQ) {
       if (ir->op == ir_query_levels) {
          /* the level is stored in W */
-         inst = emit(ir, opcode, st_dst_reg(levels_src), lod_info);
+         inst = emit_asm(ir, opcode, st_dst_reg(levels_src), lod_info);
          result_dst.writemask = WRITEMASK_X;
          levels_src.swizzle = SWIZZLE_WWWW;
-         emit(ir, TGSI_OPCODE_MOV, result_dst, levels_src);
+         emit_asm(ir, TGSI_OPCODE_MOV, result_dst, levels_src);
       } else
-         inst = emit(ir, opcode, result_dst, lod_info);
+         inst = emit_asm(ir, opcode, result_dst, lod_info);
    } else if (opcode == TGSI_OPCODE_TXF) {
-      inst = emit(ir, opcode, result_dst, coord);
+      inst = emit_asm(ir, opcode, result_dst, coord);
    } else if (opcode == TGSI_OPCODE_TXL2 || opcode == TGSI_OPCODE_TXB2) {
-      inst = emit(ir, opcode, result_dst, coord, lod_info);
+      inst = emit_asm(ir, opcode, result_dst, coord, lod_info);
    } else if (opcode == TGSI_OPCODE_TEX2) {
-      inst = emit(ir, opcode, result_dst, coord, cube_sc);
+      inst = emit_asm(ir, opcode, result_dst, coord, cube_sc);
    } else if (opcode == TGSI_OPCODE_TG4) {
       if (is_cube_array && ir->shadow_comparitor) {
-         inst = emit(ir, opcode, result_dst, coord, cube_sc);
+         inst = emit_asm(ir, opcode, result_dst, coord, cube_sc);
       } else {
-         inst = emit(ir, opcode, result_dst, coord, component);
+         inst = emit_asm(ir, opcode, result_dst, coord, component);
       }
    } else
-      inst = emit(ir, opcode, result_dst, coord);
+      inst = emit_asm(ir, opcode, result_dst, coord);
 
    if (ir->shadow_comparitor)
       inst->tex_shadow = GL_TRUE;
@@ -3246,6 +3331,8 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
       assert(!"Should not get here.");
    }
 
+   inst->tex_type = ir->type->base_type;
+
    this->result = result_src;
 }
 
@@ -3264,13 +3351,13 @@ glsl_to_tgsi_visitor::visit(ir_return *ir)
       l = st_dst_reg(current_function->return_reg);
 
       for (i = 0; i < type_size(current_function->sig->return_type); i++) {
-         emit(ir, TGSI_OPCODE_MOV, l, r);
+         emit_asm(ir, TGSI_OPCODE_MOV, l, r);
          l.index++;
          r.index++;
       }
    }
 
-   emit(ir, TGSI_OPCODE_RET);
+   emit_asm(ir, TGSI_OPCODE_RET);
 }
 
 void
@@ -3283,16 +3370,16 @@ glsl_to_tgsi_visitor::visit(ir_discard *ir)
       /* Convert the bool condition to a float so we can negate. */
       if (native_integers) {
          st_src_reg temp = get_temp(ir->condition->type);
-         emit(ir, TGSI_OPCODE_AND, st_dst_reg(temp),
+         emit_asm(ir, TGSI_OPCODE_AND, st_dst_reg(temp),
               condition, st_src_reg_for_float(1.0));
          condition = temp;
       }
 
       condition.negate = ~condition.negate;
-      emit(ir, TGSI_OPCODE_KILL_IF, undef_dst, condition);
+      emit_asm(ir, TGSI_OPCODE_KILL_IF, undef_dst, condition);
    } else {
       /* unconditional kil */
-      emit(ir, TGSI_OPCODE_KILL);
+      emit_asm(ir, TGSI_OPCODE_KILL);
    }
 }
 
@@ -3307,18 +3394,18 @@ glsl_to_tgsi_visitor::visit(ir_if *ir)
 
    if_opcode = native_integers ? TGSI_OPCODE_UIF : TGSI_OPCODE_IF;
 
-   if_inst = emit(ir->condition, if_opcode, undef_dst, this->result);
+   if_inst = emit_asm(ir->condition, if_opcode, undef_dst, this->result);
 
    this->instructions.push_tail(if_inst);
 
    visit_exec_list(&ir->then_instructions, this);
 
    if (!ir->else_instructions.is_empty()) {
-      emit(ir->condition, TGSI_OPCODE_ELSE);
+      emit_asm(ir->condition, TGSI_OPCODE_ELSE);
       visit_exec_list(&ir->else_instructions, this);
    }
 
-   if_inst = emit(ir->condition, TGSI_OPCODE_ENDIF);
+   if_inst = emit_asm(ir->condition, TGSI_OPCODE_ENDIF);
 }
 
 
@@ -3328,7 +3415,7 @@ glsl_to_tgsi_visitor::visit(ir_emit_vertex *ir)
    assert(this->prog->Target == GL_GEOMETRY_PROGRAM_NV);
 
    ir->stream->accept(this);
-   emit(ir, TGSI_OPCODE_EMIT, undef_dst, this->result);
+   emit_asm(ir, TGSI_OPCODE_EMIT, undef_dst, this->result);
 }
 
 void
@@ -3337,14 +3424,24 @@ glsl_to_tgsi_visitor::visit(ir_end_primitive *ir)
    assert(this->prog->Target == GL_GEOMETRY_PROGRAM_NV);
 
    ir->stream->accept(this);
-   emit(ir, TGSI_OPCODE_ENDPRIM, undef_dst, this->result);
+   emit_asm(ir, TGSI_OPCODE_ENDPRIM, undef_dst, this->result);
+}
+
+void
+glsl_to_tgsi_visitor::visit(ir_barrier *ir)
+{
+   unreachable("Not implemented!");
 }
 
 glsl_to_tgsi_visitor::glsl_to_tgsi_visitor()
 {
    result.file = PROGRAM_UNDEFINED;
    next_temp = 1;
+   array_sizes = NULL;
+   max_num_arrays = 0;
    next_array = 0;
+   num_input_arrays = 0;
+   num_output_arrays = 0;
    next_signature_id = 1;
    num_immediates = 0;
    current_function = NULL;
@@ -3366,6 +3463,7 @@ glsl_to_tgsi_visitor::glsl_to_tgsi_visitor()
 
 glsl_to_tgsi_visitor::~glsl_to_tgsi_visitor()
 {
+   free(array_sizes);
    ralloc_free(mem_ctx);
 }
 
@@ -3387,7 +3485,13 @@ count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
    foreach_in_list(glsl_to_tgsi_instruction, inst, &v->instructions) {
       if (is_tex_instruction(inst->op)) {
          for (int i = 0; i < inst->sampler_array_size; i++) {
-            v->samplers_used |= 1 << (inst->sampler.index + i);
+            unsigned idx = inst->sampler.index + i;
+            v->samplers_used |= 1 << idx;
+
+            debug_assert(idx < (int)ARRAY_SIZE(v->sampler_types));
+            v->sampler_types[idx] = inst->tex_type;
+            v->sampler_targets[idx] =
+               st_translate_texture_target(inst->tex_target, inst->tex_shadow);
 
             if (inst->tex_shadow) {
                prog->ShadowSamplers |= 1 << (inst->sampler.index + i);
@@ -3734,6 +3838,7 @@ glsl_to_tgsi_visitor::copy_propagate(void)
             inst->src[r].index2D = first->src[0].index2D;
             inst->src[r].has_index2 = first->src[0].has_index2;
             inst->src[r].double_reg2 = first->src[0].double_reg2;
+            inst->src[r].array_id = first->src[0].array_id;
 
             int swizzle = 0;
             for (int i = 0; i < 4; i++) {
@@ -4177,7 +4282,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
    coord = st_src_reg(PROGRAM_INPUT, VARYING_SLOT_TEX0, glsl_type::vec2_type);
    src0 = v->get_temp(glsl_type::vec4_type);
    dst0 = st_dst_reg(src0);
-   inst = v->emit(NULL, TGSI_OPCODE_TEX, dst0, coord);
+   inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, dst0, coord);
    inst->sampler_array_size = 1;
    inst->tex_target = TEXTURE_2D_INDEX;
 
@@ -4201,7 +4306,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
       /* MAD colorTemp, colorTemp, scale, bias; */
       scale = st_src_reg(PROGRAM_STATE_VAR, scale_p, GLSL_TYPE_FLOAT);
       bias = st_src_reg(PROGRAM_STATE_VAR, bias_p, GLSL_TYPE_FLOAT);
-      inst = v->emit(NULL, TGSI_OPCODE_MAD, dst0, src0, scale, bias);
+      inst = v->emit_asm(NULL, TGSI_OPCODE_MAD, dst0, src0, scale, bias);
    }
 
    if (pixel_maps) {
@@ -4209,6 +4314,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
       st_dst_reg temp_dst = st_dst_reg(temp);
 
       assert(st->pixel_xfer.pixelmap_texture);
+      (void) st;
 
       /* With a little effort, we can do four pixel map look-ups with
        * two TEX instructions:
@@ -4216,7 +4322,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
 
       /* TEX temp.rg, colorTemp.rgba, texture[1], 2D; */
       temp_dst.writemask = WRITEMASK_XY; /* write R,G */
-      inst = v->emit(NULL, TGSI_OPCODE_TEX, temp_dst, src0);
+      inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, temp_dst, src0);
       inst->sampler.index = 1;
       inst->sampler_array_size = 1;
       inst->tex_target = TEXTURE_2D_INDEX;
@@ -4224,7 +4330,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
       /* TEX temp.ba, colorTemp.baba, texture[1], 2D; */
       src0.swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W);
       temp_dst.writemask = WRITEMASK_ZW; /* write B,A */
-      inst = v->emit(NULL, TGSI_OPCODE_TEX, temp_dst, src0);
+      inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, temp_dst, src0);
       inst->sampler.index = 1;
       inst->sampler_array_size = 1;
       inst->tex_target = TEXTURE_2D_INDEX;
@@ -4233,7 +4339,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
       v->samplers_used |= (1 << 1);
 
       /* MOV colorTemp, temp; */
-      inst = v->emit(NULL, TGSI_OPCODE_MOV, dst0, temp);
+      inst = v->emit_asm(NULL, TGSI_OPCODE_MOV, dst0, temp);
    }
 
    /* Now copy the instructions from the original glsl_to_tgsi_visitor into the
@@ -4256,7 +4362,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
             prog->InputsRead |= BITFIELD64_BIT(src_regs[i].index);
       }
 
-      newinst = v->emit(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2]);
+      newinst = v->emit_asm(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2]);
       newinst->tex_target = inst->tex_target;
       newinst->sampler_array_size = inst->sampler_array_size;
    }
@@ -4306,7 +4412,7 @@ get_bitmap_visitor(struct st_fragment_program *fp,
    coord = st_src_reg(PROGRAM_INPUT, VARYING_SLOT_TEX0, glsl_type::vec2_type);
    src0 = v->get_temp(glsl_type::vec4_type);
    dst0 = st_dst_reg(src0);
-   inst = v->emit(NULL, TGSI_OPCODE_TEX, dst0, coord);
+   inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, dst0, coord);
    inst->sampler.index = samplerIndex;
    inst->sampler_array_size = 1;
    inst->tex_target = TEXTURE_2D_INDEX;
@@ -4319,7 +4425,7 @@ get_bitmap_visitor(struct st_fragment_program *fp,
    src0.negate = NEGATE_XYZW;
    if (st->bitmap.tex_format == PIPE_FORMAT_L8_UNORM)
       src0.swizzle = SWIZZLE_XXXX;
-   inst = v->emit(NULL, TGSI_OPCODE_KILL_IF, undef_dst, src0);
+   inst = v->emit_asm(NULL, TGSI_OPCODE_KILL_IF, undef_dst, src0);
 
    /* Now copy the instructions from the original glsl_to_tgsi_visitor into the
     * new visitor. */
@@ -4336,7 +4442,7 @@ get_bitmap_visitor(struct st_fragment_program *fp,
             prog->InputsRead |= BITFIELD64_BIT(src_regs[i].index);
       }
 
-      newinst = v->emit(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2]);
+      newinst = v->emit_asm(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2]);
       newinst->tex_target = inst->tex_target;
       newinst->sampler_array_size = inst->sampler_array_size;
    }
@@ -4362,7 +4468,8 @@ struct st_translate {
    unsigned temps_size;
    struct ureg_dst *temps;
 
-   struct ureg_dst arrays[MAX_ARRAYS];
+   struct ureg_dst *arrays;
+   unsigned num_temp_arrays;
    struct ureg_src *constants;
    int num_constants;
    struct ureg_src *immediates;
@@ -4373,7 +4480,9 @@ struct st_translate {
    struct ureg_src samplers[PIPE_MAX_SAMPLERS];
    struct ureg_src systemValues[SYSTEM_VALUE_MAX];
    struct tgsi_texture_offset tex_offsets[MAX_GLSL_TEXTURE_OFFSET];
-   unsigned array_sizes[MAX_ARRAYS];
+   unsigned *array_sizes;
+   struct array_decl *input_arrays;
+   struct array_decl *output_arrays;
 
    const GLuint *inputMapping;
    const GLuint *outputMapping;
@@ -4497,9 +4606,8 @@ emit_immediate(struct st_translate *t,
  * Map a glsl_to_tgsi dst register to a TGSI ureg_dst register.
  */
 static struct ureg_dst
-dst_register(struct st_translate *t,
-             gl_register_file file,
-             GLuint index)
+dst_register(struct st_translate *t, gl_register_file file, unsigned index,
+             unsigned array_id)
 {
    unsigned array;
 
@@ -4530,7 +4638,7 @@ dst_register(struct st_translate *t,
    case PROGRAM_ARRAY:
       array = index >> 16;
 
-      assert(array < ARRAY_SIZE(t->arrays));
+      assert(array < t->num_temp_arrays);
 
       if (ureg_dst_is_undef(t->arrays[array]))
          t->arrays[array] = ureg_DECL_array_temporary(
@@ -4540,16 +4648,25 @@ dst_register(struct st_translate *t,
                                    (int)(index & 0xFFFF) - 0x8000);
 
    case PROGRAM_OUTPUT:
-      if (t->procType == TGSI_PROCESSOR_VERTEX)
-         assert(index < VARYING_SLOT_MAX);
-      else if (t->procType == TGSI_PROCESSOR_FRAGMENT)
-         assert(index < FRAG_RESULT_MAX);
-      else
-         assert(index < VARYING_SLOT_MAX);
+      if (!array_id) {
+         if (t->procType == TGSI_PROCESSOR_FRAGMENT)
+            assert(index < FRAG_RESULT_MAX);
+         else
+            assert(index < VARYING_SLOT_MAX);
 
-      assert(t->outputMapping[index] < ARRAY_SIZE(t->outputs));
+         assert(t->outputMapping[index] < ARRAY_SIZE(t->outputs));
+         assert(t->outputs[t->outputMapping[index]].File != TGSI_FILE_NULL);
+         return t->outputs[t->outputMapping[index]];
+      }
+      else {
+         struct array_decl *decl = &t->output_arrays[array_id-1];
+         unsigned mesa_index = decl->mesa_index;
+         int slot = t->outputMapping[mesa_index];
 
-      return t->outputs[t->outputMapping[index]];
+         assert(slot != -1 && t->outputs[slot].File == TGSI_FILE_OUTPUT);
+         assert(t->outputs[slot].ArrayID == array_id);
+         return ureg_dst_array_offset(t->outputs[slot], index - mesa_index);
+      }
 
    case PROGRAM_ADDRESS:
       return t->address[index];
@@ -4575,7 +4692,8 @@ src_register(struct st_translate *t, const st_src_reg *reg)
 
    case PROGRAM_TEMPORARY:
    case PROGRAM_ARRAY:
-      return ureg_src(dst_register(t, reg->file, reg->index));
+   case PROGRAM_OUTPUT:
+      return ureg_src(dst_register(t, reg->file, reg->index, reg->array_id));
 
    case PROGRAM_UNIFORM:
       assert(reg->index >= 0);
@@ -4598,12 +4716,20 @@ src_register(struct st_translate *t, const st_src_reg *reg)
        * map back to the original index and add the offset after
        * mapping. */
       index -= double_reg2;
-      assert(t->inputMapping[index] < ARRAY_SIZE(t->inputs));
-      return t->inputs[t->inputMapping[index] + double_reg2];
+      if (!reg->array_id) {
+         assert(t->inputMapping[index] < ARRAY_SIZE(t->inputs));
+         assert(t->inputs[t->inputMapping[index]].File != TGSI_FILE_NULL);
+         return t->inputs[t->inputMapping[index]];
+      }
+      else {
+         struct array_decl *decl = &t->input_arrays[reg->array_id-1];
+         unsigned mesa_index = decl->mesa_index;
+         int slot = t->inputMapping[mesa_index];
 
-   case PROGRAM_OUTPUT:
-      assert(t->outputMapping[reg->index] < ARRAY_SIZE(t->outputs));
-      return ureg_src(t->outputs[t->outputMapping[reg->index]]); /* not needed? */
+         assert(slot != -1 && t->inputs[slot].File == TGSI_FILE_INPUT);
+         assert(t->inputs[slot].ArrayID == reg->array_id);
+         return ureg_src_array_offset(t->inputs[slot], index - mesa_index);
+      }
 
    case PROGRAM_ADDRESS:
       return ureg_src(t->address[reg->index]);
@@ -4626,9 +4752,8 @@ translate_dst(struct st_translate *t,
               const st_dst_reg *dst_reg,
               bool saturate, bool clamp_color)
 {
-   struct ureg_dst dst = dst_register(t,
-                                      dst_reg->file,
-                                      dst_reg->index);
+   struct ureg_dst dst = dst_register(t, dst_reg->file, dst_reg->index,
+                                      dst_reg->array_id);
 
    if (dst.File == TGSI_FILE_NULL)
       return dst;
@@ -4738,7 +4863,7 @@ translate_tex_offset(struct st_translate *t,
       array = in_offset->index >> 16;
 
       assert(array >= 0);
-      assert(array < (int) ARRAY_SIZE(t->arrays));
+      assert(array < (int)t->num_temp_arrays);
 
       dst = t->arrays[array];
       offset.File = dst.File;
@@ -5060,6 +5185,25 @@ emit_edgeflags(struct st_translate *t)
    ureg_MOV(ureg, edge_dst, edge_src);
 }
 
+static bool
+find_array(unsigned attr, struct array_decl *arrays, unsigned count,
+           unsigned *array_id, unsigned *array_size)
+{
+   unsigned i;
+
+   for (i = 0; i < count; i++) {
+      struct array_decl *decl = &arrays[i];
+
+      if (attr == decl->mesa_index) {
+         *array_id = decl->array_id;
+         *array_size = decl->array_size;
+         assert(*array_size);
+         return true;
+      }
+   }
+   return false;
+}
+
 /**
  * Translate intermediate IR (glsl_to_tgsi_instruction) to TGSI format.
  * \param program  the program to translate
@@ -5089,12 +5233,14 @@ st_translate_program(
    const struct gl_program *proginfo,
    GLuint numInputs,
    const GLuint inputMapping[],
+   const GLuint inputSlotToAttr[],
    const ubyte inputSemanticName[],
    const ubyte inputSemanticIndex[],
    const GLuint interpMode[],
    const GLuint interpLocation[],
    GLuint numOutputs,
    const GLuint outputMapping[],
+   const GLuint outputSlotToAttr[],
    const ubyte outputSemanticName[],
    const ubyte outputSemanticIndex[],
    boolean passthrough_edgeflags,
@@ -5132,25 +5278,101 @@ st_translate_program(
       goto out;
    }
 
-   memset(t, 0, sizeof *t);
-
    t->procType = procType;
    t->inputMapping = inputMapping;
    t->outputMapping = outputMapping;
    t->ureg = ureg;
+   t->num_temp_arrays = program->next_array;
+   if (t->num_temp_arrays)
+      t->arrays = (struct ureg_dst*)
+                  calloc(1, sizeof(t->arrays[0]) * t->num_temp_arrays);
 
    /*
     * Declare input attributes.
     */
-   if (procType == TGSI_PROCESSOR_FRAGMENT) {
+   switch (procType) {
+   case TGSI_PROCESSOR_FRAGMENT:
       for (i = 0; i < numInputs; i++) {
-         t->inputs[i] = ureg_DECL_fs_input_cyl_centroid(ureg,
-                                                        inputSemanticName[i],
-                                                        inputSemanticIndex[i],
-                                                        interpMode[i], 0,
-                                                        interpLocation[i]);
+         unsigned array_id = 0;
+         unsigned array_size;
+
+         if (find_array(inputSlotToAttr[i], program->input_arrays,
+                        program->num_input_arrays, &array_id, &array_size)) {
+            /* We've found an array. Declare it so. */
+            t->inputs[i] = ureg_DECL_fs_input_cyl_centroid(ureg,
+                              inputSemanticName[i], inputSemanticIndex[i],
+                              interpMode[i], 0, interpLocation[i],
+                              array_id, array_size);
+            i += array_size - 1;
+         }
+         else {
+            t->inputs[i] = ureg_DECL_fs_input_cyl_centroid(ureg,
+                              inputSemanticName[i], inputSemanticIndex[i],
+                              interpMode[i], 0, interpLocation[i], 0, 1);
+         }
+      }
+      break;
+   case TGSI_PROCESSOR_GEOMETRY:
+      for (i = 0; i < numInputs; i++) {
+         unsigned array_id = 0;
+         unsigned array_size;
+
+         if (find_array(inputSlotToAttr[i], program->input_arrays,
+                        program->num_input_arrays, &array_id, &array_size)) {
+            /* We've found an array. Declare it so. */
+            t->inputs[i] = ureg_DECL_input(ureg, inputSemanticName[i],
+                                           inputSemanticIndex[i],
+                                           array_id, array_size);
+            i += array_size - 1;
+         }
+         else {
+            t->inputs[i] = ureg_DECL_input(ureg, inputSemanticName[i],
+                                           inputSemanticIndex[i], 0, 1);
+         }
       }
+      break;
+   case TGSI_PROCESSOR_VERTEX:
+      for (i = 0; i < numInputs; i++) {
+         t->inputs[i] = ureg_DECL_vs_input(ureg, i);
+      }
+      break;
+   default:
+      assert(0);
+   }
 
+   /*
+    * Declare output attributes.
+    */
+   switch (procType) {
+   case TGSI_PROCESSOR_FRAGMENT:
+      break;
+   case TGSI_PROCESSOR_GEOMETRY:
+   case TGSI_PROCESSOR_VERTEX:
+      for (i = 0; i < numOutputs; i++) {
+         unsigned array_id = 0;
+         unsigned array_size;
+
+         if (find_array(outputSlotToAttr[i], program->output_arrays,
+                        program->num_output_arrays, &array_id, &array_size)) {
+            /* We've found an array. Declare it so. */
+            t->outputs[i] = ureg_DECL_output_array(ureg,
+                                                   outputSemanticName[i],
+                                                   outputSemanticIndex[i],
+                                                   array_id, array_size);
+            i += array_size - 1;
+         }
+         else {
+            t->outputs[i] = ureg_DECL_output(ureg,
+                                             outputSemanticName[i],
+                                             outputSemanticIndex[i]);
+         }
+      }
+      break;
+   default:
+      assert(0);
+   }
+
+   if (procType == TGSI_PROCESSOR_FRAGMENT) {
       if (proginfo->InputsRead & VARYING_BIT_POS) {
           /* Must do this after setting up t->inputs. */
           emit_wpos(st_context(ctx), t, proginfo, ureg,
@@ -5160,9 +5382,6 @@ st_translate_program(
       if (proginfo->InputsRead & VARYING_BIT_FACE)
          emit_face_var(ctx, t);
 
-      /*
-       * Declare output attributes.
-       */
       for (i = 0; i < numOutputs; i++) {
          switch (outputSemanticName[i]) {
          case TGSI_SEMANTIC_POSITION:
@@ -5198,31 +5417,8 @@ st_translate_program(
          }
       }
    }
-   else if (procType == TGSI_PROCESSOR_GEOMETRY) {
-      for (i = 0; i < numInputs; i++) {
-         t->inputs[i] = ureg_DECL_gs_input(ureg,
-                                           i,
-                                           inputSemanticName[i],
-                                           inputSemanticIndex[i]);
-      }
-
+   else if (procType == TGSI_PROCESSOR_VERTEX) {
       for (i = 0; i < numOutputs; i++) {
-         t->outputs[i] = ureg_DECL_output(ureg,
-                                          outputSemanticName[i],
-                                          outputSemanticIndex[i]);
-      }
-   }
-   else {
-      assert(procType == TGSI_PROCESSOR_VERTEX);
-
-      for (i = 0; i < numInputs; i++) {
-         t->inputs[i] = ureg_DECL_vs_input(ureg, i);
-      }
-
-      for (i = 0; i < numOutputs; i++) {
-         t->outputs[i] = ureg_DECL_output(ureg,
-                                          outputSemanticName[i],
-                                          outputSemanticIndex[i]);
          if (outputSemanticName[i] == TGSI_SEMANTIC_FOG) {
             /* force register to contain a fog coordinate in the form (F, 0, 0, 1). */
             ureg_MOV(ureg,
@@ -5277,9 +5473,9 @@ st_translate_program(
       }
    }
 
-   /* Copy over array sizes
-    */
-   memcpy(t->array_sizes, program->array_sizes, sizeof(unsigned) * program->next_array);
+   t->array_sizes = program->array_sizes;
+   t->input_arrays = program->input_arrays;
+   t->output_arrays = program->output_arrays;
 
    /* Emit constants and uniforms.  TGSI uses a single index space for these,
     * so we put all the translated regs in t->constants.
@@ -5355,7 +5551,26 @@ st_translate_program(
    /* texture samplers */
    for (i = 0; i < ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits; i++) {
       if (program->samplers_used & (1 << i)) {
+         unsigned type;
+
          t->samplers[i] = ureg_DECL_sampler(ureg, i);
+
+         switch (program->sampler_types[i]) {
+         case GLSL_TYPE_INT:
+            type = TGSI_RETURN_TYPE_SINT;
+            break;
+         case GLSL_TYPE_UINT:
+            type = TGSI_RETURN_TYPE_UINT;
+            break;
+         case GLSL_TYPE_FLOAT:
+            type = TGSI_RETURN_TYPE_FLOAT;
+            break;
+         default:
+            unreachable("not reached");
+         }
+
+         ureg_DECL_sampler_view( ureg, i, program->sampler_targets[i],
+                                 type, type, type, type );
       }
    }
 
@@ -5375,6 +5590,7 @@ st_translate_program(
 
 out:
    if (t) {
+      free(t->arrays);
       free(t->temps);
       free(t->insn);
       free(t->labels);
@@ -5470,7 +5686,7 @@ get_mesa_program(struct gl_context *ctx,
          if (!entry->bgn_inst) {
             v->current_function = entry;
 
-            entry->bgn_inst = v->emit(NULL, TGSI_OPCODE_BGNSUB);
+            entry->bgn_inst = v->emit_asm(NULL, TGSI_OPCODE_BGNSUB);
             entry->bgn_inst->function = entry;
 
             visit_exec_list(&entry->sig->body, v);
@@ -5478,10 +5694,10 @@ get_mesa_program(struct gl_context *ctx,
             glsl_to_tgsi_instruction *last;
             last = (glsl_to_tgsi_instruction *)v->instructions.get_tail();
             if (last->op != TGSI_OPCODE_RET)
-               v->emit(NULL, TGSI_OPCODE_RET);
+               v->emit_asm(NULL, TGSI_OPCODE_RET);
 
             glsl_to_tgsi_instruction *end;
-            end = v->emit(NULL, TGSI_OPCODE_ENDSUB);
+            end = v->emit_asm(NULL, TGSI_OPCODE_ENDSUB);
             end->function = entry;
 
             progress = GL_TRUE;
@@ -5513,7 +5729,7 @@ get_mesa_program(struct gl_context *ctx,
    v->renumber_registers();
 
    /* Write the END instruction. */
-   v->emit(NULL, TGSI_OPCODE_END);
+   v->emit_asm(NULL, TGSI_OPCODE_END);
 
    if (ctx->_Shader->Flags & GLSL_DUMP) {
       _mesa_log("\n");
@@ -5528,6 +5744,10 @@ get_mesa_program(struct gl_context *ctx,
    prog->NumInstructions = 0;
 
    do_set_program_inouts(shader->ir, prog, shader->Stage);
+   shrink_array_declarations(v->input_arrays, v->num_input_arrays,
+                             prog->InputsRead);
+   shrink_array_declarations(v->output_arrays, v->num_output_arrays,
+                             prog->OutputsWritten);
    count_resources(v, prog);
 
    /* This must be done before the uniform storage is associated. */
@@ -5549,6 +5769,7 @@ get_mesa_program(struct gl_context *ctx,
     */
    _mesa_associate_uniform_storage(ctx, shader_program, prog->Parameters);
    if (!shader_program->LinkStatus) {
+      free_glsl_to_tgsi_visitor(v);
       return NULL;
    }
 
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.h b/src/mesa/state_tracker/st_glsl_to_tgsi.h
index 2cb80bcf961..4af747fa9de 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.h
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.h
@@ -43,12 +43,14 @@ enum pipe_error st_translate_program(
    const struct gl_program *proginfo,
    GLuint numInputs,
    const GLuint inputMapping[],
+   const GLuint inputSlotToAttr[],
    const ubyte inputSemanticName[],
    const ubyte inputSemanticIndex[],
    const GLuint interpMode[],
    const GLuint interpLocation[],
    GLuint numOutputs,
    const GLuint outputMapping[],
+   const GLuint outputSlotToAttr[],
    const ubyte outputSemanticName[],
    const ubyte outputSemanticIndex[],
    boolean passthrough_edgeflags,
diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c
index 840f76a1307..a2dee6298fa 100644
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -680,6 +680,10 @@ st_api_create_context(struct st_api *stapi, struct st_manager *smapi,
 
    if (attribs->flags & ST_CONTEXT_FLAG_FORWARD_COMPATIBLE)
       st->ctx->Const.ContextFlags |= GL_CONTEXT_FLAG_FORWARD_COMPATIBLE_BIT;
+   if (attribs->flags & ST_CONTEXT_FLAG_ROBUST_ACCESS)
+      st->ctx->Const.ContextFlags |= GL_CONTEXT_FLAG_ROBUST_ACCESS_BIT_ARB;
+   if (attribs->flags & ST_CONTEXT_FLAG_RESET_NOTIFICATION_ENABLED)
+      st->ctx->Const.ResetStrategy = GL_LOSE_CONTEXT_ON_RESET_ARB;
 
    /* need to perform version check */
    if (attribs->major > 1 || attribs->minor > 0) {
@@ -920,8 +924,7 @@ static unsigned get_version(struct pipe_screen *screen,
    struct gl_extensions extensions = {0};
    GLuint version;
 
-   if ((api == API_OPENGL_COMPAT || api == API_OPENGL_CORE) &&
-       _mesa_override_gl_version_contextless(&consts, &api, &version)) {
+   if (_mesa_override_gl_version_contextless(&consts, &api, &version)) {
       return version;
    }
 
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index 98d525c86c2..896e239ee68 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -665,7 +665,7 @@ compile_instruction(
    if (num_dst) 
       dst[0] = translate_dst( t, 
                               &inst->DstReg,
-                              inst->SaturateMode,
+                              inst->Saturate,
                               clamp_dst_color_output);
 
    for (i = 0; i < num_src; i++) 
@@ -1095,10 +1095,9 @@ st_translate_mesa_program(
    }
    else if (procType == TGSI_PROCESSOR_GEOMETRY) {
       for (i = 0; i < numInputs; i++) {
-         t->inputs[i] = ureg_DECL_gs_input(ureg,
-                                           i,
-                                           inputSemanticName[i],
-                                           inputSemanticIndex[i]);
+         t->inputs[i] = ureg_DECL_input(ureg,
+                                        inputSemanticName[i],
+                                        inputSemanticIndex[i], 0, 1);
       }
 
       for (i = 0; i < numOutputs; i++) {
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index a9110d3c674..fa792bc349b 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -215,6 +215,7 @@ st_prepare_vertex_program(struct gl_context *ctx,
          unsigned slot = stvp->num_outputs++;
 
          stvp->result_to_output[attr] = slot;
+         stvp->output_slot_to_attr[slot] = attr;
 
          switch (attr) {
          case VARYING_SLOT_POS:
@@ -285,7 +286,8 @@ st_prepare_vertex_program(struct gl_context *ctx,
             /* fall through */
          case VARYING_SLOT_VAR0:
          default:
-            assert(attr < VARYING_SLOT_MAX);
+            assert(attr >= VARYING_SLOT_VAR0 ||
+                   (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7));
             stvp->output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
             stvp->output_semantic_index[slot] =
                st_get_generic_varying_index(st, attr);
@@ -321,7 +323,7 @@ st_translate_vertex_program(struct st_context *st,
       _mesa_remove_output_reads(&stvp->Base.Base, PROGRAM_OUTPUT);
    }
 
-   ureg = ureg_create( TGSI_PROCESSOR_VERTEX );
+   ureg = ureg_create_with_screen(TGSI_PROCESSOR_VERTEX, st->pipe->screen);
    if (ureg == NULL) {
       free(vpv);
       return NULL;
@@ -351,6 +353,7 @@ st_translate_vertex_program(struct st_context *st,
                                    /* inputs */
                                    vpv->num_inputs,
                                    stvp->input_to_index,
+                                   NULL, /* inputSlotToAttr */
                                    NULL, /* input semantic name */
                                    NULL, /* input semantic index */
                                    NULL, /* interp mode */
@@ -358,6 +361,7 @@ st_translate_vertex_program(struct st_context *st,
                                    /* outputs */
                                    num_outputs,
                                    stvp->result_to_output,
+                                   stvp->output_slot_to_attr,
                                    stvp->output_semantic_name,
                                    stvp->output_semantic_index,
                                    key->passthrough_edgeflags,
@@ -482,6 +486,7 @@ st_translate_fragment_program(struct st_context *st,
 
    GLuint outputMapping[FRAG_RESULT_MAX];
    GLuint inputMapping[VARYING_SLOT_MAX];
+   GLuint inputSlotToAttr[VARYING_SLOT_MAX];
    GLuint interpMode[PIPE_MAX_SHADER_INPUTS];  /* XXX size? */
    GLuint interpLocation[PIPE_MAX_SHADER_INPUTS];
    GLuint attr;
@@ -502,6 +507,7 @@ st_translate_fragment_program(struct st_context *st,
       return NULL;
 
    assert(!(key->bitmap && key->drawpixels));
+   memset(inputSlotToAttr, ~0, sizeof(inputSlotToAttr));
 
    if (key->bitmap) {
       /* glBitmap drawing */
@@ -543,6 +549,7 @@ st_translate_fragment_program(struct st_context *st,
          const GLuint slot = fs_num_inputs++;
 
          inputMapping[attr] = slot;
+         inputSlotToAttr[slot] = attr;
          if (stfp->Base.IsCentroid & BITFIELD64_BIT(attr))
             interpLocation[slot] = TGSI_INTERPOLATE_LOC_CENTROID;
          else if (stfp->Base.IsSample & BITFIELD64_BIT(attr))
@@ -657,7 +664,8 @@ st_translate_fragment_program(struct st_context *st,
              * consumed for the TEXi varyings, and we can base the locations of
              * the user varyings on VAR0.  Otherwise, we use TEX0 as base index.
              */
-            assert(attr >= VARYING_SLOT_TEX0);
+            assert(attr >= VARYING_SLOT_VAR0 || attr == VARYING_SLOT_PNTC ||
+                   (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7));
             input_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
             input_semantic_index[slot] = st_get_generic_varying_index(st, attr);
             if (attr == VARYING_SLOT_PNTC)
@@ -732,7 +740,7 @@ st_translate_fragment_program(struct st_context *st,
       }
    }
 
-   ureg = ureg_create( TGSI_PROCESSOR_FRAGMENT );
+   ureg = ureg_create_with_screen(TGSI_PROCESSOR_FRAGMENT, st->pipe->screen);
    if (ureg == NULL) {
       free(variant);
       return NULL;
@@ -778,6 +786,7 @@ st_translate_fragment_program(struct st_context *st,
                            /* inputs */
                            fs_num_inputs,
                            inputMapping,
+                           inputSlotToAttr,
                            input_semantic_name,
                            input_semantic_index,
                            interpMode,
@@ -785,6 +794,7 @@ st_translate_fragment_program(struct st_context *st,
                            /* outputs */
                            fs_num_outputs,
                            outputMapping,
+                           NULL,
                            fs_output_semantic_name,
                            fs_output_semantic_index, FALSE,
                            key->clamp_color );
@@ -867,7 +877,9 @@ st_translate_geometry_program(struct st_context *st,
                               struct st_geometry_program *stgp,
                               const struct st_gp_variant_key *key)
 {
+   GLuint inputSlotToAttr[VARYING_SLOT_MAX];
    GLuint inputMapping[VARYING_SLOT_MAX];
+   GLuint outputSlotToAttr[VARYING_SLOT_MAX];
    GLuint outputMapping[VARYING_SLOT_MAX];
    struct pipe_context *pipe = st->pipe;
    GLuint attr;
@@ -890,13 +902,15 @@ st_translate_geometry_program(struct st_context *st,
    if (!gpv)
       return NULL;
 
-   ureg = ureg_create(TGSI_PROCESSOR_GEOMETRY);
+   ureg = ureg_create_with_screen(TGSI_PROCESSOR_GEOMETRY, st->pipe->screen);
    if (ureg == NULL) {
       free(gpv);
       return NULL;
    }
 
+   memset(inputSlotToAttr, 0, sizeof(inputSlotToAttr));
    memset(inputMapping, 0, sizeof(inputMapping));
+   memset(outputSlotToAttr, 0, sizeof(outputSlotToAttr));
    memset(outputMapping, 0, sizeof(outputMapping));
 
    /*
@@ -907,6 +921,7 @@ st_translate_geometry_program(struct st_context *st,
          const GLuint slot = gs_num_inputs++;
 
          inputMapping[attr] = slot;
+         inputSlotToAttr[slot] = attr;
 
          switch (attr) {
          case VARYING_SLOT_PRIMITIVE_ID:
@@ -985,6 +1000,7 @@ st_translate_geometry_program(struct st_context *st,
          GLuint slot = gs_num_outputs++;
 
          outputMapping[attr] = slot;
+         outputSlotToAttr[slot] = attr;
 
          switch (attr) {
          case VARYING_SLOT_POS:
@@ -1080,6 +1096,7 @@ st_translate_geometry_program(struct st_context *st,
                         /* inputs */
                         gs_num_inputs,
                         inputMapping,
+                        inputSlotToAttr,
                         input_semantic_name,
                         input_semantic_index,
                         NULL,
@@ -1087,6 +1104,7 @@ st_translate_geometry_program(struct st_context *st,
                         /* outputs */
                         gs_num_outputs,
                         outputMapping,
+                        outputSlotToAttr,
                         gs_output_semantic_name,
                         gs_output_semantic_index,
                         FALSE,
@@ -1201,7 +1219,7 @@ destroy_program_variants(struct st_context *st, struct gl_program *program)
          }
       }
       break;
-   case MESA_GEOMETRY_PROGRAM:
+   case GL_GEOMETRY_PROGRAM_NV:
       {
          struct st_geometry_program *stgp =
             (struct st_geometry_program *) program;
diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h
index a2c56062d6e..bb77eb6ed65 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -163,6 +163,7 @@ struct st_vertex_program
 
    /** Maps VARYING_SLOT_x to slot */
    GLuint result_to_output[VARYING_SLOT_MAX];
+   GLuint output_slot_to_attr[VARYING_SLOT_MAX];
    ubyte output_semantic_name[VARYING_SLOT_MAX];
    ubyte output_semantic_index[VARYING_SLOT_MAX];
    GLuint num_outputs;
diff --git a/src/mesa/swrast/s_texrender.c b/src/mesa/swrast/s_texrender.c
index fa853c9197f..4e41b3b72a8 100644
--- a/src/mesa/swrast/s_texrender.c
+++ b/src/mesa/swrast/s_texrender.c
@@ -72,7 +72,7 @@ update_wrapper(struct gl_context *ctx, struct gl_renderbuffer_attachment *att)
  * \param fb  the framebuffer object the texture is being bound to
  * \param att  the fb attachment point of the texture
  *
- * \sa _mesa_framebuffer_renderbuffer
+ * \sa _mesa_FramebufferRenderbuffer_sw
  */
 void
 _swrast_render_texture(struct gl_context *ctx,
diff --git a/src/mesa/tnl/t_context.c b/src/mesa/tnl/t_context.c
index 5b9dd54d75a..bc77ba8bf95 100644
--- a/src/mesa/tnl/t_context.c
+++ b/src/mesa/tnl/t_context.c
@@ -36,6 +36,7 @@
 #include "math/m_xform.h"
 #include "main/state.h"
 #include "main/viewport.h"
+#include "util/simple_list.h"
 
 #include "tnl.h"
 #include "t_context.h"
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index 3ea775c0e4a..72b8206ec23 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -1817,9 +1817,12 @@ vbo_initialize_exec_dispatch(const struct gl_context *ctx,
       SET_DrawElementsInstancedBaseVertexBaseInstance(exec, vbo_exec_DrawElementsInstancedBaseVertexBaseInstance);
    }
 
-   if (ctx->API == API_OPENGL_CORE) {
+   if (ctx->API == API_OPENGL_CORE || _mesa_is_gles31(ctx)) {
       SET_DrawArraysIndirect(exec, vbo_exec_DrawArraysIndirect);
       SET_DrawElementsIndirect(exec, vbo_exec_DrawElementsIndirect);
+   }
+
+   if (ctx->API == API_OPENGL_CORE) {
       SET_MultiDrawArraysIndirect(exec, vbo_exec_MultiDrawArraysIndirect);
       SET_MultiDrawElementsIndirect(exec, vbo_exec_MultiDrawElementsIndirect);
    }